def get_feature_vals_by_cand_split(pickled_obj, fk_ltable_idx, fk_rtable_idx,
                                   l_df, r_df, candsplit, show_progress):
    feature_table = cloudpickle.loads(pickled_obj)
    if show_progress:
        prog_bar = pyprind.ProgBar(len(candsplit))

    # cache looked-up left/right tuples so repeated foreign keys hit the dict
    l_dict = {}
    r_dict = {}
    feat_vals = []
    for row in candsplit.itertuples(index=False):
        if show_progress:
            prog_bar.update()

        fk_ltable_val = row[fk_ltable_idx]
        fk_rtable_val = row[fk_rtable_idx]

        if fk_ltable_val not in l_dict:
            # DataFrame.ix is deprecated/removed; use label-based .loc instead
            l_dict[fk_ltable_val] = l_df.loc[fk_ltable_val]
        l_tuple = l_dict[fk_ltable_val]

        if fk_rtable_val not in r_dict:
            r_dict[fk_rtable_val] = r_df.loc[fk_rtable_val]
        r_tuple = r_dict[fk_rtable_val]

        f = apply_feat_fns(l_tuple, r_tuple, feature_table)
        feat_vals.append(f)

    return feat_vals
def _local_execute_func(exec_func, write_func, pickle_func, python_path):
    table_env = BatchTableEnvironment.create(
        environment_settings=EnvironmentSettings.new_instance()
        .use_blink_planner().in_batch_mode().build())
    table_env.get_config().get_configuration().set_string(
        'parallelism.default', '1')
    table_env.get_config().set_python_executable(python_path)
    # register a Python UDF that returns the pickled execution result string
    table_env.register_function(
        exec_func,
        udf(lambda _: pickle_func, DataTypes.BIGINT(), DataTypes.STRING()))
    # route the UDF output into a single-column CSV file sink
    table_env.connect(FileSystem().path(write_func)) \
        .with_format(OldCsv().field('func', DataTypes.STRING())) \
        .with_schema(Schema().field('func', DataTypes.STRING())) \
        .create_temporary_table(exec_func)
    table = table_env.from_elements([(1, 'Joblib')])
    table.select('{}(_1)'.format(exec_func)).insert_into(exec_func)
    table_env.execute(exec_func)
    # decode execution result from table sink file.
    execute_result = cloudpickle.loads(
        codecs.decode(
            pd.DataFrame(pd.read_csv(write_func))[0:].columns[0].encode(),
            'base64'))
    # remove table sink file to clear ineffective files.
    os.remove(write_func)
    return execute_result
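# A minimal sketch (not part of the original module) of the round-trip that
# _local_execute_func relies on: the `pickle_func` string emitted by the UDF is
# assumed to be a cloudpickled, base64-encoded payload, which the CSV sink
# writes out and the pandas read above decodes. Helper names are illustrative.
import base64
import cloudpickle

def encode_result(value):
    # cloudpickle the value, then base64-encode it so it survives a text sink
    return base64.b64encode(cloudpickle.dumps(value)).decode()

def decode_result(encoded):
    return cloudpickle.loads(base64.b64decode(encoded.encode()))

if __name__ == '__main__':
    payload = encode_result({'status': 'ok', 'value': 42})
    assert decode_result(payload) == {'status': 'ok', 'value': 42}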
def _remote_execute_func(exec_func, write_func, exec_dict, jm, py):
    func_stdout = '{}/exec_{}_stdout.log'.format(
        get_file_dir(__file__), exec_func)
    func_stderr = '{}/exec_{}_stderr.log'.format(
        get_file_dir(__file__), exec_func)
    with open(func_stdout, 'a') as out, open(func_stderr, 'a') as err:
        # execute `flink run -m <remote> -py function.py` to submit batch job
        submitted_process = Popen(
            args="{}/bin/flink run -m {} -py {}/exec_function.py -pyexec {} {} {} '{}'"
            .format(_find_flink_home(), jm, get_file_dir(__file__), py,
                    exec_func, write_func, json.dumps(exec_dict)),
            shell=True,
            stdout=out,
            stderr=err)
        submitted_process.wait()
    # decode execution result from table sink file.
    execute_result = cloudpickle.loads(
        codecs.decode(
            pd.DataFrame(
                pd.read_csv(write_func))['func'].values[0].encode(),
            'base64'))
    # remove table sink file to clear ineffective files.
    os.remove(write_func)
    return execute_result
def loads_fn(fn_bytes: bytes) -> Callable:
    # memoize deserialized functions so identical payloads are unpickled once
    fn_bytes_hash = hash(fn_bytes)
    try:
        fn = _fn_load_cache[fn_bytes_hash]
    except KeyError:
        fn = cloudpickle.loads(fn_bytes)
        _fn_load_cache[fn_bytes_hash] = fn
    return fn
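# Hedged usage sketch for loads_fn. The module-level cache and typing import
# below are assumed context (they would sit above loads_fn in the real module);
# dumps_fn is an illustrative counterpart, not confirmed by the source.
from typing import Callable, Dict
import cloudpickle

_fn_load_cache: Dict[int, Callable] = {}

def dumps_fn(fn: Callable) -> bytes:
    # serialize a function (closures included) with cloudpickle
    return cloudpickle.dumps(fn)

if __name__ == '__main__':
    payload = dumps_fn(lambda x: x * 2)
    f1 = loads_fn(payload)  # unpickles and caches
    f2 = loads_fn(payload)  # same bytes -> served from _fn_load_cache
    assert f1 is f2 and f1(21) == 42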
def deserialize(code):
    data = cloudpickle.loads(base64.b64decode(code))
    return data
def deserialize(code):
    # todo: add better error handling for <h1>Internal Server Error</h1>
    data = cloudpickle.loads(base64.b64decode(code))
    return data
def deserialize(code):
    data = cloudpickle.loads(base64.b64decode(code))
    return data['thunk'], data['args'] or (), data['kwargs'] or {}
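# Sketch of the matching serialize side. The {'thunk', 'args', 'kwargs'} layout
# is taken from deserialize above; the serialize helper itself is an assumed
# counterpart, not confirmed by the source.
import base64
import cloudpickle

def serialize(thunk, args=None, kwargs=None):
    payload = {'thunk': thunk, 'args': args, 'kwargs': kwargs}
    return base64.b64encode(cloudpickle.dumps(payload)).decode()

if __name__ == '__main__':
    code = serialize(lambda a, b=0: a + b, args=(1,), kwargs={'b': 2})
    thunk, args, kwargs = deserialize(code)
    assert thunk(*args, **kwargs) == 3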