def run_builtin_aggregators(builtin_aggregates, df, ctx, spark):
    agg_cols = []
    for r in builtin_aggregates:
        aggregator = ctx.aggregators[r["aggregator"]]
        f_name = extract_spark_name(aggregator["name"])
        agg_func = getattr(F, f_name)

        # Collect input column names; a spec may use "col", "cols", or "col1"/"col2"
        col_name_list = []
        columns_dict = r["inputs"]["columns"]
        if "col" in columns_dict:
            col_name_list.append(columns_dict["col"])
        if "cols" in columns_dict:
            col_name_list += columns_dict["cols"]
        if "col1" in columns_dict and "col2" in columns_dict:
            col_name_list.append(columns_dict["col1"])
            col_name_list.append(columns_dict["col2"])

        if len(col_name_list) == 0:
            raise CortexException("input columns not found in aggregator: {}".format(r))

        args = {}
        if r["inputs"].get("args"):
            args = ctx.populate_args(r["inputs"]["args"])

        col_list = [F.col(c) for c in col_name_list]
        agg_cols.append(agg_func(*col_list, **args).alias(r["name"]))

    # Run all aggregations in a single pass over the DataFrame
    results = df.agg(*agg_cols).collect()[0].asDict()

    for r in builtin_aggregates:
        ctx.store_aggregate_result(results[r["name"]], r)

    return results
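# A minimal usage sketch for run_builtin_aggregators. The resource spec below is
# hypothetical (the exact schema and the "cortex.mean" aggregator id come from
# the Cortex context, which is not shown here); it illustrates how each entry
# names its aggregator, supplies input columns, and gets its result stored under
# the resource name:
#
#     builtin_aggregates = [
#         {
#             "name": "age_mean",
#             "aggregator": "cortex.mean",
#             "inputs": {"columns": {"col": "age"}, "args": {}},
#         }
#     ]
#     results = run_builtin_aggregators(builtin_aggregates, df, ctx, spark)
#     results["age_mean"]  # aggregated value, also persisted via ctx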
def download_file(self, key, local_path):
    try:
        util.mkdir_p(os.path.dirname(local_path))
        self.s3.download_file(self.bucket, key, local_path)
        return local_path
    except Exception as e:
        raise CortexException("bucket " + self.bucket, "key " + key) from e
def download_file_from_s3(key, local_path, bucket, client_config=None):
    # Avoid a mutable default argument; treat None as "no extra client config"
    if client_config is None:
        client_config = {}
    try:
        util.mkdir_p(os.path.dirname(local_path))
        s3 = s3_client(client_config)
        s3.download_file(bucket, key, local_path)
        return local_path
    except Exception as e:
        raise CortexException("bucket " + bucket, "key " + key) from e
def _read_bytes_from_s3(self, key, allow_missing=False):
    try:
        byte_array = self.s3.get_object(Bucket=self.bucket, Key=key)["Body"].read()
    except self.s3.exceptions.NoSuchKey as e:
        if allow_missing:
            return None
        raise CortexException("bucket " + self.bucket, "key " + key) from e
    return byte_array.strip()
def download_file(self, key, local_path):
    util.mkdir_p(os.path.dirname(local_path))
    try:
        self.s3.download_file(self.bucket, key, local_path)
        return local_path
    except Exception as e:
        raise CortexException(
            'key "{}" in bucket "{}" could not be accessed; '.format(key, self.bucket)
            + "it may not exist, or you may not have sufficient permissions"
        ) from e
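# Usage sketch for the download helpers above (the storage instance and key
# names are illustrative assumptions, not from the source):
#
#     local_file = storage.download_file("models/model.zip", "/tmp/model.zip")
#     # raises CortexException if the key is missing or permissions are insufficient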
def create_inputs_map(values_map, input_config):
    inputs = {}
    for input_name, input_config_item in input_config.items():
        if util.is_str(input_config_item) or util.is_int(input_config_item):
            inputs[input_name] = values_map[input_config_item]
        elif util.is_list(input_config_item):
            inputs[input_name] = [values_map[f] for f in input_config_item]
        else:
            raise CortexException("invalid column inputs")
    return inputs
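# Illustrative call (values here are assumptions): string/int config entries are
# looked up directly in values_map, while list entries map to a list of values:
#
#     values_map = {"age": 38, "height": 1.72, "weight": 64.0}
#     input_config = {"target": "age", "features": ["height", "weight"]}
#     create_inputs_map(values_map, input_config)
#     # -> {"target": 38, "features": [1.72, 64.0]}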
def get_training_data_parts(self, model_name, mode, part_prefix="part"):
    training_dataset = self.models[model_name]["dataset"]
    if mode == "training":
        data_key = training_dataset["train_key"]
    elif mode == "evaluation":
        data_key = training_dataset["eval_key"]
    else:
        raise CortexException(
            "unrecognized mode {}; must be one of (training, evaluation)".format(mode)
        )
    training_data_parts_prefix = os.path.join(data_key, part_prefix)
    return self.storage.search(prefix=training_data_parts_prefix)
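# Sketch (model name and call site are assumptions): list the part files of a
# model's training split, i.e. the <train_key>/part* objects in storage:
#
#     part_keys = ctx.get_training_data_parts("my_model", "training")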
def create_inputs_from_features_map(features_values_map, feature_input_config):
    inputs = {}
    for input_name, input_config_item in feature_input_config.items():
        if util.is_str(input_config_item) or util.is_int(input_config_item):
            inputs[input_name] = features_values_map[input_config_item]
        elif util.is_list(input_config_item):
            inputs[input_name] = [features_values_map[f] for f in input_config_item]
        else:
            raise CortexException("invalid feature inputs")
    return inputs
def _deserialize_raw_ctx(raw_ctx):
    if raw_ctx.get("environment") is not None:
        raw_columns = raw_ctx["raw_columns"]
        raw_ctx["raw_columns"] = util.merge_dicts_overwrite(*raw_columns.values())

        data_split = raw_ctx["environment_data"]
        if data_split["csv_data"] is not None and data_split["parquet_data"] is None:
            raw_ctx["environment"]["data"] = data_split["csv_data"]
        elif data_split["parquet_data"] is not None and data_split["csv_data"] is None:
            raw_ctx["environment"]["data"] = data_split["parquet_data"]
        else:
            raise CortexException(
                "expected csv_data or parquet_data but found: {}".format(data_split)
            )
    return raw_ctx
def _deserialize_raw_ctx(raw_ctx):
    raw_columns = raw_ctx["raw_columns"]
    raw_ctx["raw_columns"] = util.merge_dicts_overwrite(
        raw_columns["raw_int_columns"],
        raw_columns["raw_float_columns"],
        raw_columns["raw_string_columns"],
    )

    data_split = raw_ctx["environment_data"]
    if data_split["csv_data"] is not None and data_split["parquet_data"] is None:
        raw_ctx["environment"]["data"] = data_split["csv_data"]
    elif data_split["parquet_data"] is not None and data_split["csv_data"] is None:
        raw_ctx["environment"]["data"] = data_split["parquet_data"]
    else:
        raise CortexException(
            "expected csv_data or parquet_data but found: {}".format(data_split)
        )
    return raw_ctx
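# Shape sketch for raw_ctx as this function expects it (hypothetical values;
# only the keys touched above are shown). The three typed column dicts are
# merged into raw_ctx["raw_columns"], and exactly one of csv_data/parquet_data
# becomes raw_ctx["environment"]["data"]:
#
#     raw_ctx = {
#         "raw_columns": {
#             "raw_int_columns": {"age": {}},
#             "raw_float_columns": {"height": {}},
#             "raw_string_columns": {"city": {}},
#         },
#         "environment": {},
#         "environment_data": {"csv_data": {"path": "s3://..."}, "parquet_data": None},
#     }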
def _read_bytes_from_s3(self, key, allow_missing=False, ext_bucket=None):
    bucket = self.bucket
    if ext_bucket is not None:
        bucket = ext_bucket
    try:
        try:
            byte_array = self.s3.get_object(Bucket=bucket, Key=key)["Body"].read()
        except self.s3.exceptions.NoSuchKey:
            if allow_missing:
                return None
            raise
    except Exception as e:
        raise CortexException(
            'key "{}" in bucket "{}" could not be accessed; '.format(key, bucket)
            + "it may not exist, or you may not have sufficient permissions"
        ) from e
    return byte_array.strip()
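# Usage sketch (assumed storage instance): allow_missing=True turns a missing
# key into None instead of an exception, which callers can use for optional
# artifacts:
#
#     raw = storage._read_bytes_from_s3("metadata.json", allow_missing=True)
#     if raw is None:
#         pass  # fall back to defaults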
def download_file(self, key, local_path):
    try:
        Path(local_path).parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(str(self._get_path(key)), local_path)
    except Exception as e:
        raise CortexException("file not found", key) from e