class _Root(TaskNode):

    parent_tag = TaskNode.set_tag()
    output_tag = TaskNode.set_tag(UNIVERSAL_TAG, "_START", "_ROOT")

    cfg = config.setting(
        config.req("MODEL_TAG"),
        config.req("MODEL_DATE"),
        config.req("HDFS_WORKSPACE"),
        config.req("PRIMARY_KEYS"),
        config.req("FEATURE_TAG"),
        config.req("FEATURE_DATE"),
        config.opt("HDFS_FEATURE_DIR", _HDFS_FEATURE_DIR),
        config.opt("HDFS_MODEL_DIR", _HDFS_MODEL_DIR),
        config.opt("HDFS_FEMODEL_DIR", _HDFS_FEMODEL_DIR),
        config.opt("LOCAL_WORKSPACE", "./dlflow_default"),
        config.opt("LOCAL_MODEL_DIR", _LOCAL_MODEL_DIR),
        config.opt("LOCAL_FEMODEL_DIR", _LOCAL_FEMODEL_DIR),
        config.opt("LOCAL_TMP_DIR", _LOCAL_TMP_DIR),
        config.opt("DROP_COLUMNS", [])
    )

    def __init__(self):
        super(_Root, self).__init__()

    @TaskNode.timeit
    def run(self):
        def solve_local_path(raw_path):
            path = Path(raw_path).resolve()
            if not path.is_dir():
                path.mkdir(parents=True)
            return path.as_posix()

        config.LOCAL_WORKSPACE = solve_local_path(config.LOCAL_WORKSPACE)
        config.LOCAL_MODEL_DIR = solve_local_path(config.LOCAL_MODEL_DIR)
        config.LOCAL_FEMODEL_DIR = solve_local_path(config.LOCAL_FEMODEL_DIR)
        config.LOCAL_TMP_DIR = solve_local_path(config.LOCAL_TMP_DIR)

        def seq_conf_parser(v, sign=","):
            if isinstance(v, str):
                iter_v = v.split(sign)
            elif isinstance(v, list):
                iter_v = v
            else:
                iter_v = []
            return [i for i in map(lambda x: x.strip(), iter_v) if i]

        config.PRIMARY_KEYS = seq_conf_parser(config.PRIMARY_KEYS)
        config.LABELS = seq_conf_parser(config.LABELS)
        config.DROP_COLUMNS = seq_conf_parser(config.DROP_COLUMNS)

        if "SPARK" in config:
            app_name = ".".join(["DLFlow", config.uuid])
            spark_conf = config.SPARK.dense_dict if config.SPARK else {}

            spark_app = SparkBaseApp()
            spark_app.initialize_spark(app_name, spark_conf)
        else:
            hdfs = HDFS()
            hdfs.initialize_hdfs()
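# Note: the seq_conf_parser helper above normalizes PRIMARY_KEYS, LABELS and
# DROP_COLUMNS, accepting either a comma-separated string or a list. A minimal
# standalone sketch of that behavior (the helper is restated here purely for
# illustration; the column names "uid" and "item_id" are hypothetical):
def _demo_seq_conf_parser(v, sign=","):
    if isinstance(v, str):
        iter_v = v.split(sign)
    elif isinstance(v, list):
        iter_v = v
    else:
        iter_v = []
    # Strip whitespace and drop empty items.
    return [i for i in map(lambda x: x.strip(), iter_v) if i]

assert _demo_seq_conf_parser("uid, item_id, ") == ["uid", "item_id"]
assert _demo_seq_conf_parser(["item_id "]) == ["item_id"]
assert _demo_seq_conf_parser(None) == []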
class MyModel(ModelBase):

    cfg = config.setting(
        config.req("MODEL.learning_rate"),
        config.req("MODEL.classes"),
        config.req("MODEL.layers"),
        config.opt("MODEL.batch_size", 8)
    )

    def __init__(self, fmap):
        super(MyModel, self).__init__(fmap)

        self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
        self.compute_loss = tf.keras.losses.SparseCategoricalCrossentropy()

        self.mean_loss = tf.keras.metrics.Mean()
        self.acc = tf.keras.metrics.SparseCategoricalAccuracy()
        self.metrics = {"mean_loss": self.mean_loss, "acc": self.acc}

        self.msg_frac = 10

    def build(self):
        concat_list = self.get_inputs(tp="nums")

        images = tf.concat(concat_list, axis=1)
        images = tf.reshape(images, (-1, 28, 28, 1))

        output = CNN(n_class=10)(images)
        arg_max = tf.argmax(output, axis=1)

        self.set_output(output, "softmax")
        self.set_output(arg_max, "argmax")

    @tf.function
    def train(self, feature, label):
        _label = label["label"]

        with tf.GradientTape() as tape:
            output, _ = self.model(feature)
            loss = self.compute_loss(_label, output)

        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(
            zip(grads, self.model.trainable_variables))

        self.mean_loss(loss)
        self.acc(_label, output)

    @tf.function
    def evaluate(self, feature, label):
        _label = label["label"]

        output, _ = self.model(feature)
        loss = self.compute_loss(_label, output)

        self.mean_loss(loss)
        self.acc(_label, output)

    @tf.function
    def predict(self, feature):
        pred = self.model(feature)
        return pred
class DataPrepare(TaskNode):

    parent_tag = TaskNode.set_tag("_START")
    output_tag = TaskNode.set_tag("RAW_FEATURE")

    cfg = config.setting(
        config.req("SPARK"),
        config.req("IRIS_DATASET_FOR"),
        config.opt("DATA_DIR", "$LOCAL_WORKSPACE/data_dir"),
        config.opt("HDFS_FEATURE_DIR", _HDFS_FEATURE_DIR),
    )

    def __init__(self):
        super(DataPrepare, self).__init__()

    @TaskNode.timeit
    def run(self):
        spark_app = SparkBaseApp()
        spark = spark_app.spark
        hdfs = spark_app.hdfs

        hdfs_save_path = Path(config.HDFS_FEATURE_DIR)
        if hdfs.exists(hdfs_save_path / "_SUCCESS"):
            logging.info("Existing data found: {}".format(hdfs_save_path))
            logging.info("Data is ready, skipping prepare...")
            return

        train_data_path, test_data_path = get_data()
        if config.IRIS_DATASET_FOR == "train":
            local_data_path = train_data_path
        else:
            local_data_path = test_data_path

        pdf = pd.read_csv(local_data_path, header=None)
        pdf.columns = data_header
        pdf["idx"] = [i for i in range(1, len(pdf) + 1)]

        df = spark.createDataFrame(pdf)

        logging.info("Saving dataset to {}".format(hdfs_save_path))
        df.repartition(1).write.save(hdfs_save_path.as_posix())
class DemoInput(InputBase):

    cfg = config.setting(
        config.opt("DemoParam", "DemoDefaultValue")
    )

    def __init__(self, fmap):
        super(DemoInput, self).__init__(fmap)

    def tfr_inputs(self, files):
        ...

    def rdd_inputs(self, rdd):
        ...
class DemoModel(ModelBase):

    cfg = config.setting(
        config.opt("DemoParam", "DemoDefaultValue")
    )

    def __init__(self, fmap):
        super(DemoModel, self).__init__(fmap)

    def build(self):
        ...

    def train(self, feature, label):
        ...

    def evaluate(self, feature, label):
        ...

    def predict(self, feature):
        ...
class _Predict(TaskNode):

    parent_tag = TaskNode.set_tag("_BUILD", "TRAINED_MODEL", "ENCODE_FEATURE")
    output_tag = TaskNode.set_tag("PREDICT_RESULT")

    bind_tasks = "_Build"

    cfg = config.setting(
        config.req("SPARK"),
        config.opt("HDFS_PREDICT_DIR", _HDFS_PREDICT_DIR),
    )

    def __init__(self):
        super(_Predict, self).__init__()

    @TaskNode.timeit
    def run(self):
        spark_app = SparkBaseApp()
        spark = spark_app.spark
        sc = spark_app.sc
        hdfs = spark_app.hdfs

        dirs = config._build_dirs
        tmp_fmap_dir = dirs["tmp_fmap_dir"]
        hdfs_ckpt_dir = dirs["hdfs_ckpt_dir"]
        hdfs_static_dir = dirs["hdfs_static_dir"]

        sc.addFile(hdfs.hdfs_whole_path(hdfs_static_dir.as_posix()),
                   recursive=True)
        sc.addFile(hdfs.hdfs_whole_path(hdfs_ckpt_dir.as_posix()),
                   recursive=True)

        fmap = Fmap.load(tmp_fmap_dir)

        bc_static_model_dir = sc.broadcast("static")
        bc_fmap = sc.broadcast(fmap)
        bc_config = sc.broadcast(config)

        def predict_map(rdd):
            from pyspark.files import SparkFiles

            config = bc_config.value
            fmap = bc_fmap.value
            static_dir = SparkFiles.get(bc_static_model_dir.value)
            ckpt_dir = SparkFiles.get("ckpt")

            from dlflow.mgr import Collector, model
            collect = Collector()
            collect(static_dir, "Static models")

            input_cls = model[config.MODEL.input_name]
            dataset = input_cls(fmap).rdd_inputs(rdd, config.MODEL.batch_size)

            model_cls = model[config.MODEL.model_name]
            model_ins = model_cls(fmap)
            model_ins.load_model(ckpt_dir)

            return model_ins.predict_act(dataset)

        local_model = model[config.MODEL.model_name](fmap)
        df_title = local_model.pkey_names
        df_title.extend(local_model.output_names)

        df = spark.read.parquet(config.HDFS_ENCODE_DIR)

        parallelism = spark.conf.get("spark.default.parallelism", None)
        if parallelism:
            num_partitions = int(parallelism)
        else:
            num_partitions = df.rdd.getNumPartitions()

        pred_df = df.repartition(num_partitions) \
                    .rdd \
                    .mapPartitions(predict_map) \
                    .toDF(df_title)

        hdfs_predict_dir = config.HDFS_PREDICT_DIR
        spark_app.save_compress(pred_df, hdfs_predict_dir)

        logging.info(
            i18n("Predicting result saved to {}").format(hdfs_predict_dir))
        logging.info(i18n("Predicting done."))
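# _Predict ships read-only state (fmap, config, model directories) to the
# executors with broadcast variables and runs inference per partition via
# mapPartitions, turning the yielded rows back into a DataFrame. A minimal,
# self-contained sketch of that broadcast + mapPartitions pattern using toy
# data and a toy "model" (this is not DLFlow API, just the Spark idiom):
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
sc = spark.sparkContext

# Broadcast the model state once instead of shipping it with every task.
bc_weights = sc.broadcast({"w": 2.0, "b": 1.0})

df = spark.createDataFrame([Row(idx=1, x=0.5), Row(idx=2, x=1.5)])

def predict_partition(rows_iter):
    w = bc_weights.value["w"]
    b = bc_weights.value["b"]
    for row in rows_iter:
        # Yield (primary key, prediction), mirroring pkey_names + output_names.
        yield (row["idx"], row["x"] * w + b)

pred_df = df.rdd.mapPartitions(predict_partition).toDF(["idx", "pred"])
pred_df.show()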
class DNNRegression(ModelBase):

    cfg = config.setting(
        config.req("MODEL.learning_rate"),
        config.req("MODEL.layers"),
        config.opt("MODEL.batch_size", 128)
    )

    def __init__(self, fmap):
        super(DNNRegression, self).__init__(fmap)

        self.optimizer = tf.keras.optimizers.Adam(
            learning_rate=config.MODEL.learning_rate)
        self.compute_loss = tf.keras.losses.MeanSquaredError()

        self.mean_loss = tf.keras.metrics.Mean()
        self.metrics = {
            "mean_loss": self.mean_loss,
        }

        self.msg_frac = 100

    def build(self):
        concat_list = self.get_inputs(tp="nums")

        for ctg_inp, depth in self.get_inputs(tp="ctgs", with_depth=True):
            _emb = _Embedding(depth, 4)(ctg_inp)
            concat_list.append(_emb)

        net = tf.concat(concat_list, axis=1)

        for size in config.MODEL.layers:
            net = tf.keras.layers.Dense(size, activation=tf.nn.relu)(net)

        output = tf.keras.layers.Dense(1)(net)
        sigmoid = tf.nn.sigmoid(output)

        self.set_output(output, "output")
        self.set_output(sigmoid, "sigmoid")

    @tf.function
    def train(self, feature, label):
        _label = label["label"]

        with tf.GradientTape() as tape:
            logits, _ = self.model(feature)
            loss = self.compute_loss(_label, logits)

        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(
            zip(grads, self.model.trainable_variables))

        self.mean_loss(loss)

    @tf.function
    def evaluate(self, feature, label):
        _label = label["label"]

        logits, _ = self.model(feature)
        loss = self.compute_loss(_label, logits)

        self.mean_loss(loss)

    @tf.function
    def predict(self, feature):
        pred = self.model(feature)
        return pred
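# DNNRegression (and the classifiers below) call _Embedding(depth, size),
# which is not shown in this section. The following is only a plausible
# minimal sketch, assuming the helper wraps tf.keras.layers.Embedding and
# flattens the result so it can be concatenated with the numeric inputs;
# the real implementation may differ.
import tensorflow as tf

class _Embedding(tf.keras.layers.Layer):
    """Hypothetical stand-in: embeds integer category ids and flattens."""

    def __init__(self, depth, emb_size, **kwargs):
        super(_Embedding, self).__init__(**kwargs)
        self.embedding = tf.keras.layers.Embedding(input_dim=depth,
                                                   output_dim=emb_size)
        self.flatten = tf.keras.layers.Flatten()

    def call(self, inputs):
        # inputs: (batch, n_ctg) int tensor -> (batch, n_ctg * emb_size)
        return self.flatten(self.embedding(inputs))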
class _Encode(TaskNode):

    parent_tag = TaskNode.set_tag("RAW_FEATURE")
    output_tag = TaskNode.set_tag("ENCODE_FEATURE", "TFRECORD_FEATURE")

    cfg = config.setting(
        config.req("SPARK"),
        config.req("BUCKET", None),
        config.opt("HDFS_ENCODE_DIR", _HDFS_ENCODE_DIR),
    )

    def __init__(self):
        super(_Encode, self).__init__()

    @TaskNode.timeit
    def run(self):
        spark_app = SparkBaseApp()
        spark = spark_app.spark
        hdfs = spark_app.hdfs

        if "HDFS_TFRECORD_DIR" in config:
            hdfs_tfrecord_dir = Path(config.HDFS_TFRECORD_DIR)
            if hdfs.exists(hdfs_tfrecord_dir / "_SUCCESS"):
                logging.info(i18n("TFRecords already exist, skip encoding."))
                return
        elif "HDFS_ENCODE_DIR" in config:
            hdfs_encode_dir = Path(config.HDFS_ENCODE_DIR)
            if hdfs.exists(hdfs_encode_dir / "_SUCCESS"):
                logging.info(
                    i18n("Encode result already exists, encoding done."))
                return

        fmap_dir = "fmap_{}".format(config.uuid)
        tmp_fmap_dir = Path(config.LOCAL_TMP_DIR).joinpath(fmap_dir)
        local_fmap_dir = Path(config.LOCAL_FEMODEL_DIR).joinpath("fmap")
        local_norm_dir = Path(config.LOCAL_FEMODEL_DIR).joinpath("norm")
        hdfs_fmap_dir = Path(config.HDFS_FEMODEL_DIR).joinpath("fmap")
        hdfs_norm_dir = Path(config.HDFS_FEMODEL_DIR).joinpath("norm")

        if not hdfs.exists(config.HDFS_FEMODEL_DIR):
            hdfs.mkdirs(hdfs_fmap_dir.parent)

        spark_parser = Parser("spark")
        parser_cls = spark_parser.get_parser()
        normalizer_cls = spark_parser.get_normalizer()

        if "SPARK.spark.default.parallelism" in config:
            parallelism = int(config.SPARK.spark.default.parallelism)
        else:
            parallelism = _DEFAULT_PARALLELISM

        df = spark.read \
                  .parquet(config.HDFS_FEATURE_DIR) \
                  .repartition(parallelism)

        parser = parser_cls()
        if hdfs.exists(hdfs_fmap_dir.joinpath("fmap.meta")):
            logging.info(i18n("Using HDFS fmap: {}").format(hdfs_fmap_dir))
            hdfs.get(hdfs_fmap_dir, tmp_fmap_dir)
        else:
            logging.info(
                i18n("There is no fmap available, start to "
                     "generate fmap by parsing features."))

            primary_keys = config.PRIMARY_KEYS
            labels = config.LABELS
            drop_columns = config.DROP_COLUMNS
            buckets = None if config.BUCKET is None else config.BUCKET.dict

            parser.fit(df,
                       buckets=buckets,
                       drop_columns=drop_columns,
                       primary_keys=primary_keys,
                       labels=labels)
            parser.save(tmp_fmap_dir)

            logging.info(i18n("Put fmap to HDFS: {}").format(hdfs_fmap_dir))
            hdfs.delete(hdfs_fmap_dir)
            hdfs.put(tmp_fmap_dir, hdfs_fmap_dir)

            if local_fmap_dir.exists():
                logging.warning(
                    i18n("Local directory {} already exists, "
                         "it will be overwritten: {}").format(
                        "fmap", local_fmap_dir))
                shutil.rmtree(local_fmap_dir)

            shutil.copytree(tmp_fmap_dir, local_fmap_dir)

        parser.load(tmp_fmap_dir)
        encode_df = parser.transform(df)

        normalizer = normalizer_cls()
        if hdfs.exists(
                hdfs_norm_dir.joinpath("normalizers_metadata", "_SUCCESS")):
            normalizer.load(hdfs_norm_dir)
        else:
            hdfs.mkdirs(hdfs_norm_dir)

            try:
                bucket_conf = config.BUCKET.dict
            except AttributeError:
                bucket_conf = None
                if config.BUCKET is not None:
                    logging.error(
                        i18n("Got wrong type of bucket configuration."))

            normalizer.fit(encode_df, parser.fmap, bucket_conf=bucket_conf)
            normalizer.save(hdfs_norm_dir)

            if local_norm_dir.exists():
                logging.warning(
                    i18n("Local directory {} already exists, "
                         "it will be overwritten: {}").format(
                        "norm", local_norm_dir))
                shutil.rmtree(local_norm_dir)

            hdfs.get(hdfs_norm_dir, local_norm_dir)

        norm_df = normalizer.transform(encode_df)

        spark_app.save_compress(norm_df, config.HDFS_ENCODE_DIR, use_tfr=False)
        if "HDFS_TFRECORD_DIR" in config:
            spark_app.save_compress(norm_df,
                                    config.HDFS_TFRECORD_DIR,
                                    use_tfr=True)

        logging.info(i18n("Encoding done."))
class NNDenseInput(InputBase):

    cfg = config.setting(
        config.opt("MODEL.epochs", 1),
        config.opt("MODEL.batch_size", 1),
        config.opt("MODEL.parallel", 4),
        config.opt("MODEL.shuffle_size", None)
    )

    def __init__(self, fmap):
        super(NNDenseInput, self).__init__(fmap)

    def tfr_inputs(self, files):
        """
        For train and evaluate.
        """
        feature_dict = OrderedDict()

        for fe in self.fmap.primary_keys.get_features():
            feature_dict[fe.name] = self._TF_FEATURE[fe.fetype]([1])

        for fe in self.fmap.labels.get_features():
            feature_dict[fe.name] = self._TF_FEATURE[fe.fetype]([1])

        buckets = self.fmap.get_buckets(drop=PRESET_BUCKETS)
        for bucket in buckets:
            nums_size = bucket.nums.fe_size
            ctgs_size = bucket.ctgs.fe_count

            if nums_size > 0:
                name = "_".join([bucket.name, "nums"])
                feature_dict[name] = self._float_feature([nums_size])

            if ctgs_size > 0:
                name = "_".join([bucket.name, "ctgs"])
                feature_dict[name] = self._int_feature([ctgs_size])

        def _parse_single_example(example):
            feature = tf.io.parse_single_example(example, feature_dict)
            return feature

        parallel = config.MODEL.parallel
        dataset = tf.data \
            .TFRecordDataset(files, num_parallel_reads=parallel) \
            .map(_parse_single_example, num_parallel_calls=parallel) \
            .batch(config.MODEL.batch_size) \
            .repeat(config.MODEL.epochs)

        if config.MODEL.shuffle_size:
            dataset = dataset.shuffle(config.MODEL.shuffle_size)

        return dataset

    def rdd_inputs(self, rdd, batch_size):
        """
        For spark predict.
        """
        primary_keys = []
        features = []
        out_dtype = []
        out_shape = []

        for fe in self.fmap.primary_keys.get_features():
            primary_keys.append(fe.name)
            out_dtype.append(self._TF_TYPE[fe.fetype])
            out_shape.append(tf.TensorShape([fe.size]))

        buckets = self.fmap.get_buckets(drop=PRESET_BUCKETS)
        for bucket in buckets:
            nums_size = bucket.nums.fe_size
            ctgs_size = bucket.ctgs.fe_count

            if nums_size > 0:
                name = "_".join([bucket.name, "nums"])
                features.append(name)
                out_dtype.append(tf.float32)
                out_shape.append(tf.TensorShape(nums_size))

            if ctgs_size > 0:
                name = "_".join([bucket.name, "ctgs"])
                features.append(name)
                out_dtype.append(tf.int64)
                out_shape.append(tf.TensorShape(ctgs_size))

        def rdd_generator():
            for row in rdd:
                row_data = []

                for k in primary_keys:
                    row_data.append([row[k]])

                for k in features:
                    row_data.append(list(row[k]))

                yield tuple(row_data)

        dataset = tf.data.Dataset \
            .from_generator(generator=rdd_generator,
                            output_shapes=tuple(out_shape),
                            output_types=tuple(out_dtype)) \
            .batch(batch_size)

        return dataset
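# rdd_inputs above turns an iterator of Spark rows into a tf.data pipeline via
# Dataset.from_generator. A self-contained toy version of the same pattern,
# with plain dicts standing in for Row objects and made-up feature names
# ("idx", "f0_nums"); it only illustrates the generator/shape/type wiring:
import tensorflow as tf

# Toy "rows": one primary key plus one numeric feature bucket of size 3.
rows = [
    {"idx": 1, "f0_nums": [0.1, 0.2, 0.3]},
    {"idx": 2, "f0_nums": [0.4, 0.5, 0.6]},
]

def row_generator():
    for row in rows:
        yield ([row["idx"]], list(row["f0_nums"]))

dataset = tf.data.Dataset.from_generator(
    generator=row_generator,
    output_types=(tf.int64, tf.float32),
    output_shapes=(tf.TensorShape([1]), tf.TensorShape([3]))
).batch(2)

for batch in dataset:
    print(batch)  # one batch: a (2, 1) int64 tensor and a (2, 3) float32 tensor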
class _Build(TaskNode):

    parent_tag = TaskNode.set_tag("ENCODE_FEATURE", "TFRECORD_FEATURE")
    output_tag = TaskNode.set_tag("_BUILD")

    cfg = config.setting(
        config.req("MODELS_DIR"),
        config.req("MODEL.model_name"),
        config.req("MODEL.input_name"),
        config.opt("HDFS_ENCODE_DIR", _HDFS_ENCODE_DIR),
        config.opt("HDFS_TFRECORD_DIR", _HDFS_TFRECORD_DIR),
    )

    def __init__(self):
        super(_Build, self).__init__()

    @TaskNode.timeit
    def run(self):
        hdfs = HDFS()

        local_static_dir = Path(config.MODELS_DIR).resolve()
        hdfs_static_dir = Path(config.HDFS_MODEL_DIR).joinpath("static")

        fmap_dir = "fmap_{}".format(config.uuid)
        tmp_fmap_dir = Path(config.LOCAL_TMP_DIR).joinpath(fmap_dir)
        local_fmap_dir = Path(config.LOCAL_FEMODEL_DIR).joinpath("fmap")
        hdfs_fmap_dir = Path(config.HDFS_FEMODEL_DIR).joinpath("fmap")

        ckpt_dir = "ckpt_{}".format(config.uuid)
        tmp_ckpt_dir = Path(config.LOCAL_TMP_DIR).joinpath(ckpt_dir)
        local_ckpt_link = Path(config.LOCAL_MODEL_DIR).joinpath("ckpt")
        local_ckpt_dir = Path(config.LOCAL_MODEL_DIR).joinpath(ckpt_dir)
        hdfs_ckpt_dir = Path(config.HDFS_MODEL_DIR).joinpath("ckpt")

        if hdfs.exists(hdfs_fmap_dir.joinpath("fmap.meta")):
            logging.info(
                i18n("Using HDFS fmap: {}").format(hdfs_fmap_dir))
            hdfs.get(hdfs_fmap_dir, tmp_fmap_dir)
        elif local_fmap_dir.joinpath("fmap.meta").exists():
            logging.info(
                i18n("Using local fmap: {}").format(local_fmap_dir))
            shutil.copytree(local_fmap_dir, tmp_fmap_dir)
        else:
            raise FmapNotExists(
                i18n("No available fmap found, please "
                     "process feature encoding first."))

        if hdfs.exists(hdfs_ckpt_dir):
            logging.info(
                i18n("Getting ckpt from HDFS: {}").format(hdfs_ckpt_dir))
            hdfs.get(hdfs_ckpt_dir, tmp_ckpt_dir)
        elif local_ckpt_link.resolve().exists():
            logging.info(
                i18n("Using local ckpt: {}").format(local_ckpt_link))
            shutil.copytree(local_ckpt_link.resolve(), tmp_ckpt_dir)
        else:
            logging.info(
                i18n("No available ckpt found, reinitializing..."))

        config._build_dirs = {
            "fmap_dir": fmap_dir,
            "ckpt_dir": ckpt_dir,
            "hdfs_fmap_dir": hdfs_fmap_dir,
            "hdfs_ckpt_dir": hdfs_ckpt_dir,
            "hdfs_static_dir": hdfs_static_dir,
            "tmp_fmap_dir": tmp_fmap_dir,
            "tmp_ckpt_dir": tmp_ckpt_dir,
            "local_fmap_dir": local_fmap_dir,
            "local_ckpt_dir": local_ckpt_dir,
            "local_static_dir": local_static_dir,
            "local_ckpt_link": local_ckpt_link
        }
class DNNBinaryClassifier(ModelBase):

    cfg = config.setting(
        config.req("MODEL.layers"),
        config.opt("MODEL.learning_rate", 0.001),
        config.opt("MODEL.batch_size", 128)
    )

    def __init__(self, fmap):
        super(DNNBinaryClassifier, self).__init__(fmap)

        self.optimizer = tf.keras.optimizers.Adam(
            learning_rate=config.MODEL.learning_rate)
        self.compute_loss = tf.keras.losses.BinaryCrossentropy(
            from_logits=True)

        self.mean_loss = tf.keras.metrics.Mean()
        self.acc = tf.keras.metrics.BinaryAccuracy()
        self.auc = tf.keras.metrics.AUC()
        self.metrics = {
            "mean_loss": self.mean_loss,
            "acc": self.acc,
            "auc": self.auc
        }

    def build(self):
        concat_list = self.get_inputs(tp="nums")

        for ctg_inp, depth in self.get_inputs(tp="ctgs", with_depth=True):
            _emb = _Embedding(depth, 6)(ctg_inp)
            concat_list.append(_emb)

        net = tf.concat(concat_list, axis=1)

        for size in config.MODEL.layers:
            net = tf.keras.layers.Dense(size, activation=tf.nn.relu)(net)

        logits = tf.keras.layers.Dense(1)(net)
        sigmoid = tf.nn.sigmoid(logits)

        self.set_output(logits, "logits")
        self.set_output(sigmoid, "sigmoid")

    @tf.function
    def train(self, feature, label):
        _label = label["label"]

        with tf.GradientTape() as tape:
            logits, sigmoid = self.model(feature)
            loss = self.compute_loss(_label, logits)

        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(
            zip(grads, self.model.trainable_variables))

        self.mean_loss(loss)
        self.acc(_label, sigmoid)
        self.auc(_label, sigmoid)

    @tf.function
    def evaluate(self, feature, label):
        _label = label["label"]

        logits, sigmoid = self.model(feature)
        loss = self.compute_loss(_label, logits)

        self.mean_loss(loss)
        self.acc(_label, sigmoid)
        self.auc(_label, sigmoid)

    @tf.function
    def predict(self, feature):
        pred = self.model(feature)
        return pred
class MyModel(ModelBase):

    cfg = config.setting(
        config.req("MODEL.learning_rate"),
        config.req("MODEL.classes"),
        config.req("MODEL.layers"),
        config.opt("MODEL.batch_size", 8)
    )

    def __init__(self, fmap):
        super(MyModel, self).__init__(fmap)

        self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
        self.compute_loss = tf.keras.losses.SparseCategoricalCrossentropy()

        self.mean_loss = tf.keras.metrics.Mean()
        self.acc = tf.keras.metrics.SparseCategoricalAccuracy()
        self.metrics = {"mean_loss": self.mean_loss, "acc": self.acc}

        self.msg_frac = 10

    def build(self):
        concat_list = self.get_inputs(tp="nums")

        for ctg_inp, depth in self.get_inputs(tp="ctgs", with_depth=True):
            _emb = _Embedding(depth, 6)(ctg_inp)
            concat_list.append(_emb)

        net = tf.concat(concat_list, axis=1)

        for size in config.MODEL.layers:
            net = tf.keras.layers.Dense(size, activation=tf.nn.relu)(net)

        output = tf.keras.layers.Dense(config.MODEL.classes,
                                       activation=tf.nn.softmax)(net)
        arg_max = tf.argmax(output, axis=1)

        self.set_output(output, "softmax")
        self.set_output(arg_max, "argmax")

    @tf.function
    def train(self, feature, label):
        _label = label["species"]

        with tf.GradientTape() as tape:
            output, _ = self.model(feature)
            loss = self.compute_loss(_label, output)

        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(
            zip(grads, self.model.trainable_variables))

        self.mean_loss(loss)
        self.acc(_label, output)

    @tf.function
    def evaluate(self, feature, label):
        _label = label["species"]

        output, _ = self.model(feature)
        loss = self.compute_loss(_label, output)

        self.mean_loss(loss)
        self.acc(_label, output)

    @tf.function
    def predict(self, feature):
        pred = self.model(feature)
        return pred