Example #1
    def run(self):

        spark_app = SparkBaseApp()
        spark = spark_app.spark
        hdfs = spark_app.hdfs

        hdfs_save_path = Path(config.HDFS_FEATURE_DIR)

        if hdfs.exists(hdfs_save_path / "_SUCCESS"):
            logging.info("Exists data found. {}".format(hdfs_save_path))
            logging.info("Data is ready! Skip prepare...")
            return

        train_data_path, test_data_path = get_data()
        if config.IRIS_DATASET_FOR == "train":
            local_data_path = train_data_path
        else:
            local_data_path = test_data_path

        pdf = pd.read_csv(local_data_path, header=None)
        pdf.columns = data_header
        pdf["idx"] = [i for i in range(1, len(pdf) + 1)]

        df = spark.createDataFrame(pdf)

        logging.info("Saving Dataset to {}".format(hdfs_save_path))
        df.repartition(1).write.save(hdfs_save_path.as_posix())
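Note that write.save() with no format argument writes parquet, so the prepared features can be read straight back. A minimal standalone sketch of the same prepare-and-reload flow (the toy frame and the /tmp path are illustrative assumptions, not part of dlflow):

from pyspark.sql import SparkSession
import pandas as pd

spark = SparkSession.builder.master("local[1]").appName("prepare-sketch").getOrCreate()

# A toy frame standing in for the parsed iris CSV.
pdf = pd.DataFrame({"sepal_length": [5.1, 4.9], "label": ["setosa", "setosa"]})
pdf["idx"] = list(range(1, len(pdf) + 1))

df = spark.createDataFrame(pdf)
df.repartition(1).write.mode("overwrite").save("/tmp/iris_features")  # parquet by default

# A _SUCCESS marker is written next to the part files, which is what run() checks for.
spark.read.load("/tmp/iris_features").show()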
Example #2
    def __init__(self):
        super(SparkNormalizer, self).__init__()

        self._normalizer_map = None
        self.default_method = "minmax"

        self._methods = {
            "minmax": MinMaxScaler,
            "zscore": StandardScaler,
            "pnorm": Normalizer
        }

        self._models = {
            "minmax": MinMaxScalerModel,
            "zscore": StandardScalerModel,
            "pnorm": Normalizer
        }

        self._params = {
            "minmax": {"min": 0.0, "max": 1.0},
            "zscore": {"withMean": True, "withStd": True},
            "pnorm": {"p": 2.0}
        }
        self._spark_app = SparkBaseApp()
        self._sc = self._spark_app.sc
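The three maps above pair each method name with a pyspark.ml estimator class, its fitted-model class, and default parameters. A minimal self-contained sketch (the toy DataFrame and column names are assumptions, not dlflow code) of how such a map can drive scaling:

from pyspark.sql import SparkSession
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.master("local[1]").appName("norm-sketch").getOrCreate()

# Two rows with a single vector column, as pyspark.ml scalers expect.
df = spark.createDataFrame(
    [(Vectors.dense([1.0, 10.0]),), (Vectors.dense([3.0, 30.0]),)],
    ["features"])

# Look up the estimator class and its parameters the same way the maps above would.
methods = {"minmax": MinMaxScaler}
params = {"minmax": {"min": 0.0, "max": 1.0}}

scaler = methods["minmax"](inputCol="features", outputCol="scaled", **params["minmax"])
model = scaler.fit(df)                    # a MinMaxScalerModel, the "minmax" entry of _models
model.transform(df).show(truncate=False)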
Example #3
File: root.py Project: zhenglm/dlflow
    def run(self):
        def solve_local_path(raw_path):
            path = Path(raw_path).resolve()
            if not path.is_dir():
                path.mkdir(parents=True)

            return path.as_posix()

        config.LOCAL_WORKSPACE = solve_local_path(config.LOCAL_WORKSPACE)
        config.LOCAL_MODEL_DIR = solve_local_path(config.LOCAL_MODEL_DIR)
        config.LOCAL_FEMODEL_DIR = solve_local_path(config.LOCAL_FEMODEL_DIR)
        config.LOCAL_TMP_DIR = solve_local_path(config.LOCAL_TMP_DIR)

        def seq_conf_parser(v, sign=","):
            if isinstance(v, str):
                iter_v = v.split(sign)
            elif isinstance(v, list):
                iter_v = v
            else:
                iter_v = []

            return [i for i in map(lambda x: x.strip(), iter_v) if i]

        config.PRIMARY_KEYS = seq_conf_parser(config.PRIMARY_KEYS)
        config.LABELS = seq_conf_parser(config.LABELS)
        config.DROP_COLUMNS = seq_conf_parser(config.DROP_COLUMNS)

        if "SPARK" in config:
            app_name = ".".join(["DLFlow", config.uuid])
            spark_conf = config.SPARK.dense_dict if config.SPARK else {}
            spark_app = SparkBaseApp()
            spark_app.initialize_spark(app_name, spark_conf)

        else:
            hdfs = HDFS()
            hdfs.initialize_hdfs()
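seq_conf_parser above accepts either a comma-separated string or a list and returns a cleaned list of tokens; a quick standalone check of that behavior (the sample inputs are made up):

def seq_conf_parser(v, sign=","):
    if isinstance(v, str):
        iter_v = v.split(sign)
    elif isinstance(v, list):
        iter_v = v
    else:
        iter_v = []
    return [i for i in map(lambda x: x.strip(), iter_v) if i]

print(seq_conf_parser("uid, label , ,"))  # ['uid', 'label']
print(seq_conf_parser(["a", " b "]))      # ['a', 'b']
print(seq_conf_parser(None))              # []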
Example #4
    def __init__(self):
        super(SparkFeParser, self).__init__()
        self._spark_app = SparkBaseApp()
        self._spark = self._spark_app.spark
Example #5
File: predict.py Project: zhenglm/dlflow
    def run(self):
        spark_app = SparkBaseApp()
        spark = spark_app.spark
        sc = spark_app.sc
        hdfs = spark_app.hdfs

        dirs = config._build_dirs
        tmp_fmap_dir = dirs["tmp_fmap_dir"]
        hdfs_ckpt_dir = dirs["hdfs_ckpt_dir"]
        hdfs_static_dir = dirs["hdfs_static_dir"]

        sc.addFile(hdfs.hdfs_whole_path(hdfs_static_dir.as_posix()),
                   recursive=True)
        sc.addFile(hdfs.hdfs_whole_path(hdfs_ckpt_dir.as_posix()),
                   recursive=True)

        fmap = Fmap.load(tmp_fmap_dir)

        bc_static_model_dir = sc.broadcast("static")
        bc_fmap = sc.broadcast(fmap)
        bc_config = sc.broadcast(config)

        def predict_map(rdd):
            from pyspark.files import SparkFiles

            config = bc_config.value
            fmap = bc_fmap.value
            static_dir = SparkFiles.get(bc_static_model_dir.value)
            ckpt_dir = SparkFiles.get("ckpt")

            from dlflow.mgr import Collector, model
            collect = Collector()
            collect(static_dir, "Static models")

            input_cls = model[config.MODEL.input_name]
            dataset = input_cls(fmap).rdd_inputs(rdd, config.MODEL.batch_size)

            model_cls = model[config.MODEL.model_name]
            model_ins = model_cls(fmap)
            model_ins.load_model(ckpt_dir)

            return model_ins.predict_act(dataset)

        local_model = model[config.MODEL.model_name](fmap)
        df_title = local_model.pkey_names
        df_title.extend(local_model.output_names)

        df = spark.read.parquet(config.HDFS_ENCODE_DIR)

        parallelism = spark.conf.get("spark.default.parallelism", None)
        if parallelism:
            num_partitions = int(parallelism)
        else:
            num_partitions = df.rdd.getNumPartitions()

        pred_df = df.repartition(num_partitions) \
                    .rdd \
                    .mapPartitions(predict_map) \
                    .toDF(df_title)

        hdfs_predict_dir = config.HDFS_PREDICT_DIR
        spark_app.save_compress(pred_df, hdfs_predict_dir)

        logging.info(
            i18n("Predicting result save to {}").format(hdfs_predict_dir))
        logging.info(i18n("Predicting Done."))
Example #6
File: workflow.py Project: zhenglm/dlflow
    def close(self):
        from dlflow.utils.sparkapp import SparkBaseApp, HDFS

        SparkBaseApp().close()
        HDFS().close()
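Across these examples SparkBaseApp() and HDFS() are constructed freely wherever they are needed and then closed once here, which relies on them behaving as singletons that always hand back the same underlying session and contexts. A minimal sketch of one common way to get that behavior (an assumption about how dlflow implements it, not taken from its source):

class SingletonExample:
    """Every call to the constructor returns the same instance."""
    _instance = None

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

a = SingletonExample()
b = SingletonExample()
assert a is b   # repeated construction yields the shared object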
Example #7
    def run(self):
        spark_app = SparkBaseApp()
        spark = spark_app.spark
        hdfs = spark_app.hdfs

        if "HDFS_TFRECORD_DIR" in config:
            hdfs_tfrecord_dir = Path(config.HDFS_TFRECORD_DIR)
            if hdfs.exists(hdfs_tfrecord_dir / "_SUCCESS"):
                logging.info(i18n("TFRecords already exists, skip encoding."))
                return

        elif "HDFS_ENCODE_DIR" in config:
            hdfs_encode_dir = Path(config.HDFS_ENCODE_DIR)
            if hdfs.exists(hdfs_encode_dir / "_SUCCESS"):
                logging.info(
                    i18n("Encode result already exists, encoding done."))
                return

        fmap_dir = "fmap_{}".format(config.uuid)
        tmp_fmap_dir = Path(config.LOCAL_TMP_DIR).joinpath(fmap_dir)

        local_fmap_dir = Path(config.LOCAL_FEMODEL_DIR).joinpath("fmap")
        local_norm_dir = Path(config.LOCAL_FEMODEL_DIR).joinpath("norm")

        hdfs_fmap_dir = Path(config.HDFS_FEMODEL_DIR).joinpath("fmap")
        hdfs_norm_dir = Path(config.HDFS_FEMODEL_DIR).joinpath("norm")

        if not hdfs.exists(config.HDFS_FEMODEL_DIR):
            hdfs.mkdirs(hdfs_fmap_dir.parent)

        spark_parser = Parser("spark")
        parser_cls = spark_parser.get_parser()
        normalizer_cls = spark_parser.get_normalizer()

        if "SPARK.spark.default.parallelism" in config:
            parallelism = int(config.SPARK.spark.default.parallelism)
        else:
            parallelism = _DEFAULT_PARALLELISM

        df = spark.read \
                  .parquet(config.HDFS_FEATURE_DIR) \
                  .repartition(parallelism)

        parser = parser_cls()

        if hdfs.exists(hdfs_fmap_dir.joinpath("fmap.meta")):
            logging.info(i18n("Using HDFS fmap: {}").format(hdfs_fmap_dir))
            hdfs.get(hdfs_fmap_dir, tmp_fmap_dir)

        else:
            logging.info(
                i18n("There is no fmap available, start to "
                     "generate fmap by parsing features."))

            primary_keys = config.PRIMARY_KEYS
            labels = config.LABELS
            drop_columns = config.DROP_COLUMNS
            buckets = None if config.BUCKET is None else config.BUCKET.dict

            parser.fit(df,
                       buckets=buckets,
                       drop_columns=drop_columns,
                       primary_keys=primary_keys,
                       labels=labels)
            parser.save(tmp_fmap_dir)

            logging.info(i18n("Put fmap to HDFS: {}").format(hdfs_fmap_dir))
            hdfs.delete(hdfs_fmap_dir)
            hdfs.put(tmp_fmap_dir, hdfs_fmap_dir)

            if local_fmap_dir.exists():
                logging.warning(
                    i18n("Local directory {} already exists, "
                         "it will be overwritten: {}").format(
                             "fmap", local_fmap_dir))
                shutil.rmtree(local_fmap_dir)
            shutil.copytree(tmp_fmap_dir, local_fmap_dir)

        parser.load(tmp_fmap_dir)
        encode_df = parser.transform(df)

        normalizer = normalizer_cls()

        if hdfs.exists(
                hdfs_norm_dir.joinpath("normalizers_metadata", "_SUCCESS")):
            normalizer.load(hdfs_norm_dir)

        else:
            hdfs.mkdirs(hdfs_norm_dir)

            try:
                bucket_conf = config.BUCKET.dict
            except AttributeError:
                bucket_conf = None
                if config.BUCKET is not None:
                    logging.error(i18n("Get wrong type bucket configuration."))

            normalizer.fit(encode_df, parser.fmap, bucket_conf=bucket_conf)
            normalizer.save(hdfs_norm_dir)

            if local_norm_dir.exists():
                logging.warning(
                    i18n("Local directory {} already exists, "
                         "it will be overwritten: {}").format(
                             "norm", local_norm_dir))
                shutil.rmtree(local_norm_dir)

            hdfs.get(hdfs_norm_dir, local_norm_dir)

        norm_df = normalizer.transform(encode_df)

        spark_app.save_compress(norm_df, config.HDFS_ENCODE_DIR, use_tfr=False)

        if "HDFS_TFRECORD_DIR" in config:
            spark_app.save_compress(norm_df,
                                    config.HDFS_TFRECORD_DIR,
                                    use_tfr=True)

        logging.info(i18n("Encoding done."))