def __init__(self, master="local[*]", app_name="optimus", checkpoint=False, path=None, file_system="local",
             verbose=False, dl=False):
    """
    Transform and roll out
    :param master: 'local', 'local[*]' or the ip address of a cluster master
    :param app_name: Spark app name
    :param path: path to the checkpoint folder
    :param checkpoint: If True create a checkpoint folder
    :param file_system: 'local' or 'hadoop'
    """

    if verbose is True:
        logging.basicConfig(format="%(message)s", level=logging.INFO)
    elif verbose is False:
        # Silence INFO-level output when not verbose
        logging.getLogger().propagate = False
        logging.disable(logging.INFO)

    if dl is True:
        Optimus.add_spark_packages(["databricks:spark-deep-learning:1.1.0-spark2.3-s_2.11 pyspark-shell"])
        Spark.instance = Spark(master, app_name)
        from optimus.dl.models import DL
        self.dl = DL()
    else:
        Spark.instance = Spark(master, app_name)

    if path is None:
        path = os.getcwd()

    # Initialize Spark
    logging.info("""
                         ____        __  _
                        / __ \____  / /_(_)___ ___  __  _______
                       / / / / __ \/ __/ / __ `__ \/ / / / ___/
                      / /_/ / /_/ / /_/ / / / / / / /_/ (__  )
                      \____/ .___/\__/_/_/ /_/ /_/\__,_/____/
                          /_/
                          """)

    logging.info(STARTING_OPTIMUS)

    if checkpoint is True:
        self.set_check_point_folder(path, file_system)

    logging.info(SUCCESS)

    self.create = Create()
    self.load = Load()
    self.read = self.spark.read

    self.profiler = Profiler()
    self.ml = ML()
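# --- Usage sketch (illustrative, not from the source) ---
# A minimal, hedged example of how this older constructor is typically invoked;
# the top-level import path is an assumption, and the checkpoint folder falls
# back to the current working directory as documented above.
from optimus import Optimus  # assumed import path

op = Optimus(master="local[*]", app_name="optimus",
             checkpoint=True, file_system="local",
             verbose=True)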
"https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.csv" ) t.create( op, "load.json", "remote_json", "df", "https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.json" ) t.create( op, "load.parquet", "remote_parquet", "df", "https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.parquet" ) # + from optimus.profiler.profiler import Profiler p = Profiler() print(p.run(source_df1, "japanese name")) # - # df_string = source_df.cols.cast("*","str") t.create(source_df, "save.csv", None, None, "test.csv") t.create(None, "save.json", None, None, "test.json") t.create(None, "save.parquet", None, None, "test.parquet") t.run() source_df.table()
def __init__(self, session=None, master="local[*]", app_name="optimus", checkpoint=False, path=None,
             file_system="local", verbose=False, server=False, repositories=None, packages=None, jars=None,
             driver_class_path=None, options=None, additional_options=None, comm=None, load_avro=False,
             cache=True):
    """
    Transform and roll out
    :param master: 'local', 'local[*]' or the ip address of a cluster master
    :param app_name: Spark app name
    :param path: path to the checkpoint folder
    :param checkpoint: If True create a checkpoint folder
    :param file_system: 'local' or 'hadoop'
    :param additional_options:
    :param options: Configuration options that are passed to spark-submit.
        See `the list of possible options
        <https://spark.apache.org/docs/2.4.1/configuration.html#available-properties>`_.
        Note that any options set already through PYSPARK_SUBMIT_ARGS will override these.
    :type options: (dict[str,str])
    :param repositories: List of additional maven repositories for package lookup.
    :type repositories: (list[str])
    :param packages: Spark packages that should be installed.
    :type packages: (list[str])
    :param jars: Full paths to jar files that we want to include to the session.
    :type jars: (list[str])
    """

    self.preserve = False

    Optimus.cache = cache

    if comm is True:
        Comm.instance = Comm()
    else:
        Comm.instance = comm

    if jars is None:
        jars = []

    if driver_class_path is None:
        driver_class_path = []

    if session is None:
        # Creating Spark Session
        # If a Spark session is not passed by argument, create one
        self.master = master
        self.app_name = app_name

        if options is None:
            options = {}

        self.options = options

        # Initialize as lists
        self.packages = val_to_list(packages)
        self.repositories = val_to_list(repositories)
        self.jars = val_to_list(jars)
        self.driver_class_path = val_to_list(driver_class_path)

        self.additional_options = additional_options

        self.verbose(verbose)

        # Because Avro depends on an external package, you can decide whether it should be loaded
        if load_avro == "2.4":
            self._add_spark_packages(["org.apache.spark:spark-avro_2.12:2.4.3"])
        elif load_avro == "2.3":
            self._add_spark_packages(["com.databricks:spark-avro_2.11:4.0.0"])

        jdbc_jars = ["/jars/spark-redis-2.4.1-SNAPSHOT-jar-with-dependencies.jar",
                     "/jars/RedshiftJDBC42-1.2.16.1027.jar",
                     "/jars/mysql-connector-java-8.0.16.jar",
                     "/jars/ojdbc8.jar",
                     "/jars/postgresql-42.2.5.jar",
                     "/jars/presto-jdbc-0.224.jar",
                     "/jars/spark-cassandra-connector_2.11-2.4.1.jar",
                     "/jars/sqlite-jdbc-3.27.2.1.jar",
                     "/jars/mssql-jdbc-7.4.1.jre8.jar"]

        self._add_jars(absolute_path(jdbc_jars, "uri"))
        self._add_driver_class_path(absolute_path(jdbc_jars, "posix"))

        self._create_session()

        if path is None:
            path = os.getcwd()

        if checkpoint is True:
            self._set_check_point_folder(path, file_system)

    else:
        # If a session is passed by arguments just save the reference
        # logger.print("Spark session")
        Spark.instance = Spark().load(session)

    # Initialize Spark
    logger.print("""
                         ____        __  _
                        / __ \____  / /_(_)___ ___  __  _______
                       / / / / __ \/ __/ / __ `__ \/ / / / ___/
                      / /_/ / /_/ / /_/ / / / / / / /_/ (__  )
                      \____/ .___/\__/_/_/ /_/ /_/\__,_/____/
                          /_/
                          """)

    logger.print(STARTING_OPTIMUS)

    # Pickling
    Spark.instance.sc.addPyFile(absolute_path("/infer.py"))

    if server:
        logger.print("Starting Optimus Server...")
        s = Server()
        s.start()
        self.server_instance = s

    logger.print(SUCCESS)

    self.create = Create()
    self.load = Load()
    self.read = self.spark.read

    # Create singleton profiler
    Profiler.instance = Profiler()
    self.profiler = Profiler.instance
    self.ml = ML()

    # Set global output as html
    self.output("html")
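# --- Usage sketch (illustrative, not from the source) ---
# Two hedged ways of calling this constructor: letting Optimus create the Spark
# session itself (optionally pulling in the Avro package for Spark 2.4), or
# handing it a session that already exists. The import path and the `spark`
# session object are assumptions.
from optimus import Optimus  # assumed import path

op = Optimus(app_name="optimus", verbose=True, load_avro="2.4")

# Reusing an existing SparkSession instead of creating a new one:
# op = Optimus(session=spark)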
def func(col_name, attrs):
    return F.col(col_name) * 2

numeric_col = "height(ft)"
numeric_col_B = "rank"
numeric_col_C = "rank"
string_col = "function"
date_col = "date arrival"
date_col_B = "last date seen"
new_col = "new col"
array_col = "attributes"
# -

from optimus.profiler.profiler import Profiler

p = Profiler()

p.run(source_df, "*")

t.create(p, "dataset", None, 'json', None, source_df, "*")

t.run()

mismatch = {"names": "dd/mm/yyyy",
            "height(ft)": r'^([0-2][0-9]|(3)[0-1])(\/)(((0)[0-9])|((1)[0-2]))(\/)\d{4}$',
            "function": "yyyy-mm-dd"}

t.create(p, "dataset",
def func(col_name, attrs):
    return F.col(col_name) * 2

numeric_col = "height(ft)"
numeric_col_B = "rank"
numeric_col_C = "rank"
string_col = "function"
date_col = "date arrival"
date_col_B = "last date seen"
new_col = "new col"
array_col = "attributes"
# -

from optimus.profiler.profiler import Profiler

p = Profiler()

from optimus.ml import feature as fe

t.create(p, "minimal_stats", None, 'json', None, source_df, "*")

t.create(p, "to_json", None, 'json', None, source_df, "*")

t.create(p, "columns", None, 'json', None, source_df, "*")

t.create(p, "general_stats", None, 'json', None, source_df, "*")

t.run()

source_df.sample()
def __init__(self, session=None, master="local[*]", app_name="optimus", checkpoint=False, path=None,
             file_system="local", verbose=False, dl=False, server=False,
             repositories=None, packages=None, jars=None,
             options=None, additional_options=None,
             enricher_host="localhost", enricher_port=27017,
             queue_url=None, queue_exchange=None, queue_routing_key="optimus"):
    """
    Transform and roll out
    :param master: 'local', 'local[*]' or the ip address of a cluster master
    :param app_name: Spark app name
    :param path: path to the checkpoint folder
    :param checkpoint: If True create a checkpoint folder
    :param file_system: 'local' or 'hadoop'
    :param additional_options:
    :param options: Configuration options that are passed to spark-submit.
        See `the list of possible options
        <https://spark.apache.org/docs/2.1.0/configuration.html#available-properties>`_.
        Note that any options set already through PYSPARK_SUBMIT_ARGS will override these.
    :type options: (dict[str,str])
    :param repositories: List of additional maven repositories for package lookup.
    :type repositories: (list[str])
    :param packages: Spark packages that should be installed.
    :type packages: (list[str])
    :param jars: Full paths to jar files that we want to include to the session.
    :type jars: (list[str])
    """

    if session is None:
        # print("Creating Spark Session...")
        # If a Spark session is not passed by argument, create it
        self.master = master
        self.app_name = app_name

        if options is None:
            options = {}

        self.options = options

        if packages is None:
            packages = []
        else:
            packages = val_to_list(packages)
        self.packages = packages

        self.repositories = repositories

        if jars is None:
            jars = {}
        self.jars = jars

        self.additional_options = additional_options

        self.verbose(verbose)

        # Load Avro.
        # TODO: if the Spark 2.4 version is going to be used this is not necessary.
        #  Maybe we can check a priori which version of Spark is going to be used
        # self._add_spark_packages(["com.databricks:spark-avro_2.11:4.0.0"])

        if dl is True:
            self._add_spark_packages(["databricks:spark-deep-learning:1.5.0-spark2.4-s_2.11"])
            self._start_session()
            from optimus.dl.models import DL
            self.dl = DL()
        else:
            self._start_session()

        if path is None:
            path = os.getcwd()

        if checkpoint is True:
            self._set_check_point_folder(path, file_system)

    else:
        # If a session is passed by arguments just save the reference
        Spark.instance = session

    # Initialize Spark
    logger.print("""
                         ____        __  _
                        / __ \____  / /_(_)___ ___  __  _______
                       / / / / __ \/ __/ / __ `__ \/ / / / ___/
                      / /_/ / /_/ / /_/ / / / / / / /_/ (__  )
                      \____/ .___/\__/_/_/ /_/ /_/\__,_/____/
                          /_/
                          """)

    logger.print(STARTING_OPTIMUS)

    if server:
        logger.print("Starting Optimus Server...")
        s = Server()
        s.start()
        self.server_instance = s

    logger.print(SUCCESS)

    self.create = Create()
    self.load = Load()
    self.read = self.spark.read
    self.profiler = Profiler(queue_url=queue_url,
                             queue_exchange=queue_exchange,
                             queue_routing_key=queue_routing_key)
    self.ml = ML()

    self.enricher = Enricher(op=self,
                             host=enricher_host,
                             port=enricher_port,
                             )
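# --- Usage sketch (illustrative, not from the source) ---
# Hedged example of this variant: dl=True pulls in the spark-deep-learning
# package before the session starts, and the enricher connects to a MongoDB
# host/port. The import path and the localhost values are assumptions.
from optimus import Optimus  # assumed import path

op = Optimus(app_name="optimus", dl=True,
             enricher_host="localhost", enricher_port=27017)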
def __init__(self, session=None, master="local[*]", app_name="optimus", checkpoint=False, path=None,
             file_system="local", verbose=False, server=False,
             repositories=None, packages=None, jars=None, driver_class_path=None,
             options=None, additional_options=None,
             queue_url=None, queue_exchange=None, queue_routing_key="optimus"):
    """
    Transform and roll out
    :param master: 'local', 'local[*]' or the ip address of a cluster master
    :param app_name: Spark app name
    :param path: path to the checkpoint folder
    :param checkpoint: If True create a checkpoint folder
    :param file_system: 'local' or 'hadoop'
    :param additional_options:
    :param options: Configuration options that are passed to spark-submit.
        See `the list of possible options
        <https://spark.apache.org/docs/2.4.1/configuration.html#available-properties>`_.
        Note that any options set already through PYSPARK_SUBMIT_ARGS will override these.
    :type options: (dict[str,str])
    :param repositories: List of additional maven repositories for package lookup.
    :type repositories: (list[str])
    :param packages: Spark packages that should be installed.
    :type packages: (list[str])
    :param jars: Full paths to jar files that we want to include to the session.
    :type jars: (list[str])
    """

    if session is None:
        # print("Creating Spark Session...")
        # If a Spark session is not passed by argument, create it
        self.master = master
        self.app_name = app_name

        if options is None:
            options = {}

        self.options = options

        if packages is None:
            packages = []
        else:
            packages = val_to_list(packages)
        self.packages = packages

        self.repositories = repositories

        # Jars
        self.jars = jars
        self._add_jars(jars)

        # Driver class path
        self.driver_class_path = driver_class_path
        self._add_driver_class_path(driver_class_path)

        # Additional options
        self.additional_options = additional_options

        self.verbose(verbose)

        # Load Avro.
        # TODO: if the Spark 2.4 version is going to be used this is not necessary.
        #  Maybe we can check a priori which version of Spark is going to be used
        self._add_spark_packages(["com.databricks:spark-avro_2.11:4.0.0"])

        def c(files):
            return [Path(base_path + file).as_posix() for file in files]

        # Resolve the bundled jars relative to this module; kept in a separate
        # variable so the checkpoint `path` argument is not overwritten
        base_path = os.path.dirname(os.path.abspath(__file__))

        # Add databases jars
        self._add_jars(["../jars/RedshiftJDBC42-1.2.16.1027.jar",
                        "../jars/mysql-connector-java-8.0.16.jar",
                        "../jars/ojdbc7.jar",
                        "../jars/postgresql-42.2.5.jar"])

        self._add_driver_class_path(
            c(["//jars//RedshiftJDBC42-1.2.16.1027.jar",
               "//jars//mysql-connector-java-8.0.16.jar",
               "//jars//ojdbc7.jar",
               "//jars//postgresql-42.2.5.jar"]))

        self._start_session()

        if path is None:
            path = os.getcwd()

        if checkpoint is True:
            self._set_check_point_folder(path, file_system)

    else:
        # If a session is passed by arguments just save the reference
        Spark.instance = Spark().load(session)

    # Initialize Spark
    logger.print("""
                         ____        __  _
                        / __ \____  / /_(_)___ ___  __  _______
                       / / / / __ \/ __/ / __ `__ \/ / / / ___/
                      / /_/ / /_/ / /_/ / / / / / / /_/ (__  )
                      \____/ .___/\__/_/_/ /_/ /_/\__,_/____/
                          /_/
                          """)

    logger.print(STARTING_OPTIMUS)

    if server:
        logger.print("Starting Optimus Server...")
        s = Server()
        s.start()
        self.server_instance = s

    logger.print(SUCCESS)

    self.create = Create()
    self.load = Load()
    self.read = self.spark.read
    self.profiler = Profiler(queue_url=queue_url,
                             queue_exchange=queue_exchange,
                             queue_routing_key=queue_routing_key)
    self.ml = ML()

    # self._load_css()

    # Set global output as html
    self.output("html")
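# --- Usage sketch (illustrative, not from the source) ---
# Hedged example wiring the profiler to a message queue; the AMQP URL, exchange
# and routing key are placeholders, not values taken from the source, and the
# import path is assumed.
from optimus import Optimus  # assumed import path

op = Optimus(app_name="optimus",
             queue_url="amqp://guest:guest@localhost:5672",
             queue_exchange="optimus",
             queue_routing_key="optimus")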