def test_gen_cats_from_items(self):
    from pyspark.sql.functions import col
    from pyspark.sql.types import StructType, StructField, StringType, \
        ArrayType, IntegerType
    from zoo.friesian.feature import FeatureTable
    from zoo.orca import OrcaContext

    spark = OrcaContext.get_spark_session()
    sc = OrcaContext.get_spark_context()
    data = [("jack", [1, 2, 3, 4, 5]),
            ("alice", [4, 5, 6, 7, 8]),
            ("rose", [1, 2])]
    schema = StructType([
        StructField("name", StringType(), True),
        StructField("item_hist_seq", ArrayType(IntegerType()), True)])
    df = spark.createDataFrame(data, schema)
    df.filter("name like '%alice%'").show()
    # Item-to-category mapping table.
    df2 = sc \
        .parallelize([(0, 0), (1, 0), (2, 0), (3, 0), (4, 1),
                      (5, 1), (6, 1), (8, 2), (9, 2)]) \
        .toDF(["item", "category"]) \
        .withColumn("item", col("item").cast("Integer")) \
        .withColumn("category", col("category").cast("Integer"))
    tbl = FeatureTable(df)
    # Generate negative item history sequences (4 negatives per item,
    # item_size=9).
    tbl2 = tbl.add_neg_hist_seq(9, "item_hist_seq", 4)
    # Join category features onto both the positive and negative sequences.
    tbl3 = tbl2.add_feature(["item_hist_seq", "neg_item_hist_seq"],
                            FeatureTable(df2), 5)
    assert tbl3.df.select("category_hist_seq").count() == 3
    assert tbl3.df.select("neg_category_hist_seq").count() == 3
    assert tbl3.df.filter("name like '%alice%'") \
        .select("neg_category_hist_seq").count() == 1
    assert tbl3.df.filter("name == 'rose'") \
        .select("neg_category_hist_seq").count() == 1
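# The test above exercises FeatureTable.add_neg_hist_seq. The following is a
# minimal sketch (an assumption, not the library's implementation) of the
# underlying idea: for every item in a user's history, draw `neg_num` random
# item ids that differ from the positive item. `neg_hist_seq_udf` is a
# hypothetical helper introduced only for illustration.
import random

from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, IntegerType


def neg_hist_seq_udf(item_size, neg_num):
    def sample(hist):
        negs = []
        for item in hist:
            row = []
            while len(row) < neg_num:
                cand = random.randrange(item_size)
                if cand != item:  # reject the positive item itself
                    row.append(cand)
            negs.append(row)
        return negs
    return udf(sample, ArrayType(ArrayType(IntegerType())))

# Example usage against the `df` built in the test:
#   df.withColumn("neg_item_hist_seq",
#                 neg_hist_seq_udf(9, 4)(col("item_hist_seq")))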
def __init__(self, hosts=None, processes_per_node=1, env=None):
    driver_ip = get_node_ip()
    if hosts is None:  # Single node
        self.hosts = [driver_ip]
    elif hosts == "all":  # All executor nodes in the cluster
        def get_ip(_):
            yield get_node_ip()

        from bigdl.util.common import get_node_and_core_number
        from zoo.orca import OrcaContext
        sc = OrcaContext.get_spark_context()
        node_num, core_num = get_node_and_core_number()
        total_cores = node_num * core_num
        # Launch one barrier task per core slot so that every executor node
        # runs at least one task, then collect the distinct node IPs.
        self.hosts = list(set(
            sc.range(0, total_cores, numSlices=total_cores)
              .barrier()
              .mapPartitions(get_ip)
              .collect()))
    else:  # User-specified hosts, assumed to be non-duplicate
        assert isinstance(hosts, list)
        self.hosts = hosts
    self.master = self.hosts[0]
    print("Master: ", self.master)
    self.remote_hosts = [host for host in self.hosts if host != driver_ip]
    print("Remote hosts: ", self.remote_hosts)
    print("Hosts: ", self.hosts)
    self.processes_per_node = processes_per_node
    self.env = env if env else {}
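# Standalone sketch of the executor-IP discovery used above: schedule one
# barrier task per slot so every executor node runs at least one task, then
# collect the distinct node IPs. `lookup_node_ip` is a hypothetical stand-in
# for the module's `get_node_ip` helper.
import socket


def lookup_node_ip():
    # Assumption: the node's hostname resolves to its cluster-visible IP.
    return socket.gethostbyname(socket.gethostname())


def collect_node_ips(sc, num_slots):
    def get_ip(_):
        yield lookup_node_ip()

    # barrier() forces all `num_slots` tasks to be scheduled concurrently,
    # so no executor is skipped because another finished its partitions first.
    return list(set(
        sc.range(0, num_slots, numSlices=num_slots)
          .barrier()
          .mapPartitions(get_ip)
          .collect()))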
def test_dataframe_with_empty_partition(self):
    from zoo.orca import OrcaContext
    sc = OrcaContext.get_spark_context()

    rdd = sc.range(0, 10)
    # Force one of the four partitions to be empty.
    rdd_with_empty = rdd.repartition(4) \
        .mapPartitionsWithIndex(lambda idx, part: [] if idx == 0 else part)

    import numpy as np
    from pyspark.sql import SparkSession
    from pyspark.ml.linalg import DenseVector
    spark = SparkSession(sc)
    # np.float is a deprecated alias for the builtin float (i.e. float64).
    df = rdd_with_empty \
        .map(lambda x: (DenseVector(np.random.randn(1,).astype(float)),
                        int(np.random.randint(0, 1, size=())))) \
        .toDF(["feature", "label"])

    config = {"lr": 0.8}
    # Estimator and model_creator are defined at the module level of this
    # test file.
    trainer = Estimator.from_keras(
        model_creator=model_creator,
        verbose=True,
        config=config,
        workers_per_node=2)

    trainer.fit(df, epochs=1, batch_size=4, steps_per_epoch=25,
                feature_cols=["feature"],
                label_cols=["label"])
    trainer.evaluate(df, batch_size=4, num_steps=25,
                     feature_cols=["feature"],
                     label_cols=["label"])
    trainer.predict(df, feature_cols=["feature"]).collect()
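# The empty-partition setup can be checked in isolation: repartition to 4
# partitions, drop all rows from partition 0, and inspect the partition sizes
# with glom(). A self-contained sketch assuming a plain local SparkSession
# rather than OrcaContext:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()
sc = spark.sparkContext
rdd = sc.range(0, 10).repartition(4) \
    .mapPartitionsWithIndex(lambda idx, part: [] if idx == 0 else part)
print([len(p) for p in rdd.glom().collect()])  # one entry is 0
spark.stop()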
def __init__(self, hosts=None, processes_per_node=1, env=None):
    driver_ip = get_node_ip()
    if hosts is None:  # Single node
        self.hosts = [driver_ip]
    elif hosts == "all":  # All executor nodes in the cluster
        def get_ip(_):
            yield get_node_ip()

        from zoo.orca import OrcaContext
        sc = OrcaContext.get_spark_context()
        master = sc.getConf().get("spark.master")
        if master == "local" or master.startswith("local["):
            num_executors = 1
        else:
            num_executors = int(
                sc.getConf().get("spark.executor.instances"))
        # One barrier task per executor; collect the distinct node IPs.
        self.hosts = list(set(
            sc.range(0, num_executors, numSlices=num_executors)
              .barrier()
              .mapPartitions(get_ip)
              .collect()))
    else:  # User-specified hosts, assumed to be non-duplicate
        assert isinstance(hosts, list)
        self.hosts = hosts
    self.master = self.hosts[0]
    print("Master: ", self.master)
    self.remote_hosts = [host for host in self.hosts if host != driver_ip]
    print("Remote hosts: ", self.remote_hosts)
    print("Hosts: ", self.hosts)
    self.processes_per_node = processes_per_node
    self.env = env if env else {}
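# The two __init__ variants above differ only in how many barrier tasks they
# launch for host discovery: the first sizes by total cores
# (node_num * core_num), this one by executor count. A hedged helper
# (assumption: the same Spark conf keys as used above) mirroring the
# executor-count logic:
def resolve_num_executors(sc):
    master = sc.getConf().get("spark.master")
    if master == "local" or master.startswith("local["):
        return 1  # local mode: the driver is the only node
    # Assumes spark.executor.instances is set, i.e. static allocation.
    return int(sc.getConf().get("spark.executor.instances"))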