import atexit

from pyspark.sql import SparkSession

from pypio.data import PEventStore
from pypio.workflow import CleanupFunctions  # import path assumed


def init():
    # Create (or reuse) a SparkSession and expose the Spark handles and the
    # PredictionIO event store as module-level globals.
    global spark
    spark = SparkSession.builder.getOrCreate()
    global sc
    sc = spark.sparkContext
    global sqlContext
    sqlContext = spark._wrapped
    global p_event_store
    p_event_store = PEventStore(spark._jsparkSession, sqlContext)
    # Register PredictionIO cleanup and SparkContext shutdown to run at exit.
    cleanup_functions = CleanupFunctions(sqlContext)
    atexit.register(lambda: cleanup_functions.run())
    atexit.register(lambda: sc.stop())
    print("Initialized pypio")
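# A hedged usage sketch for init() above (assuming it lives in pypio's package
# init; the app name 'MyApp' is a hypothetical placeholder, and find() is
# called with the same keyword arguments used elsewhere in this document):
#
#   import pypio
#   pypio.init()
#   events = pypio.p_event_store.find('MyApp', entity_type='user',
#                                     target_entity_type='item')
#   events.show()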
import atexit
import sys

import py4j

from pyspark import SparkContext
from pyspark.sql import SparkSession

from pypio.data import PEventStore
from pypio.utils import new_string_array  # import path assumed


# Enable Hive support only if HiveConf is available on the JVM classpath;
# otherwise fall back to a plain SparkSession.
try:
    SparkContext._jvm.org.apache.hadoop.hive.conf.HiveConf()
    spark = SparkSession.builder.enableHiveSupport().getOrCreate()
except py4j.protocol.Py4JError:
    spark = SparkSession.builder.getOrCreate()
except TypeError:
    spark = SparkSession.builder.getOrCreate()

sc = spark.sparkContext
sql = spark.sql
atexit.register(lambda: sc.stop())
sqlContext = spark._wrapped
sqlCtx = sqlContext

p_event_store = PEventStore(spark._jsparkSession, sqlContext)


def run_pio_workflow(model):
    # Hand the trained model to the template engine, then let PredictionIO's
    # CreateWorkflow register the engine instance for serving.
    template_engine = sc._jvm.org.jpioug.template.python.Engine
    template_engine.modelRef().set(model._to_java())
    main_args = new_string_array(sys.argv, sc._gateway)
    create_workflow = sc._jvm.org.apache.predictionio.workflow.CreateWorkflow
    sc.stop()
    create_workflow.main(main_args)


### END: SETUP ###


# In[ ]:
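# A hedged usage sketch of the setup above (assumptions: 'train_df' is a Spark
# DataFrame already derived from p_event_store events with integer 'user' and
# 'item' columns and a numeric 'rating' column; ALS is just one example of a
# Spark ML model whose fitted instance exposes _to_java(), which
# run_pio_workflow expects).
from pyspark.ml.recommendation import ALS

als = ALS(userCol='user', itemCol='item', ratingCol='rating')
model = als.fit(train_df)
run_pio_workflow(model)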
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from pypio.data import PEventStore


p_event_store = PEventStore(spark._jsparkSession, sqlContext)
# Register the trained model and the user/item dictionaries with the engine,
# then hand off to PredictionIO's CreateWorkflow.
template_engine = sc._jvm.org.example.vanilla.VanillaEngine
template_engine.modelRef().set(model)
template_engine.userdictRef().set(userdict)
template_engine.itemdictRef().set(itemdict)
main_args = utils.toJArray(sc._gateway, sc._gateway.jvm.String, sys.argv)
create_workflow = sc._jvm.org.apache.predictionio.workflow.CreateWorkflow
sc.stop()
create_workflow.main(main_args)

# Read 'purchased-event' events for the NCF app from the event store and
# convert them into a pandas ratings frame.
sqlContext = spark._wrapped
sqlCtx = sqlContext

app_name = 'NCF'
event_names = utils.toJArray(sc._gateway, sc._gateway.jvm.String, ['purchased-event'])
p_event_store = PEventStore(spark._jsparkSession, sqlContext)
event_df = p_event_store.find(app_name, entity_type='user', target_entity_type='item',
                              event_names=event_names)
ratings = event_df.toPandas().rename(
    index=str,
    columns={'entityId': 'userid', 'targetEntityId': 'itemid', 'eventTime': 'timestamp'})

# For running with eval only, drop duplicate user-item interactions and users
# with fewer than 2 interactions.
ratings = ratings.drop_duplicates(subset=['userid', 'itemid'], keep='last')
ratings = ratings[ratings.duplicated(subset=['userid'], keep=False)]
ratings['rating'] = 1
# Drop the fixed-length ID prefixes and cast the numeric remainder to int.
ratings['userid'] = pd.to_numeric(ratings['userid'].str[5:]).astype(int)
ratings['itemid'] = pd.to_numeric(ratings['itemid'].str[6:]).astype(int)
ratings['timestamp'] = pd.to_numeric(ratings['timestamp'])
# TODO: Hashing trick here instead of dicts
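# A minimal sketch of the kind of user/item index mappings that the
# userdictRef/itemdictRef calls above could be fed (assumption: the
# dictionaries map raw IDs to contiguous 0-based indices; this is illustrative
# only, and the TODO above notes a hashing trick could replace the dicts).
userdict = {uid: idx for idx, uid in enumerate(ratings['userid'].unique())}
itemdict = {iid: idx for idx, iid in enumerate(ratings['itemid'].unique())}
ratings['user_index'] = ratings['userid'].map(userdict)
ratings['item_index'] = ratings['itemid'].map(itemdict)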
sc = spark.sparkContext
sql = spark.sql


def pio_cleanup():
    # Stop Spark and run PredictionIO's JVM-side cleanup hooks on exit.
    sc.stop()
    sc._jvm.org.apache.predictionio.workflow.CleanupFunctions.run()


atexit.register(pio_cleanup)

sqlContext = spark._wrapped
sqlCtx = sqlContext

p_event_store = PEventStore(spark._jsparkSession, sqlContext)


def run_pio_workflow(model):
    # Same deployment helper as above: register the trained model with the
    # template engine and launch PredictionIO's CreateWorkflow.
    template_engine = sc._jvm.org.jpioug.template.python.Engine
    template_engine.modelRef().set(model._to_java())
    main_args = new_string_array(sys.argv, sc._gateway)
    create_workflow = sc._jvm.org.apache.predictionio.workflow.CreateWorkflow
    sc.stop()
    create_workflow.main(main_args)


### END: SETUP ###


# In[ ]: