def write(self): """ Interface for saving the content of the :class:`DataFrame` out into external storage. :return: :class:`DataFrameWriter` """ return DataFrameWriter(self)
def update_relation(self):
    # Refresh the relations among the various account types
    try:
        # Build the saving path on HDFS
        path = self.spark.hdfs_base + 'relation'
        relation_acc = self.spark.load_from_db2('T_POINT_RELATION_ACC')
        relation_cust = self.spark.load_from_db2('T_POINT_RELATION_CUST')
        relation_join = relation_acc.join(relation_cust, 'CUST_ID', 'outer')
        writer = DataFrameWriter(relation_join)
        writer.save(path, mode='overwrite')
        Global.logger.info('Updated the account-relation DataFrame successfully.')
        return 'success!'
    except Exception as e:
        Global.logger.error('Failed to update the account-relation DataFrame: %s' % e)
        return 'failure! please run again.'
def write(self): """ Interface for saving the content of the :class:`DataFrame` out into external storage. :return :class:`DataFrameWriter` .. note:: Experimental >>> df.write <pyspark.sql.readwriter.DataFrameWriter object at ...> """ return DataFrameWriter(self)
def process(time, rdd):
    print("========= %s =========" % str(time))
    os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.AL32UTF8'
    try:
        spark = SparkSession.builder.config(conf=rdd.context.getConf()).getOrCreate()

        # Turn the batch back into JSON strings and load it as a DataFrame.
        rowRdd = rdd.map(lambda w: json.dumps(w))
        wplocation = spark.read.json(rowRdd)
        print('wplocation', type(wplocation), wplocation.dtypes)
        wplocation.createOrReplaceTempView("tmp_kafka_wp")

        # Join the incoming Kafka batch with the device dictionary.
        sqlDF = spark.sql(
            "SELECT T1.ZJHM, T1.WPHM, T2.GD_JD, T2.GD_WD, T2.CJDBH, T2.CJDMC, T2.CJDDZ, "
            "T2.GEOHASH6 GD_GEOHASH6, T2.GEOHASH GD_GEOHASH, T2.CJSJ, "
            "UNIX_TIMESTAMP(T2.CJSJ) CJSJ_INT, T1.ZDRYBS SJDW, T1.LX_DM, T1.LX, "
            "T2.XZQH_DM, T1.ZDRYBS, T1.ZDRYXM "
            "FROM TMP_LJD_SFZ_WP_DICT T1 "
            "JOIN TMP_KAFKA_WP T2 ON T1.WPHM = T2.WPHM AND T1.LX_DM = T2.LX_DM")
        print('sqlDF', type(sqlDF))
        sqlDF.show()

        # Append the joined batch to the staging table via JDBC.
        dtw = DataFrameWriter(sqlDF)
        dtw.jdbc(url=url, table='LJD_SFZ_RESULT_NEW', mode='append', properties=properties)

        # Keep only the latest record per ZJHM and merge it into the result table.
        conn = cx_Oracle.connect(user, pwd, host + ":1521/orcl")
        sql1 = "TRUNCATE TABLE LJD_SFZ_RESULT_LATEST"
        sql2 = '''insert into LJD_SFZ_RESULT_LATEST
                  select ZJHM, WPHM, GD_JD, GD_WD, CJDBH, CJDMC, CJDDZ, GD_GEOHASH6, GD_GEOHASH,
                         CJSJ, CJSJ_INT, SJDW, LX_DM, LX, XZQH_DM, ZDRYBS, ZDRYXM
                  from (select t.*,
                               row_number() over (partition by t.zjhm order by cjsj_int desc) rn
                        from LJD_SFZ_RESULT_NEW t) tt
                  where tt.rn = 1'''
        sql3 = '''merge into LJD_SFZ_RESULT t1
                  using LJD_SFZ_RESULT_LATEST t2
                  on (t1.ZJHM = t2.ZJHM)
                  when matched then update set
                      t1.WPHM = t2.WPHM, t1.GD_JD = t2.GD_JD, t1.GD_WD = t2.GD_WD,
                      t1.CJDBH = t2.CJDBH, t1.CJDMC = t2.CJDMC, t1.CJDDZ = t2.CJDDZ,
                      t1.GD_GEOHASH6 = t2.GD_GEOHASH6, t1.GD_GEOHASH = t2.GD_GEOHASH,
                      t1.CJSJ = t2.CJSJ, t1.CJSJ_INT = t2.CJSJ_INT, t1.SJDW = t2.SJDW,
                      t1.LX_DM = t2.LX_DM, t1.LX = t2.LX, t1.XZQH_DM = t2.XZQH_DM,
                      t1.ZDRYBS = t2.ZDRYBS, t1.ZDRYXM = t2.ZDRYXM
                  when not matched then insert values
                      (t2.ZJHM, t2.WPHM, t2.GD_JD, t2.GD_WD, t2.CJDBH, t2.CJDMC, t2.CJDDZ,
                       t2.GD_GEOHASH6, t2.GD_GEOHASH, t2.CJSJ, t2.CJSJ_INT, t2.SJDW,
                       t2.LX_DM, t2.LX, t2.XZQH_DM, t2.ZDRYBS, t2.ZDRYXM)'''
        sql4 = "TRUNCATE TABLE LJD_SFZ_RESULT_NEW"

        cursor = conn.cursor()
        cursor.execute(sql1)
        cursor.execute(sql2)
        cursor.execute(sql3)
        cursor.execute(sql4)
        conn.commit()
    except Exception as e:
        print('process() failed for this batch:', e)
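For context, `process(time, rdd)` has the signature expected by `DStream.foreachRDD`, so a function like the one above is typically attached to a Kafka direct stream. The wiring below is a hedged sketch rather than part of the original: the topic name, broker address, and batch interval are assumptions, and `pyspark.streaming.kafka` exists only up to Spark 2.4.

import json
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils  # removed in Spark 3.x

sc = SparkContext(appName="ljd_sfz_stream")
ssc = StreamingContext(sc, 10)                        # 10-second batches (assumed)

stream = KafkaUtils.createDirectStream(
    ssc,
    ["wp_location_topic"],                            # hypothetical topic
    {"metadata.broker.list": "kafka-host:9092"})      # hypothetical broker

# Each Kafka value is a JSON string; parse it so process() receives dicts,
# matching the json.dumps round-trip inside process().
stream.map(lambda kv: json.loads(kv[1])).foreachRDD(process)

ssc.start()
ssc.awaitTermination()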
marketing = client_final.agg(sum("MOM").alias("MOM"),
                             sum("SINGLE").alias("SINGLE"),
                             sum("PET_FRIENDLY").alias("PET_FRIENDLY"),
                             sum("COMPLETE_MYSTERY").alias("COMPLETE_MYSTERY"))

## Overlaps between segments
## MOM
ms = client_final.filter((col("MOM") == 1) & (col("SINGLE") == 1))
mp = client_final.filter((col("MOM") == 1) & (col("PET_FRIENDLY") == 1))
## SINGLE
sp = client_final.filter((col("SINGLE") == 1) & (col("PET_FRIENDLY") == 1))

marketing2 = (marketing
              .withColumn("MOM_SINGLE", lit(ms.count()))
              .withColumn("MOM_PET_FRIENDLY", lit(mp.count()))
              .withColumn("SINGLE_PET_FRIENDLY", lit(sp.count())))

# Remove the overlap counts from the single-segment totals.
marketing3 = (marketing2
              .withColumn("MOM", col("MOM") - col("MOM_PET_FRIENDLY"))
              .withColumn("SINGLE", col("SINGLE") - col("SINGLE_PET_FRIENDLY"))
              .withColumn("PET_FRIENDLY",
                          col("PET_FRIENDLY") - col("MOM_PET_FRIENDLY") - col("SINGLE_PET_FRIENDLY")))

print("ALL OK")

head_final.write.mode("overwrite").saveAsTable("orderdb.ORDER_HEADER")
det_final.write.mode("overwrite").saveAsTable("orderdb.ORDER_DETAIL")
client_final.write.mode("overwrite").saveAsTable("orderdb.CLIENT")

"""
##CONNECTION AND SAVE... PENDING
jdbcHostname = "orderservers.database.windows.net"
jdbcPort = "1433"
jdbcDatabase = "orderdb"
properties = {"user": "etlguest", "password": "Etltest_2020"}
url = "jdbc:sqlserver://{0}:{1};database={2}".format(jdbcHostname, jdbcPort, jdbcDatabase)

head_final1 = DataFrameWriter(head_final)
head_final1.jdbc(url=url, table="ORDER_HEADER", mode="overwrite", properties=properties)
det_final1 = DataFrameWriter(det_final)
det_final1.jdbc(url=url, table="ORDER_DETAIL", mode="overwrite", properties=properties)
client_final1 = DataFrameWriter(client_final)
client_final1.jdbc(url=url, table="CLIENT", mode="overwrite", properties=properties)
"""
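A hedged sketch of how the pending JDBC block above could be completed: it reuses the url and credentials from the commented-out code and adds an explicit SQL Server driver class, which is an assumption (the driver jar must be on the Spark classpath).

jdbcHostname = "orderservers.database.windows.net"
jdbcPort = "1433"
jdbcDatabase = "orderdb"
url = "jdbc:sqlserver://{0}:{1};database={2}".format(jdbcHostname, jdbcPort, jdbcDatabase)
properties = {
    "user": "etlguest",
    "password": "Etltest_2020",
    "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver",  # assumed driver class
}

# Equivalent to wrapping each frame in DataFrameWriter by hand.
head_final.write.jdbc(url=url, table="ORDER_HEADER", mode="overwrite", properties=properties)
det_final.write.jdbc(url=url, table="ORDER_DETAIL", mode="overwrite", properties=properties)
client_final.write.jdbc(url=url, table="CLIENT", mode="overwrite", properties=properties)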
# PySpark connecting to Hive and Oracle
from pyspark.sql.readwriter import DataFrameWriter, DataFrameReader

user = '******'
pwd = 'test'
url = 'jdbc:oracle:thin:@192.168.1.225:1521:ORCL'
#host = '192.168.1.225'
#url = 'jdbc:oracle:thin:@%s:1521:ORCL' % host
properties = {'user': user, 'password': pwd, 'driver': 'oracle.jdbc.driver.OracleDriver'}

# Write Oracle data back to Oracle
dtr = DataFrameReader(sqlContext)
sf_car_test = dtr.jdbc(url=url, table='sf_car_test1', properties=properties)
#sf_car_test = spark.read.jdbc(url=url, table='sf_car_test1', properties=properties)
print('sf_car_test', type(sf_car_test))
sf_car_test.show()

dtw = DataFrameWriter(sf_car_test)
dtw.jdbc(url=url, table='sf_car_test2', mode='overwrite', properties=properties)
#dtw.jdbc(url=url, table='sf_car_test2', mode='append', properties=properties)
#sf_car_test.write.jdbc(url=url, table='sf_car_test2', properties=properties)                              # write in append mode
#sf_car_test.write.mode(saveMode="overwrite").jdbc(url=url, table='sf_car_test2', properties=properties)   # write in overwrite mode

# Write the transformed table back to Oracle
sf_car_test.createOrReplaceTempView("sf_car")
sf_car = spark.sql("SELECT gmsfhm, hphm FROM sf_car")
print('sf_car', type(sf_car))
sf_car.show()

sf_car.write.jdbc(url=url, table='sf_car_test2', properties=properties)
dtw = DataFrameWriter(sf_car)
dtw.jdbc(url=url, table='sf_car_test4', mode='overwrite', properties=properties)
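The header comment mentions Hive as well as Oracle, but the block above only touches Oracle. Below is a minimal sketch of the Hive side, assuming a SparkSession built with Hive support and a writable database named `demo_db` (both assumptions, not from the original).

from pyspark.sql import SparkSession

# enableHiveSupport() requires a Spark build with Hive and a reachable metastore.
spark = SparkSession.builder.appName("hive_oracle_demo").enableHiveSupport().getOrCreate()

# Persist the Oracle-sourced frame as a Hive table (database name is assumed).
sf_car_test.write.mode("overwrite").saveAsTable("demo_db.sf_car_test_hive")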