Example #1
    def write(self):
        """
        Interface for saving the content of the :class:`DataFrame` out into external storage.

        :return: :class:`DataFrameWriter`
        """
        return DataFrameWriter(self)
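
The write property above simply wraps the DataFrame in a DataFrameWriter, so every save goes through that object. A minimal usage sketch follows; the spark session variable, the sample data, and the output path /tmp/example_out are assumptions, not part of the original example:

# Sketch: persist a DataFrame through the DataFrameWriter returned by df.write.
# `spark` is an assumed SparkSession and the output path is a placeholder.
df = spark.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'value'])
df.write.mode('overwrite').parquet('/tmp/example_out')                    # format-specific shortcut
df.write.save('/tmp/example_out', format='parquet', mode='overwrite')     # equivalent generic save()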
Example #2
    def update_relation(self):
        # update the relations between the various account types
        try:
            # build the HDFS path where the relation data will be saved
            path = self.spark.hdfs_base + 'relation'

            relation_acc = self.spark.load_from_db2('T_POINT_RELATION_ACC')
            relation_cust = self.spark.load_from_db2('T_POINT_RELATION_CUST')
            relation_join = relation_acc.join(relation_cust, 'CUST_ID', 'outer')

            writer = DataFrameWriter(relation_join)
            writer.save(path, mode='overwrite')
            Global.logger.info('Account relation DataFrame updated successfully!')
            return 'success!'
        except Exception as e:
            Global.logger.error('Failed to update account relation DataFrame: %s' % e)
            return 'failure! please run again.'
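
A quick sanity check on what update_relation() wrote is to read the saved output back from the same location. A small sketch; spark_session stands in (as an assumption) for a plain SparkSession, and path is the same HDFS path the method built above:

# Sketch only: load what DataFrameWriter.save() wrote and inspect a few rows.
relation_back = spark_session.read.load(path)   # default source, typically parquet
relation_back.select('CUST_ID').show(5)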
Example #3
    def write(self):
        """
        Interface for saving the content of the :class:`DataFrame` out
        into external storage.

        :return: :class:`DataFrameWriter`

        .. note:: Experimental

        >>> df.write
        <pyspark.sql.readwriter.DataFrameWriter object at ...>
        """
        return DataFrameWriter(self)
Example #4
def process(time, rdd):
    print("========= %s =========" % str(time))
    os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.AL32UTF8'
    try:
        spark = SparkSession.builder.config(conf=rdd.context.getConf()).getOrCreate()
        rowRdd = rdd.map(lambda w: json.dumps(w))
        wplocation = spark.read.json(rowRdd)
        print('wplocation',type(wplocation),wplocation.dtypes)
        #wplocation.show()
        wplocation.createOrReplaceTempView("tmp_kafka_wp")
        #print('tmp_kafka_wp',type(tmp_kafka_wp))
        #sql_kafka_wp = spark.sql("SELECT * FROM tmp_kafka_wp")
        #print('sql_kafka_wp',type(sql_kafka_wp))
        
        # join the incoming Kafka batch with the TMP_LJD_SFZ_WP_DICT lookup view on WPHM and LX_DM
        sqlDF = spark.sql("""
            SELECT T1.ZJHM, T1.WPHM, T2.GD_JD, T2.GD_WD, T2.CJDBH, T2.CJDMC, T2.CJDDZ,
                   T2.GEOHASH6 GD_GEOHASH6, T2.GEOHASH GD_GEOHASH, T2.CJSJ,
                   UNIX_TIMESTAMP(T2.CJSJ) CJSJ_INT, T1.ZDRYBS SJDW, T1.LX_DM, T1.LX,
                   T2.XZQH_DM, T1.ZDRYBS, T1.ZDRYXM
            FROM TMP_LJD_SFZ_WP_DICT T1
            JOIN TMP_KAFKA_WP T2
              ON T1.WPHM = T2.WPHM AND T1.LX_DM = T2.LX_DM""")
        #sqlDF = spark.sql("SELECT t1.ZJHM ,t1.WPHM  FROM tmp_ljd_sfz_wp_dict t1 join tmp_kafka_wp t2 on t1.WPHM=t2.WPHM and t1.lx_dm=t2.lx_dm")
        
        print('sqlDF',type(sqlDF))
        sqlDF.show()
        dtw = DataFrameWriter(sqlDF)
        print(10000)
        dtw.jdbc(url=url, table='LJD_SFZ_RESULT_NEW', mode='append', properties=properties)
        print(11111)
        # post-process in Oracle: rebuild the latest-record-per-ZJHM table, then merge it into the main result table
        conn = cx_Oracle.connect(user, pwd, host + ":1521/orcl")
        sql1 = "TRUNCATE TABLE LJD_SFZ_RESULT_LATEST "
        sql2 = '''insert into LJD_SFZ_RESULT_LATEST
        select ZJHM,WPHM,GD_JD,GD_WD,CJDBH,CJDMC,CJDDZ,GD_GEOHASH6,GD_GEOHASH,
        CJSJ,CJSJ_INT,SJDW,LX_DM,LX,XZQH_DM,ZDRYBS,ZDRYXM
        from(
        select t.*,row_number() over (partition by t.zjhm order by cjsj_int desc) rn
        from LJD_SFZ_RESULT_NEW t
        )tt 
        where tt.rn=1'''
        
        sql3='''merge into LJD_SFZ_RESULT t1 using LJD_SFZ_RESULT_LATEST t2 on (t1.ZJHM=t2.ZJHM)
        when matched then
        update set 
        t1.WPHM=t2.WPHM,
        t1.GD_JD=t2.GD_JD,
        t1.GD_WD=t2.GD_WD,
        t1.CJDBH=t2.CJDBH,
        t1.CJDMC=t2.CJDMC,
        t1.CJDDZ=t2.CJDDZ,
        t1.GD_GEOHASH6=t2.GD_GEOHASH6,
        t1.GD_GEOHASH=t2.GD_GEOHASH,
        t1.CJSJ=t2.CJSJ,
        t1.CJSJ_INT=t2.CJSJ_INT,
        t1.SJDW=t2.SJDW,
        t1.LX_DM=t2.LX_DM,
        t1.LX=t2.LX,
        t1.XZQH_DM=t2.XZQH_DM,
        t1.ZDRYBS=t2.ZDRYBS,
        t1.ZDRYXM=t2.ZDRYXM
        when not matched then
        insert values(t2.ZJHM,t2.WPHM,t2.GD_JD,t2.GD_WD,t2.CJDBH,t2.CJDMC,t2.CJDDZ,t2.GD_GEOHASH6,t2.GD_GEOHASH,
        t2.CJSJ,t2.CJSJ_INT,t2.SJDW,t2.LX_DM,t2.LX,t2.XZQH_DM,t2.ZDRYBS,t2.ZDRYXM)'''
        sql4="TRUNCATE TABLE LJD_SFZ_RESULT_NEW "
        print(22222)
        conn.cursor().execute(sql1)
        conn.cursor().execute(sql2)
        conn.cursor().execute(sql3)
        conn.cursor().execute(sql4)
        #conn.commit()  # not strictly needed: the final TRUNCATE (DDL) implicitly commits the preceding INSERT/MERGE in Oracle

        #conn = cx_Oracle.connect(user,pwd,host+":1521/orcl")
        #
        #sqlDF = spark.sql("SELECT t1.zjhm,t1.wphm,t2.gd_jd,t2.gd_wd,t2.cjdbh,t2.cjdmc,t2.cjddz,t2.geohash6,t2.geohash,t2.cjsj,unix_timestamp(t2.cjsj) cjsj_int,t1.lx_dm,t1.lx FROM tmp_ljd_sfz_wp_dict t1 join tmp_kafka_wp t2 on t1.WPHM=t2.WPHM and t1.lx_dm=t2.lx_dm")
        #wpinfo = sqlDF.rdd.map(lambda p: "'"+p.zjhm+"','"+p.wphm+"',"+str(p.gd_jd)+","+str(p.gd_wd)+",'"+p.cjdbh+"','"+p.cjdmc+"','"+p.cjddz+"','"+p.geohash6+"','"+p.geohash+"','"+p.cjsj+"',"+str(p.cjsj_int)+",'1','"+p.lx_dm+"','"+p.lx+"'").collect()
        #for i in wpinfo:
        #    conn.cursor().execute("insert into " + t_write + "(zjhm,wphm,gd_jd,gd_wd,cjdbh,cjdmc,cjddz,gd_geohash6,gd_geohash,cjsj,cjsj_int,Sjdw,Lx_dm,lx) VALUES (" + i  +")")
        #conn.commit()
        print(33333)

    except Exception as e:
        # swallow the error so the streaming job keeps running, but report what went wrong
        print('process failed:', e)
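
The process(time, rdd) signature is exactly what DStream.foreachRDD expects, so the function above is presumably driven by a streaming context. A hedged sketch of that wiring; the SparkContext sc, the batch interval, the topic name and the broker address are all assumptions, and the legacy pyspark.streaming.kafka API is only a guess at how the records arrive:

import json
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

ssc = StreamingContext(sc, 10)   # 10-second batches; `sc` is an assumed SparkContext
stream = KafkaUtils.createDirectStream(
    ssc, ['wp_topic'], {'metadata.broker.list': 'broker1:9092'})   # placeholder topic and brokers
# process() calls json.dumps() on each element, so hand it dict records decoded from the Kafka message value
stream.map(lambda kv: json.loads(kv[1])).foreachRDD(process)
ssc.start()
ssc.awaitTermination()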
Example #5
File: etl.py  Project: init20/ETL_TEST
marketing = client_final.agg(
    sum("MOM").alias("MOM"),
    sum("SINGLE").alias("SINGLE"),
    sum("PET_FRIENDLY").alias("PET_FRIENDLY"),
    sum("COMPLETE_MYSTERY").alias("COMPLETE_MYSTERY"))
##MOM
ms = client_final.filter((col("MOM")==1) & (col("SINGLE")==1))
mp = client_final.filter((col("MOM")==1) & (col("PET_FRIENDLY")==1))
##SINGLE
sp = client_final.filter((col("SINGLE")==1) & (col("PET_FRIENDLY")==1))
marketing2 = (marketing
              .withColumn("MOM_SINGLE", lit(ms.count()))
              .withColumn("MOM_PET_FRIENDLY", lit(mp.count()))
              .withColumn("SINGLE_PET_FRIENDLY", lit(sp.count())))
marketing3 = (marketing2
              .withColumn("MOM", col("MOM") - col("MOM_PET_FRIENDLY"))
              .withColumn("SINGLE", col("SINGLE") - col("SINGLE_PET_FRIENDLY"))
              .withColumn("PET_FRIENDLY", col("PET_FRIENDLY") - col("MOM_PET_FRIENDLY") - col("SINGLE_PET_FRIENDLY")))

print("ALL OK")
head_final.write.mode("overwrite").saveAsTable("orderdb.ORDER_HEADER")
det_final.write.mode("overwrite").saveAsTable("orderdb.ORDER_DETAIL")
client_final.write.mode("overwrite").saveAsTable("orderdb.CLIENT")
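
Once the three saveAsTable calls above have run, the tables can be read straight back from the metastore as a quick verification; a short sketch, assuming an active SparkSession named spark:

# Sketch: confirm the Hive tables written with saveAsTable above.
clients = spark.table("orderdb.CLIENT")
print("CLIENT rows:", clients.count())
clients.show(5)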

"""
##CONNECTION AND SAVE... PENDING
jdbcHostname = "orderservers.database.windows.net"
jdbcPort = "1433"
jdbcDatabase = "orderdb"
properties = {"user" : "etlguest","password" : "Etltest_2020" }

url = "jdbc:sqlserver://{0}:{1};database={2}".format(jdbcHostname,jdbcPort,jdbcDatabase)
head_final1 = DataFrameWriter(head_final)
head_final1.jdbc(url=url, table= "ORDER_HEADER", mode ="overwrite", properties = properties)

det_final1 = DataFrameWriter(det_final)
det_final1.jdbc(url=url, table= "ORDER_DETAIL", mode ="overwrite", properties = properties)

client_final1 = DataFrameWriter(client_final)
client_final1.jdbc(url=url, table= "CLIENT", mode ="overwrite", properties = properties)
Example #6
# PySpark connecting to Hive and Oracle
from pyspark.sql.readwriter import DataFrameWriter,DataFrameReader

user = '******'
pwd = 'test'
url = 'jdbc:oracle:thin:@192.168.1.225:1521:ORCL'
#host = '192.168.1.225'
#url = 'jdbc:oracle:thin:@%s:1521:ORCL' % host
properties = {'user': user, 'password': pwd, 'driver': 'oracle.jdbc.driver.OracleDriver'}
# write data read from Oracle back into Oracle
dtr = DataFrameReader(sqlContext)
sf_car_test = dtr.jdbc(url=url, table='sf_car_test1', properties=properties)
#sf_car_test = spark.read.jdbc(url=url, table='sf_car_test1', properties=properties)
print('sf_car_test',type(sf_car_test))
sf_car_test.show()
dtw = DataFrameWriter(sf_car_test)
dtw.jdbc(url=url, table='sf_car_test2', mode='overwrite', properties=properties)
#dtw.jdbc(url=url, table='sf_car_test2', mode='append', properties=properties)
#sf_car_test.write.jdbc(url=url, table='sf_car_test2', properties=properties)  # write in append mode
#sf_car_test.write.mode(saveMode="overwrite").jdbc(url=url, table='sf_car_test2', properties=properties)  # write in overwrite mode


# write the transformed table back to Oracle
sf_car_test.createOrReplaceTempView("sf_car")
sf_car = spark.sql("SELECT gmsfhm,hphm FROM sf_car ")
print('sf_car',type(sf_car))
sf_car.show()
sf_car.write.jdbc(url=url, table='sf_car_test2', properties=properties)

dtw = DataFrameWriter(sf_car)
dtw.jdbc(url=url, table='sf_car_test4', mode='overwrite', properties=properties)
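
Reading one of the freshly written tables back over the same JDBC connection is a simple round-trip check; this short sketch only reuses the dtr, url and properties objects already defined in the example:

# Sketch: read back the table written by dtw.jdbc() and spot-check it.
check = dtr.jdbc(url=url, table='sf_car_test4', properties=properties)
print('sf_car_test4 rows:', check.count())
check.show(5)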