Example #1
def getUserInfo(user_id_list):
    if len(user_id_list) > 0:
        user_id_str = ",".join(user_id_list)
        sql = "select hn.hnuserid,hn.linkman,hnaccount.useraccount," \
          "hnaccount.mobile,hn.risklevel,hn.createtime,hnaccount.lastlogontime," \
          "hnblack.status from risk_business_source.hnuser hn " \
          "left join risk_business_source.hnuseraccount hnaccount on hn.hnuserid = hnaccount.hnuserid " \
          "left join risk_business_source.hn_user_black hnblack on hn.hnuserid = hnblack.hn_user_id " \
          "where hn.hnuserid in(" + user_id_str + ")"
        sqlDF = spark.sql(sql)
        rows = sqlDF.collect()
        rows_count = len(rows)
        result = []
        cols = [
            'User ID', 'Username', 'User account', 'Mobile number', 'Risk level',
            'Registration time', 'Last login time', 'On blacklist'
        ]
        for i in range(rows_count):
            r = []
            r.append(rows[i]['hnuserid'])
            r.append(rows[i]['linkman'])
            r.append(rows[i]['useraccount'])
            r.append(rows[i]['mobile'])
            r.append(rows[i]['risklevel'])
            r.append(rows[i]['createtime'])
            r.append(rows[i]['lastlogontime'])
            r.append(rows[i]['status'])
            result.append(r)
        return cols, result
    else:
        print("the user list is is empty")
Example #2
def use_case_1(which_sql):
    """
    Motor-vehicle service transaction statistics.

    Count records grouped by time (year, month), brigade and business type.
    :param which_sql:
    :return:
    """
    st = datetime.now()

    # from pyspark import SparkConf
    from pyspark import SparkContext
    from pyspark import SQLContext
    st_1 = datetime.now()
    sql = get_sql(which_sql)
    st_2 = datetime.now()

    print("Got sql: {sql}".format(sql=sql))

    # conf = SparkConf().setAppName("testing").setMaster("local[2]")
    sc = SparkContext.getOrCreate()

    sqlcontext = SQLContext(sc)
    st_3 = datetime.now()

    # To use SQL, a table or view must be created first

    # Load vio_violation_data

    # df_vio.cre('vio_violation_temp_view_001')  # Table or view doesn't exist -- apparently views also have a lifetime, access permissions and the like?
    # df_vio.registerTempTable() # ???

    # df_vio = SQLContext.table("vio_violation_temp_view_001")

    # Load
    # File path must be given
    df_veh_flow = sqlcontext.read.load(
        '/srv/BigData/dbdata_service/ffk/veh_flow_to_190805_allf')
    st_4 = datetime.now()

    df_veh_flow.createOrReplaceTempView(
        'veh_flow_temp_view_002')  # Create a view
    st_5 = datetime.now()

    # execute sql
    from pyspark.shell import spark
    df_veh_flow_key1 = spark.sql(sql)
    st_6 = datetime.now()

    df_veh_flow_key1.show()
    st_7 = datetime.now()
    print('Timing summary:\nLoad sql: {0} '
          '\nGet sqlcontext: {1}'
          '\nLoad data: {2}'
          '\nCreate table or view: {3}'
          '\nExecute sql: {4}'
          '\nShow result: {5}'.format(st_2 - st_1, st_3 - st_2, st_4 - st_3,
                                      st_5 - st_4, st_6 - st_5, st_7 - st_6))
    keep_task_run_time(start_time=st)
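The function above mixes SparkContext, SQLContext and the `spark` object pulled out of pyspark.shell (a shell-only convenience). A minimal sketch of the same load/view/query flow with a single SparkSession (Spark 2+), assuming the same parquet path and the get_sql helper used above:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("use_case_1").getOrCreate()
sql = get_sql(which_sql)                      # same helper and argument as above
df_veh_flow = spark.read.load(
    '/srv/BigData/dbdata_service/ffk/veh_flow_to_190805_allf')  # parquet by default
df_veh_flow.createOrReplaceTempView('veh_flow_temp_view_002')
spark.sql(sql).show()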
Example #3
def main():
    """
    :return: Place and magnitude, where magnitude is greater than 1.0.
    """
    df = sqlContext.read.json(load_json_data(url))
    df.createOrReplaceTempView('earthquakes')
    earthquakes_df = spark.sql("SELECT properties.mag, properties.place "
                               "FROM earthquakes "
                               "WHERE properties.mag > 1.0")
    earthquakes_df.show()
Example #4
def save_data(rdd):
    """
    Parse the JSON value in each RDD,
    create a Spark SQL DataFrame from the RDD and
    write the DataFrame to HDFS and Oracle DB.
    """
    global flag
    flag = False
    if not rdd.isEmpty():
        rdd = rdd.map(lambda m: parse(m[1]))
        df = sqlContext.createDataFrame(rdd)
        df.createOrReplaceTempView("t")
        result = spark.sql(
            '''select event_id, event_type from (select row_number() over (partition by _1 order by _2) as RN,
			_1 as event_id,_2 as event_type from t)
			   where RN = 1''')

        count = result.count()

        try:
            # Writing to HDFS
            result.write \
                .format("csv") \
                .mode("append") \
                .option("header", "true") \
                .save(HDFS_OUTPUT_PATH)

            # Writing to Oracle DB
            result.write \
                .format("jdbc") \
                .mode("append") \
                .option("driver", DRIVER) \
                .option("url", URL_TARGET_DB) \
                .option("dbtable", TARGET_DB_TABLE_NAME) \
                .option("user", TARGET_DB_USER_NAME) \
                .option("password", TARGET_DB_USER_PASSWORD) \
                .save()

            write_log('INFO', 'Consumer_dim_event_type.py', 'main',
                      '{} rows inserted successfully'.format(count))

        except Exception as e:
            print('--> It seems an Error occurred: {}'.format(e))
            write_log('ERROR', 'Consumer_dim_event_type.py', 'main',
                      str(e)[:1000])
            flag = True
    else:
        ssc.stop()
    return rdd
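The row_number() query keeps one row per event_id. A minimal sketch of the same dedup with a Window in the DataFrame API, assuming the same `df` with columns _1 (event_id) and _2 (event_type):

from pyspark.sql import functions as F
from pyspark.sql.window import Window

w = Window.partitionBy("_1").orderBy("_2")
deduped = (df.withColumn("RN", F.row_number().over(w))
             .where(F.col("RN") == 1)
             .select(F.col("_1").alias("event_id"),
                     F.col("_2").alias("event_type")))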
Example #5
def save_data(rdd):
    """ Function for saving data in window """
    global NAMES
    if not rdd.isEmpty():
        # parsing data in RDD
        rdd = rdd \
            .map(lambda x: parser.parse(x[1])) \
            .map(lambda data: collect(data)) \
            .reduceByKey(lambda rec1, rec2: max(rec1, rec2, key=last_record))
        NAMES = dict(rdd.collect())
        print(
            "************************************> NAMES <************************************"
        )
        print(NAMES)
        print(
            "************************************> NAMES <************************************"
        )
        rdd = rdd \
            .map(lambda rec: (rec[0], rec[1][0], rec[1][1], rec[1][2]))
        # create DataFrame and View
        df = sqlContext.createDataFrame(rdd)
        df.createOrReplaceTempView("t")
        # query for getting result
        res = spark.sql(
            'select t._1 as NAME, t._2 as COUNT_NAME, t._3 as AVG_TRAFFIC, t._4 as AVG_SUCCESS_SELL from t'
        )
        res.show(40)
        # res.printSchema()
        # res = spark.sql('select count(*) KEY, sum(t._2) VALUE from t')
        res \
            .write \
            .format("jdbc") \
            .mode("overwrite") \
            .option("driver", 'oracle.jdbc.OracleDriver') \
            .option("url", "jdbc:oracle:thin:@{0}:{1}:orcl".format(IP_DB, PORT_DB)) \
            .option("dbtable", "tmp_kafka") \
            .option("user", "kozyar") \
            .option("password", "usertest") \
            .save()
        # spark.catalog.dropTempView("t")
    return rdd
Example #6
def use_case_2(which_sql):
    """

    :return:
    """
    # todo
    st = datetime.now()

    # from pyspark import SparkConf
    from pyspark import SparkContext
    from pyspark import SQLContext
    st_1 = datetime.now()
    sql = get_sql(which_sql)
    st_2 = datetime.now()

    print("Got sql: {sql}".format(sql=sql))

    # conf = SparkConf().setAppName("testing").setMaster("local[2]")
    sc = SparkContext.getOrCreate()

    sqlcontext = SQLContext(sc)
    st_3 = datetime.now()

    df_veh = sqlcontext.read.load('/srv/BigData/dbdata_service/ffk/vehicle_to_190805_allf')
    df_vio_op_check = sqlcontext.read.load('/srv/BigData/dbdata_service/ffk/vio_operate_check_to_190805_allf_parquet')

    # Note: the commented-out loads below read CSV, not Parquet
    # df_veh = sqlcontext.read.load('/srv/BigData/dbdata_service/ffk/vehicle_to190804_allf', format="csv", sep=",", inferSchema="true", header="true")
    # df_vio_op_check = sqlcontext.read.load('/srv/BigData/dbdata_service/ffk/vio_operate_check_to_190805_allf', format="csv", sep=",", inferSchema="true", header="true")

    st_4 = datetime.now()

    df_veh.createOrReplaceTempView('veh_temp_view_002')  # Create a view
    df_vio_op_check.createOrReplaceTempView('vio_operate_check_view_002')  # Create a view

    # df_veh.createGlobalTempView('veh_temp_view_002')
    # df_vio_op_check.createGlobalTempView('vio_operate_check_view_002')
    st_5 = datetime.now()

    # execute sql
    from pyspark.shell import spark
    df_veh_flow_key1 = spark.sql(sql)
    st_6 = datetime.now()

    df_veh_flow_key1.show()

    # Save the DataFrame result to a CSV file and return it to the client
    # new_rdd = df_veh_flow_key1.rdd.map(lambda x: (x[0], x))
    # dict = new_rdd.collectAsMap()

    # def print_rows(row):
    #     data = json.loads(row)
    #     for key in data:
    #         print("{key}:{value}".format(key=key, value=data[key]))
    # results = df_veh_flow_key1.toJSON()
    # results.foreach(print_rows)

    print('turn to json')
    results = df_veh_flow_key1.toJSON().map(lambda j: json.loads(j)).collect()
    print('got json', results)
    for i in results:
        print(i)
        # print(i["c1"], i["c6"])
    # data = df.toPandas().to_csv('mycsv.csv')

    # Spark 1.3
    # df_veh_flow_key1.save('mycsv.csv', 'com.databricks.spark.csv')
    # Spark 1.4+
    # df_veh_flow_key1.write.format('com.databricks.spark.csv').save('mycsv.csv')
    # Spark 2.0+
    # df_veh_flow_key1.write.csv('mycsv.csv')
    # pyspark.sql.utils.AnalysisException: u'path hdfs://hacluster/user/TestUser001/mycsv.csv already exists.;'
    # Send result to http

    import requests
    resp = requests.post('http://10.57.98.251:8008/some/url/404/', json=results)
    print(resp)
    st_7 = datetime.now()
    print('Timing summary:\nLoad sql: {0} '
          '\nGet sqlcontext: {1}'
          '\nLoad data: {2}'
          '\nCreate table or view: {3}'
          '\nExecute sql: {4}'
          '\nShow result: {5}'.format(st_2 - st_1, st_3 - st_2, st_4 - st_3, st_5 - st_4, st_6 - st_5, st_7 - st_6))
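The commented-out CSV saves above fail with an AnalysisException once the output path exists. A minimal sketch for Spark 2+ that sets an explicit save mode (the path is only an illustration, as in the comments):

df_veh_flow_key1.write \
    .mode("overwrite") \
    .option("header", "true") \
    .csv('mycsv.csv')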
Example #7
from pyspark.shell import spark
from pyspark.sql import Row

sc = spark.sparkContext

# Load a text file and convert each line to a Row.
lines = sc.textFile("examples/src/main/resources/people.txt")
print('*****************')
print(type(lines))
parts = lines.map(lambda l: l.split(","))
people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))
print(type(people))
print(people)
# Infer the schema, and register the DataFrame as a table.
schemaPeople = spark.createDataFrame(people)
schemaPeople.createOrReplaceTempView("people")

# SQL can be run over DataFrames that have been registered as a table.
teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")

# The results of SQL queries are DataFrame objects.
# rdd returns the content as an :class:`pyspark.RDD` of :class:`Row`.
teenNames = teenagers.rdd.map(lambda p: "Name: " + p.name).collect()
for name in teenNames:
    print(name)
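Example #7 lets Spark infer the schema from the Row objects. A minimal sketch of the same people.txt pipeline with an explicit StructType instead of inference, assuming the same `spark`, the same `parts` RDD and the same query:

from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
])
people_df = spark.createDataFrame(parts.map(lambda p: (p[0], int(p[1]))), schema)
people_df.createOrReplaceTempView("people")
spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19").show()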
Example #8
flightData2015.sort("count").explain()

# Play with partition configuration
spark.conf.set("spark.sql.shuffle.partitions", "5")
print(flightData2015.sort("count").take(2), "\n")

# DataFrame.explain() shows the execution plan
flightData2015.sort("count").explain()

# Register a DF into a table
flightData2015.createOrReplaceTempView("flight_data_2015")

# Spark SQL way: a new DF is created by using Spark SQL
sqlWay = spark.sql("""
SELECT DEST_COUNTRY_NAME, count(1)
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
""")

# Spark DF way: a new DF dataFrameWay is created with transformation
dataFrameWay = flightData2015\
.groupBy("DEST_COUNTRY_NAME")\
.count()

# Both explain() calls show the same transformation plan
sqlWay.explain()
dataFrameWay.explain()

# Spark DF way: find the max value of column 'count'
from pyspark.sql.functions import max  # the Spark SQL max, not Python's builtin
print(flightData2015.select(max("count")).take(1))
print('\n')
Example #9
def save_data(rdd):
    """
    Parse the JSON value in each RDD,
    create a Spark SQL DataFrame from the RDD and
    write the DataFrame to HDFS and Oracle DB.
    """
    global flag
    flag = False
    if not rdd.isEmpty():
        # Create df for duplicate handling
        df_max_id = spark.read \
            .format("jdbc") \
            .option("driver", DRIVER) \
            .option("url", URL_TARGET_DB) \
            .option("dbtable", TARGET_DB_TABLE_NAME) \
            .option("user", TARGET_DB_USER_NAME) \
            .option("password", TARGET_DB_USER_PASSWORD) \
            .load()

        max_id = df_max_id.agg({'product_id': 'max'}).collect()[0][0]
        if max_id is None:
            max_id = 0

        rdd = rdd.map(lambda m: parse(m[1]))
        df = sqlContext.createDataFrame(rdd)
        df.createOrReplaceTempView("t")
        result = spark.sql(
            '''select product_id, category_id, brand, description, name, price, last_update_date
                from (select row_number() over (partition by _1 order by _7) as RN,_1 as product_id,_2 as category_id,
                _3 as brand,_4 as description,_5 as name,_6 as price,to_timestamp(_7) as last_update_date
                 from t where _1 > ''' + str(max_id) + ''')
            where RN = 1''')

        count = result.count()

        try:
            # Writing to HDFS
            result.write \
                .format("csv") \
                .mode("append") \
                .option("header", "true") \
                .save(HDFS_OUTPUT_PATH)

            # Writing to Oracle DB
            result.write \
                .format("jdbc") \
                .mode("append") \
                .option("driver", DRIVER) \
                .option("url", URL_TARGET_DB) \
                .option("dbtable", TARGET_DB_TABLE_NAME) \
                .option("user", TARGET_DB_USER_NAME) \
                .option("password", TARGET_DB_USER_PASSWORD) \
                .save()

            write_log('INFO', 'Consumer_dim_products.py', 'main', '{} rows inserted successfully'.format(count))

        except Exception as e:
            print('--> It seems an Error occurred: {}'.format(e))
            write_log('ERROR', 'Consumer_dim_products.py', 'main', str(e)[:1000])
            flag = True
    else:
        ssc.stop()
    return rdd
Example #10
    "sep": args.file2_sep,
    "quote": args.file2_quote,
    "escape": args.file2_escape,
    "inferSchema": "false",
    "header": args.file2_header,
    "schema": args.file2_schema,
}
print(f"File 2 dataframe parameters:\n{json.dumps(file2_df_params, indent=2)}")
file2_df = spark.read.csv(**file2_df_params)
file2_df.createOrReplaceTempView("file2")

sql = f"""
SELECT
    {args.output_columns}
FROM file1
JOIN file2 ON file1.{args.file1_join_column} = file2.{args.file2_join_column}
"""
print(f"SQL query:\n{sql}")
output = spark.sql(sql)

output.show(50)

if args.output_path:
    output_params = {
        "sep": args.output_sep,
        "header": args.output_header,
        "mode": args.output_mode,
        "path": args.output_path,
    }
    output.write.csv(**output_params)
Example #11
                      numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("query", "StartTime", "RunTime", "features")

numOfClusters = 5
kmeans = KMeans(k=numOfClusters, seed=1)
model = kmeans.fit(rescaledData.select('features'))

transformed = model.transform(rescaledData)
# transformed.show()

transformed.createOrReplaceTempView("df")
timesDf = spark.sql("SELECT StartTime,cast(RunTime as int),prediction FROM df")
sumDf = spark.sql(
    "SELECT StartTime,prediction,count(prediction) FROM df group by StartTime,prediction"
)

# PIVOT
pivotDf1 = sumDf.groupby('StartTime') \
    .pivot('prediction') \
    .max('count(prediction)')
pivotDf1 = pivotDf1.fillna(0)
pivotDf1.createOrReplaceTempView('pivotDf')
pivotDf1.show()
# PIVOT
pivotDf = timesDf.groupby('StartTime') \
    .pivot('prediction') \
    .sum('RunTime')
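The two pivots above turn one row per (StartTime, prediction) into one column per cluster id. A tiny self-contained sketch of that reshaping, assuming a SparkSession named `spark` and made-up toy values:

toy = spark.createDataFrame(
    [("08:00", 0, 3), ("08:00", 1, 5), ("09:00", 0, 2)],
    ["StartTime", "prediction", "cnt"])
# One column per prediction value, missing combinations filled with 0
toy.groupby("StartTime").pivot("prediction").max("cnt").fillna(0).show()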
Example #12
  .withColumn("use", lit("usual")) \
  .withColumn("type", coding_lr) \
  .withColumn("value", col("PatientID")) \
  .withColumn("assigner", struct(col("PatientIDTypeDSC").alias("display"))) \
  .select("EDWPatientID", "use", "type", "value", "assigner")

identifier = id_edwpatient.unionAll(id_mrn).unionAll(id_patient) \
  .groupBy("EDWPatientID").agg(collect_list(struct("use", "type", "value", "assigner")).alias("identifier"))

# COMMAND ----------

# build patient
from pyspark.sql.functions import coalesce, col, concat, expr, lit, split, to_date, when

# work around an oddity with Spark 3 date parsing
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

patient = patient_raw \
  .join(address, ["EDWPatientID"]) \
  .join(telecom, ["EDWPatientID"]) \
  .join(identifier, ["EDWPatientID"]) \
  .withColumn("name", struct( \
    col("PatientFullNM").alias("text"), \
    split(col("PatientFullNM"), ", ")[0].alias("family"), \
    split(split(col("PatientFullNM"), ", ")[1], " ").alias("given"))) \
  .withColumn("resourceType", lit("Patient")) \
  .withColumnRenamed("EDWPatientID", "id") \
  .withColumn("active", lit("true")) \
  .withColumn("gender", when(col("GenderCD") == 1, "female").when(col("GenderCD") == 2, "male").otherwise("unknown")) \
  .withColumn("birthDate", to_date(col("BirthDTS"), "yyyy-MM-dd")) \
  .withColumn("deceasedBoolean", expr("DeathDTS IS NOT NULL")) \
Example #13
def getSourceData(time_param):
    # SQL statement to query the data
    sql = "select tt.hnuserid, tt.totalscore, rf2.name_deceive,rf2.user_new,rf2.mobile_virtual," \
          "rf2.app_correlation, rf2.card_different, rf2.supply_different, rf2.supply_risk," \
          "rf2.mobile_risk_area,rf2.supply_risk_area,rf2.mobile_supply,rf2.percard,rf2.rvicard,rf2.devNums," \
          "rf2.registerlen,rf2.mActives,rf2.dwActives,rf2.wActives,tt.isblack " \
          "from( select t.hnuserid,t.totalscore,case when hn.status is null then '0' else hn.status end as isblack " \
          "from ( select hnuserid, totalscore from risk_user_portrait.risk_up_featuresScore_total " \
          "where day='" + time_param + "'" \
          ")t " \
          "left outer join risk_business_source.hn_user_black hn on t.hnuserid = hn.hn_user_id " \
          "left outer join risk_business_source.hnuser hu on t.hnuserid = hu.hnuserid " \
          ") tt join( select name_deceive, user_new, mobile_virtual, app_correlation, card_different, " \
          "supply_different, supply_risk, mobile_risk_area, supply_risk_area, mobile_supply, hnuserid " \
          "from risk_user_portrait.risk_up_featuresscore_more " \
          "where day='" + time_param + "'" \
          ") rf on tt.hnuserid = rf.hnuserid " \
          "left join(select hnuserid," \
          "name_deceive, user_new, mobile_virtual, app_correlation, card_different, " \
          "supply_different, supply_risk, mobile_risk_area, supply_risk_area, mobile_supply" \
          "percard,rvicard,devNums,registerlen,mActives,dwActives,wActives " \
          "from risk_user_portrait.risk_up_featuresscore_more2 " \
          "where day='" + time_param + "'" \
          ")rf2 on tt.hnuserid = rf2.hnuserid"

    # The results of SQL queries are themselves DataFrames and support all normal functions.

    # execute the sql statement
    sqlDF = spark.sql(sql)
    #get the hive data
    rows = sqlDF.collect()
    rows_count = len(rows)

    #data conversion
    data = np.zeros((rows_count, 15))
    for i in range(rows_count):
        data[i][0] = rows[i]['hnuserid']
        data[i][1] = rows[i]['totalscore']
        data[i][2] = rows[i]['name_deceive']
        data[i][3] = rows[i]['user_new']
        data[i][4] = rows[i]['mobile_virtual']
        data[i][5] = rows[i]['app_correlation']
        data[i][6] = rows[i]['card_different']
        data[i][7] = rows[i]['supply_different']
        data[i][8] = rows[i]['supply_risk']
        data[i][9] = rows[i]['mobile_risk_area']
        data[i][10] = rows[i]['supply_risk_area']
        data[i][11] = rows[i]['mobile_supply']
        data[i][12] = rows[i]['percard']
        data[i][13] = rows[i]['rvicard']
        data[i][14] = rows[i]['isblack']
    labels = [
        'hnuserid',
        'totalscore',
        'name_deceive',
        'user_new',
        'mobile_virtual',
        'app_correlation',
        'card_different',
        'supply_different',
        'supply_risk',
        'mobile_risk_area',
        'supply_risk_area',
        'mobile_supply',
        'percard',
        'rvicard',
        'isblack',
    ]
    return data, labels
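The loop above copies collected rows into a numpy matrix by hand. A minimal sketch, assuming the same `sqlDF` and that the result fits in driver memory, that builds the same columns and labels via toPandas (the approach also hinted at in Example #6's comments):

pdf = sqlDF.select('hnuserid', 'totalscore', 'name_deceive', 'user_new',
                   'mobile_virtual', 'app_correlation', 'card_different',
                   'supply_different', 'supply_risk', 'mobile_risk_area',
                   'supply_risk_area', 'mobile_supply', 'percard',
                   'rvicard', 'isblack').toPandas()
data, labels = pdf.to_numpy(), list(pdf.columns)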
Example #14
def save_data(rdd):
    """
    Parse the JSON value in each RDD,
    create a Spark SQL DataFrame from the RDD and
    write the DataFrame to HDFS and Oracle DB.
    """
    global flag
    flag = False
    if not rdd.isEmpty():
        # Create df for duplicate handling
        write_log('INFO', 'Consumer_fct_prod.py', 'main', 'Executing max_id')
        df_max_id = spark.read \
            .format("jdbc") \
            .option("driver", DRIVER) \
            .option("url", URL_TARGET_DB) \
            .option("dbtable", "(SELECT max(ID) ID from " + TARGET_DB_TABLE_NAME + ")") \
            .option("user", TARGET_DB_USER_NAME) \
            .option("password", TARGET_DB_USER_PASSWORD) \
            .load()

        max_id = df_max_id.agg({'ID': 'max'}).collect()[0][0]
        if max_id is None:
            max_id = 0
        write_log('INFO', 'Consumer_fct_prod.py', 'main',
                  'Max id executed successfully max_id = {}'.format(max_id))

        rdd = rdd.map(lambda m: parse(m[1]))
        df_fct_prod = sqlContext.createDataFrame(rdd)
        df_fct_prod.createOrReplaceTempView("t")
        result = spark.sql(
            '''select id, event_id, event_time, product_id, customer_id
        from (select row_number() over (partition by _1 order by _3) as RN, _1 as id,_2 as event_id,
        to_timestamp(_3) as event_time,_4 as product_id,_5 as customer_id
                    from t where _1 > ''' + str(max_id) + ''')
        where RN = 1''')

        count = result.count()

        try:
            write_log('INFO', 'Consumer_fct_prod.py', 'main',
                      'Consumer is inserting {} rows to DB'.format(count))

            # Writing to HDFS
            result.write \
                .format("csv") \
                .mode("append") \
                .option("header", "true") \
                .save(HDFS_OUTPUT_PATH)

            # Writing to Oracle DB
            result.write \
                .format("jdbc") \
                .mode("append") \
                .option("driver", DRIVER) \
                .option("url", URL_TARGET_DB) \
                .option("dbtable", TARGET_DB_TABLE_NAME) \
                .option("user", TARGET_DB_USER_NAME) \
                .option("password", TARGET_DB_USER_PASSWORD) \
                .save()

            write_log('INFO', 'Consumer_fct_prod.py', 'main',
                      '{} rows inserted to DB successfully'.format(count))

        except Exception as e:
            print('--> It seems an Error occurred: {}'.format(e))
            write_log('ERROR', 'Consumer_fct_prod.py', 'main', str(e)[:1000])
            flag = True
    else:
        ssc.stop()
    return rdd