Example #1
def main():
    conf = SparkConf().setAppName('Sentiment Analysis_TfIdf')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    train_inputs = sys.argv[1]
    test_inputs = sys.argv[2]

    schema = StructType([
            StructField('reviewText', StringType(), False),
        StructField('overall', DoubleType(), False),
    ])

    
    read_json = sqlContext.read.json(train_inputs, schema)
    read_json.registerTempTable('read_json')
    lowercase = sqlContext.sql("""
    SELECT lower(reviewText) as reviewText, overall as label
    FROM read_json
    """)
    
    regexTokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W")
    
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    
    hashingTF = HashingTF(numFeatures=10, inputCol="filtered", outputCol="rawFeatures")
    
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    
    lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    
    pipeline = Pipeline(stages=[regexTokenizer, remover, hashingTF, idf, lr])
    
    paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

    crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(),
                          numFolds=5)  

    cvModel = crossval.fit(lowercase)
    
    testDF = sqlContext.read.json(test_inputs, schema)
    testDF.registerTempTable('test_data')
    test_data = sqlContext.sql("""
    SELECT lower(reviewText) as reviewText, overall as label
    FROM test_data
    """)
    train_prediction = cvModel.transform(lowercase)
    test_prediction = cvModel.transform(test_data)
    
    evaluator = RegressionEvaluator()

    print "Training dataset RMSE error: %s" %str(evaluator.evaluate(train_prediction))
    print "Testing dataset RMSE: %s" %str(evaluator.evaluate(test_prediction))
Example #2
def func2(rdd):
    '''
        Using what we covered earlier, a quick review of:
        creating a DataFrame from an RDD,
        registering the DataFrame as a table,
        running SQL against it,
        and getting a DataFrame back for saving in various formats.
    '''
    sqlContext = SQLContext(rdd.context)
    newrdd = rdd.map(lambda line: [line])
    df = sqlContext.createDataFrame(newrdd, StructType([StructField(name="content", dataType=StringType())]))
    df.createOrReplaceTempView("data")
    sqlContext.sql("select content from data").show()
Example #3
def main():
	conf = SparkConf().setAppName('ingest logs')
	sc = SparkContext(conf=conf)
	sqlContext = SQLContext(sc)

	inputs = sys.argv[1] 
	output = sys.argv[2]

	#Reading the input file, and then matching the pattern
	file_data = sc.textFile(inputs)
	linere = re.compile("^(\\S+) - - \\[(\\S+) [+-]\\d+\\] \"[A-Z]+ (\\S+) HTTP/\\d\\.\\d\" \\d+ (\\d+)$")
	
	#Mapping the data after fetching the required values out of the Nasa Web server logs file
	KeyValue = file_data.map(lambda line : linere.split(line)).filter(lambda x : len(x)==6).map(lambda y : (y[1],(dt.datetime.strptime(y[2], '%d/%b/%Y:%H:%M:%S')),y[3],y[-2])).cache()
	
	#Mapping the KeyValue RDD as the required format of 4 columns
	Nasa = KeyValue.map(lambda p: {"host": p[0], "datetime": p[1], "path": p[2], "bytes": long(p[3])})
	
	#Converting Nasa to DataFrame and then registering it as Table
	schemaNasa = sqlContext.createDataFrame(Nasa)
	schemaNasa.registerTempTable("NasaLogs")

	#Writing the data into a parquet file
	schemaNasa.write.format('parquet').save(output)
	
	#Reading the data from Parquet file and then Registering it in Table Format
	parquetdata = sqlContext.read.parquet(output)
	parquetdata.registerTempTable("parquetTable")

	#Firing SQL query to count the total number of bytes transferred using SUM(bytes)
	totalbytes = sqlContext.sql("""
    	SELECT SUM(bytes)
    	FROM parquetTable
	""")
	totalbytes.show()
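
For reference, the pattern above targets the NASA web-server access-log format; an illustrative line it matches (a sample of the format, not taken from the actual input):

# in24.inetnebr.com - - [01/Aug/1995:00:00:01 -0400] "GET /shuttle/missions/sts-68/news/sts-68-mcc-05.txt HTTP/1.0" 200 1839
# re.split() with the capturing groups returns 6 pieces ('', host, timestamp, path, bytes, ''),
# which is why only splits of length 6 are kept.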
Example #4
def run_spark():
    spark = SparkSession.builder.master("local").appName("Word Count").getOrCreate()
    df = spark.read.format("csv").option("header", "true").load("hdfs:///project/samples
    sqlContext=SQLContext(spark)
    sqlContext.registerDataFrameAsTable(df, "table1")
    sentences=sqlContext.sql("""SELECT `reviews.rating`,`reviews.text` FROM table1""").rdd
    sentences.collect()
    alist=sentences.map(lambda x:x[0] if x[0] is not None else 0).collect()
    blist=sentences.map(lambda x:get_score(x[1]) if x[1] is not None else 0).collect()
    return alist,blist

def draw_plots(alist,blist):
    alist,blist = run_spark()
    t_plt, = plt.plot(np.arange(1, len(alist)+1), alist, 'r')
    v_plt, = plt.plot(np.arange(1, len(alist)+1), blist)
    plt.title('NLP Emotion Analysis') #
    plt.xlabel('epoch') #
    plt.ylabel('score') #
    plt.legend((t_plt, v_plt), ('rating', 'score')) #
    plt.savefig("result.png")
    with open("x1.txt","w+") as input_:
        input_.write(','.join([str(x) for x in alist]))
        input_.write(','.join([str(x) for x in blist])) 

if __name__=='__main__':
    run_spark()
Example #5
def main():
	conf = SparkConf().setAppName('reddit relative sql score')
	sc = SparkContext(conf=conf)
	sqlContext = SQLContext(sc)

	inputs = sys.argv[1]
	output = sys.argv[2]
	
	#Defining the Schema
	schema = StructType([
    		StructField('subreddit', StringType(), False),
		StructField('score', IntegerType(), False),
	])

	#Reading the reddit json files
	comments = sqlContext.read.json(inputs, schema)
	
	#Registering the Json data in Table Format and Querying the table to get subreddit names and average scores
	comments.registerTempTable('comments')
	averages = sqlContext.sql("""
   	SELECT subreddit, AVG(score)
    	FROM comments
    	GROUP BY subreddit
	""")

	averages.write.save(output, format='json', mode='overwrite')
Example #6
def run():
    conf = SparkConf().setAppName("word count") \
      .setMaster("local[2]")

    context = SparkContext(conf=conf)
    sqlContext = SQLContext(context)
    context.setLogLevel('ERROR')

    data = [('Song', 25), ('Trump', 22), ('Yong', 20), ('Obama', 26)]
    rdd = context.parallelize(data,
                              2).map(lambda x: Row(name=x[0], age=int(x[1])))
    people = sqlContext.createDataFrame(rdd).cache()

    people.printSchema()
    old_guy = people.orderBy('age', ascending=False).take(1)
    print(old_guy)

    same_old_guy = [
        Row(name=x['name'], age=x['age'], other=1) for x in old_guy
    ]
    print(same_old_guy)

    total = people.groupBy().sum('age').collect()[0][0]
    print('Total age is {}'.format(total))

    people.createTempView('people_table')
    new_people = sqlContext.sql(
        'select name, age from people_table order by age desc limit 1')
    new_people.show()
Example #7
def query2(sc, file_in_name, file_out_name):

    rdd_file_data = sc.textFile(file_in_name)

    data_header = rdd_file_data \
        .filter(lambda l: "datetime" in l)

    cites = weather.gen_city_keys(sc)

    header_position = run2.get_position(data_header)

    data = rdd_file_data \
        .subtract(data_header) \
        .flatMap(lambda line: generate_tuple(header_position, line, cites))

    sqlc = SQLContext(sc)

    df = sqlc.createDataFrame(data)
    #df.show()
    df.createOrReplaceTempView("dati")

    query1 = "SELECT country, year, month, " \
             "cast(min(value) as decimal (10,2)) as my_min, " \
             "cast(max(value) as decimal (10,2)) my_max, " \
             "cast(avg(value) as decimal (10,2)) as my_avg," \
             "cast(stddev(value) as decimal (10,2)) as my_std " \
             "FROM dati " \
             "GROUP BY country, year, month"
    df2 = sqlc.sql(query1).orderBy('dati.country', 'dati.year', 'dati.month')
    df2.show()
    '''
            Save data in HDFS
    '''
    df2.coalesce(1).write.format("json").save(file_out_name)
Example #8
class SparkConnector:
    def __init__(self):
        # findspark.init()
        sc = SparkSession.builder \
         .master("local[*]") \
         .appName('SCA') \
         .config('spark.jars.packages', 'com.datastax.spark:spark-cassandra-connector_2.11:2.3.0') \
         .config('spark.cassandra.connection.host', os.environ['DB_ADDR']) \
         .config('spark.cassandra.auth.username', os.environ['DB_USER']) \
         .config('spark.cassandra.auth.password', os.environ['DB_PASS'])  \
         .config('spark.executor.memory', '15g') \
                              .config('spark.driver.memory','6g') \
         .getOrCreate()
        self.sqlContext = SQLContext(sc)
        self.sqlContext.setConf('spark.sql.shuffle.partitions', '10')

    def submit_sql(self, query):
        return self.sqlContext.sql(query).collect()

    def load_and_get_table_df(self, keys_space_name, table_name):
        table_df = self.sqlContext.read \
         .format("org.apache.spark.sql.cassandra") \
         .options(table=table_name, keyspace=keys_space_name) \
         .load()
        return table_df
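
Hypothetical usage of the class above (the keyspace and table names are placeholders, and the DB_* environment variables must already be set):

connector = SparkConnector()
users_df = connector.load_and_get_table_df("my_keyspace", "users")  # placeholder keyspace/table
users_df.show(5)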
Example #9
def main():
    keyspace = sys.argv[1]
    output = sys.argv[2]
    orderkeys = sys.argv[3:]

    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host',
                           ','.join(cluster_seeds))
    conf.set('spark.dynamicAllocation.maxExecutors', 20)
    conf.setAppName('tpch orders sql')
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    df_for_orders(keyspace, sc, sqlContext)
    df_for_lineitem(keyspace, sc, sqlContext)
    df_for_part(keyspace, sc, sqlContext)



    df_orders = sqlContext.sql("""SELECT o.orderkey, o.totalprice, p.name FROM
                                    orders o
                                    JOIN lineitem l ON (o.orderkey = l.orderkey)
                                    JOIN part p ON (l.partkey = p.partkey)
                                    WHERE o.orderkey in (""" + ",".join(orderkeys) \
                                +")")

    rdd_orders = df_orders.rdd
    rdd_orders = rdd_orders.map(map_key) \
               .reduceByKey(reduce_parts)\
               .sortByKey() \
               .map(map_output).coalesce(1)
    rdd_orders.saveAsTextFile(output)
Example #10
def sample_function(sc: SparkContext):
    schema = StructType([StructField("odd_numbers", IntegerType(), True)])

    print(" Odds number sample")
    big_list = range(10)
    rdd = sc.parallelize(big_list, 2)
    odds = rdd.filter(lambda x: x % 2 != 0)
    odds.foreach(my_print)
    sql_context = SQLContext(sc)
    odd_numbers = sql_context.createDataFrame(odds.map(lambda _: Row(_)),
                                              schema)
    odd_numbers.printSchema()
    odd_numbers.show(truncate=False)
    print("odd_numbers count:" + str(odd_numbers.count()))

    odd_numbers.createOrReplaceTempView("odd_numbers_table")
    sql_context.sql("select * from odd_numbers_table limit 2;").show()

    return (odd_numbers)
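
`my_print` is called in the `foreach` above but not defined in this snippet; a minimal stand-in consistent with that use (an assumption, not the original helper):

def my_print(x):
    # assumed helper: print each element (runs on the executors)
    print(x)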
Example #11
def load_parquet(database, table, quiet):
    sc = SparkContext()
    if quiet:
        sc = quiet_log(sc)

    sqlContext = SQLContext(sc)
    sqlContext.setConf('spark.sql.parquet.binaryAsString', 'True')
    print(database, table)
    return sqlContext.sql(
        'Select * from parquet.`/user/hive/warehouse/{:s}.db/{:s}`'.format(
            database, table)), sc, sqlContext
Example #12
    def execute(self, sql):
        """
        This method actually executes the sql query

        :param sql: SQL query to be executed
        :return: Newly created PySpark dataframe
        """
        if sql is None:
            raise ValueError("Query is None")
        self.logger.info("# Executing an SQL query")
        sqlContext = SQLContext(self.spark)
        df = sqlContext.sql(sql)
        return df
Example #13
def main():
    input_keyspace = sys.argv[1]
    output_keyspace = sys.argv[2]

    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host',
                           ','.join(cluster_seeds))
    conf.set('spark.dynamicAllocation.maxExecutors', 20)
    conf.setAppName('tpch orders denormalize cassandra')
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    df_for_part(input_keyspace, sc, sqlContext)
    df_lineitems = df_for_lineitem(input_keyspace, sc, sqlContext)

    df_part = sqlContext.sql("""SELECT l.orderkey, p.name
                                  FROM lineitem l
                                  JOIN part p ON (l.partkey = p.partkey)""")

    df_part_names = rows_to_list(df_part, "orderkey", "name", "names",
                                 sqlContext)

    df_part_names.registerTempTable("part_names")

    df_orders = df_for_orders(input_keyspace, sc, sqlContext)

    df_orders = sqlContext.sql("""SELECT o.*, p.names as part_names
                                    FROM
                                    orders o
                                    JOIN part_names p ON (o.orderkey = p.orderkey) """
                               )

    rdd_orders = df_orders.rdd.map(map_primary_key)

    rdd_orders.saveToCassandra(output_keyspace,
                               'orders_parts',
                               parallelism_level=64)
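
`rows_to_list` is not shown here; a sketch consistent with the call above (collect the `name` values into a `names` list per `orderkey`), assuming it is a thin wrapper over `collect_list`:

from pyspark.sql import functions as F

def rows_to_list(df, key_col, value_col, list_col, sqlContext):
    # assumed implementation: aggregate value_col into a list per key_col
    return df.groupBy(key_col).agg(F.collect_list(value_col).alias(list_col))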
Example #14
def main():
	conf = SparkConf().setAppName('artist_career')
	sc = SparkContext(conf=conf)
	assert sc.version >= '1.5.1'
	sqlContext=SQLContext(sc)
	inputs = sys.argv[1]
	output = sys.argv[2]
	customSchema = StructType([StructField('SongNumber', StringType(),False),StructField('SongID', StringType(),False),StructField('AlbumID', StringType(),False),StructField('AlbumName', StringType(),False),StructField('ArtistID', StringType(),False),StructField('ArtistLatitude', StringType(),False),StructField('ArtistLocation', StringType(),False),StructField('ArtistLongitude', StringType(),False),StructField('ArtistName', StringType(),False),StructField('Danceability', StringType(),False),StructField('Duration', StringType(),False),StructField('KeySignature', StringType(),False),StructField('KeySignatureConfidence', StringType(),False),StructField('Tempo', StringType(),False),StructField('TimeSignature', StringType(),False),StructField('TimeSignatureConfidence', StringType(),False),StructField('Title', StringType(),False),StructField('Year', StringType(),False),StructField('Energy', StringType(),False),StructField('ArtistFamiliarity', StringType(),False),StructField('ArtistMbid', StringType(),False),StructField('SongHotttnesss', StringType(),False),StructField('Loudness', StringType(),False),StructField('StartOfFadeOut', StringType(),False),StructField('EndOfFadeIn', StringType(),False),StructField('ModeConfidence', StringType(),False)])

	df= sqlContext.read.format('com.databricks.spark.csv').options(header='true').load(inputs,schema = customSchema)

	df.registerTempTable('artist_data')

	million_song=sqlContext.sql("SELECT SongNumber,SongID,AlbumID,AlbumName,ArtistID,ArtistLatitude,ArtistLocation,ArtistLongitude,ArtistName,Danceability,Duration,KeySignature,KeySignatureConfidence,Tempo,TimeSignature,TimeSignatureConfidence,Title,Year,Energy,ArtistFamiliarity,ArtistMbid,SongHotttnesss,Loudness,StartOfFadeOut,EndOfFadeIn,ModeConfidence from artist_data where Year!=0 AND ArtistFamiliarity!='nan'")
	million_song.write.format('parquet').save(output)
Example #15
def main():

    sc = SparkContext("local", "Query 1")

    rawWeather, weatherHeader, cities = run.getRDDFromCSV(
        sc, Constants.WEATHER_DESCRIPTION_FILE)

    weatherDescription = rawWeather \
        .subtract(weatherHeader) \
        .filter(lambda l: re.search('^\d{4}-03|^\d{4}-04|^\d{4}-05', l))  # month filter

    daysOfMonth = weatherDescription \
        .flatMap(lambda line: generateTuple(line, cities))

    sqlc = SQLContext(sc)

    df = sqlc.createDataFrame(daysOfMonth)
    df.show()
    df.createOrReplaceTempView("dati")
    query1 = "SELECT city, year, month, day, sum(sunny) as n_sunny_h FROM dati GROUP BY city, year, month, day"
    df2 = sqlc.sql(query1)
    df2.show()

    #apply the sunny-day rule (75%)
    df2.createOrReplaceTempView("dati")
    query2 = "SELECT city, year, month, day FROM dati where n_sunny_h >13"
    df3 = sqlc.sql(query2)
    df3.show()

    #at least 15 clear days per month
    df3.createOrReplaceTempView("dati")
    query2 = "SELECT city, year, month, count(*) AS n_day FROM dati GROUP BY year,city, month"
    df4 = sqlc.sql(query2)
    df4.show()

    #filter by number of days
    df4.createOrReplaceTempView("dati")
    query2 = "SELECT city, year, month, n_day FROM dati where n_day>=15"
    df5 = sqlc.sql(query2)
    df5.show()

    # filter: number of months = 3
    df5.createOrReplaceTempView("dati")
    query2 = "SELECT city, year, count(*) AS n_month FROM dati GROUP BY city, year "
    df6 = sqlc.sql(query2)
    df6.show()

    df6.createOrReplaceTempView("dati")
    query2 = "SELECT city, year FROM dati WHERE n_month = 3 "
    df7 = sqlc.sql(query2)
    df7.show()
Example #16
    def run(sparkContext, source_table_name1, source_table_name2, sink_table_name):
        SparkApp.logger.info(sparkContext.appName + "Starting run()")

        sqlContext = SQLContext(sparkContext)
        SparkApp.logger.info(sparkContext.appName + "Starting jdbc read() !")
        df_hl7 = sqlContext.read.jdbc(url=ConfigFramework.getPostgres_URL()
                                        , table=source_table_name1
                                        , properties=ConfigFramework.getPostgres_Properties())
        df_fhir = sqlContext.read.jdbc(url=ConfigFramework.getPostgres_URL()
                                        , table=source_table_name2
                                        , properties=ConfigFramework.getPostgres_Properties())
        SparkApp.logger.info(sparkContext.appName + "End jdbc read() !")
        print(df_hl7.show())
        print(df_fhir.show())

        df_hl7.registerTempTable("lab_obs")
        df_fhir.registerTempTable("medications")

        df_sql_result = sqlContext.sql("SELECT m.patient_id, m.resource_medicationReference_display , m.resource_authoredOn_dt FROM medications m "
                                       "JOIN (SELECT patient_id, "
                                       "CASE WHEN ob_value_noted = 'Notdetected' AND lag(ob_value_noted) "
                                       "OVER (partition by patient_id order by l.ob_dt_cleaned) = 'Detected' "
                                       "THEN 1 ELSE 0  END as qualify "
                                       "FROM lab_obs l) l ON l.patient_id = m.patient_id AND l.qualify = 1")

        # df_sql_result = sqlContext.sql("select m.resource_medicationReference_display, m.patient_id from medications m where "
        #                                "m.resource_authoredOn_dt not in (select l.ob_dt_cleaned from lab_obs l where l.patient_id = m.patient_id and l.ob_value_noted = 'Detected' ) "
        #                                )
                                       # "and "
                                       # "m.resource_authoredOn_dt < any (select l.ob_dt_cleaned from lab_obs l where l.patient_id = m.patient_id and l.ob_value_noted = 'Notdetected' )"
                                       # )
        print(df_sql_result.show())

        SparkApp.logger.info(sparkContext.appName + "Starting jdbc write() !")
        df_sql_result.write.jdbc(url=ConfigFramework.getPostgres_URL(), table=sink_table_name, mode="overwrite"
                       , properties=ConfigFramework.getPostgres_Properties())
        SparkApp.logger.info(sparkContext.appName + "End jdbc write() !")

        SparkApp.logger.info(sparkContext.appName + "Ending run()")
Example #17
def main():
    start = time.time()

    conf = SparkConf().setMaster("local[*]") \
                      .setAppName("secondaryStructureSegmentDemo")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # retrieve PDB annotations: binding affinities (Ki, Kd),
    # group name of the ligand (hetId), and the
    # Enzyme Classification number (ecNo)
    ds = customReportService.getDataset(["Ki", "Kd", "hetId", "ecNo"])

    # Show schema of this dataset
    ds.printSchema()

    # Select structures that either have Ki or Kd value(s) and
    # are protein-serine/threonine kinases (EC 2.7.1.*):

    # A. By using dataset operations
    ds = ds.filter(
        "(Ki IS NOT NULL OR Kd IS NOT NULL) AND ecNo LIKE '2.7.11.%'")
    ds.show(10)

    # B. by creating a temporary query and running SQL
    ds.createOrReplaceTempView("table")
    ds = sqlContext.sql(
        "SELECT * from table WHERE (Ki IS NOT NULL OR Kd IS NOT NULL) AND ecNo LIKE '2.7.11.%'"
    )
    ds.show(10)

    end = time.time()

    print("Time: %f  sec." % (end - start))

    sc.stop()
Example #18
    lines = sc.textFile(file_path)

    info = lines.map(lambda lines: lines.split("----")). \
        map(lambda info: Row(email=info[0], username=info[1], realname=info[2],
                             idcard=info[3], password=info[4], phone=info[5]))

    schemaInfo = sqlContext.createDataFrame(info)
    schemaInfo.registerTempTable("information")
    # cache the table
    #sqlContext.cacheTable("information")
    #sqlContext.uncacheTable("information")
    """
    :邮箱分析与统计
    """
    email_str = "SELECT analysis_email(email) AS email FROM information"
    emailSQL = sqlContext.sql(email_str)
    # total count
    count = emailSQL.count()
    # grouped counts
    emailCollect = emailSQL.groupBy("email").count().collect()
    # email analysis results
    result_email(emailCollect, count)
    """
    :用户名与姓名分析与统计
    """
    # 用户名长度统计
    username_len_str = "SELECT LENGTH(username) AS username_len FROM information"
    usernameSQL = sqlContext.sql(username_len_str)
    usernameLenCollect = usernameSQL.groupBy("username_len").count().collect()
    result_username_len(usernameLenCollect, count)
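
The email query above relies on a UDF named `analysis_email` that is registered elsewhere; a minimal sketch of such a registration (the domain-extraction logic is an assumption):

sqlContext.registerFunction(
    "analysis_email",
    lambda email: email.split("@")[-1] if email and "@" in email else "unknown",
)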
Example #19
from pyspark import SparkConf, SparkContext, SQLContext

conf = SparkConf()

context = SparkContext(conf=conf)

sql_context = SQLContext(context)

dfmain = sql_context.read.json("/home/spark/Desktop/reviews_Musical_Instruments.json")  # load data into a dataframe

dfmain.show()  # validation to see whether the data has loaded properly

dfmain.createOrReplaceTempView("json_view")  # creating a temp view from the dataframe

dfnew = sql_context.sql("select reviewerID,asin,overall from json_view")  # forming a new dataframe with the desired fields

dfnew.show()  # validation to see whether the data has loaded properly

rdd_allreviews = dfnew.select('reviewerID', 'asin', 'overall').rdd  # converting the dataframe into an RDD

print rdd_allreviews.take(10)  # printing the first 10 rows to verify the data

training_RDD, test_RDD = rdd_allreviews.randomSplit([8, 2], seed=0L)  # splitting the original RDD into train and test with randomSplit
    print ("Successfully imported Spark Modules -- `SparkContext, SQLContext`")
    print_horizontal()
except ImportError as e:
    print ("Can not import Spark Modules", e)
    sys.exit(1)

sqlContext = SQLContext(sparkContext=sc)

# Loads parquet file located in AWS S3 into RDD Data Frame
parquetFile = sqlContext.read.parquet("s3://jon-parquet-format/nation.plain.parquet")

# Stores the DataFrame into an "in-memory temporary table"
parquetFile.registerTempTable("parquetFile")

# Run standard SQL queries against temporary table
nations_all_sql = sqlContext.sql("SELECT * FROM parquetFile")

# Print the result set
nations_all = nations_all_sql.map(lambda p: "Country: {0:15} Ipsum Comment: {1}".format(p.name, p.comment_col))

print("All Nations and Comments -- `SELECT * FROM parquetFile`")
print_horizontal()
for nation in nations_all.collect():
    print(nation)

# Use standard SQL to filter
nations_filtered_sql = sqlContext.sql("SELECT name FROM parquetFile WHERE name LIKE '%UNITED%'")

# Print the result set
nations_filtered = nations_filtered_sql.map(lambda p: "Country: {0:20}".format(p.name))
Example #21
sqc = SQLContext(sc)

#1)Create RDD emps by importing that file into Spark.
emps = sc.textFile(spark_root_dir + '/data/emps.txt')
#2)Next create a new RDD emps_fields by transforming the content of every line in RDD emps into
#    a tuple with three individual elements by splitting the lines on commas.
emps_fields = emps.map(lambda line: line.split(',')).map(
    lambda row: tuple(row))
#emps_fields = emps.map(lambda line: line.split(','))

#3)You need to apply "constructor" Row to every tuple in RDD emps_fields
employees = emps_fields.map(
    lambda e: Row(name=e[0], age=int(e[1]), salary=float(e[2])))

#You generate a DataFrame by passing an RDD of Row elements to the method createDataFrame() of class SQLContext.
df = sqc.createDataFrame(employees)
#Show the content of new DataFrame.
df.show()

tmp_table_name = 'tmp_employees'
#Transform this DataFrame into a Temporary Table
df.registerTempTable(tmp_table_name)
#select names of all employees who have a salary greater than 3500
sqc.sql('SELECT * FROM {0} WHERE salary > 3500'.format(tmp_table_name)).show()

#print emps_fields.collect()
#print type(emps_fields.take(1)[0])
#print employees.take(1)

#paragraphA = linesA.flatMap(lambda line: line.split(' ')).map(lambda word: clean(word))
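
The split/Row steps above assume each line of emps.txt is `name,age,salary`; illustrative sample contents (not the original file):

# John,34,4200.0
# Mary,29,3950.5
# Alan,41,3300.0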
Example #22
    sqlContext = SQLContext(sc)

    # path to hillary/enron avro
    enr = sqlContext.read.format(
        "com.databricks.spark.avro").load(
            "s3n://datasets-396316040607/enron_data/*.avro").repartition(16)
    hil = sqlContext.read.format(
        "com.databricks.spark.avro").load(
            "s3n://datasets-396316040607/hillary/*.avro").repartition(16)

    # register tables
    sqlContext.registerDataFrameAsTable(hil, "hillary")
    sqlContext.registerDataFrameAsTable(enr, "enron")

    # register udf
    sqlContext.registerFunction(
        "getCos", lambda x, y: get_cosine(text_to_vector(x), text_to_vector(y))
    )

    # do the cosine similarity on the text, get the top 1000 matches
    out = sqlContext.sql("SELECT h.author h_auth, e.author e_auth, "
                         "e.contents e_mail, h.contents h_mail, "
                         "getCos(e.contents, h.contents) as cos_sim "
                         "from hillary as h join enron as e order by cos_sim "
                         "desc limit 1000")

    # write back out to s3
    out.save("s3n://datasets-396316040607/cos_sim/", format="json")
Example #23
# Databricks notebook source
from pyspark import SparkContext
from pyspark import SQLContext
from pyspark.sql.functions import *
target_query="SELECT * from databse_name.byod_dbfs_table"
sparkContext = SparkContext.getOrCreate()
sqlContext = SQLContext(sparkContext)
dataframe = sqlContext.sql(target_query)
dataframe.repartition(1).write.format('com.databricks.spark.csv').options(delimiter=",").save("s3n://amgen-edl-acux-aaaa123-bkt/BYOD/", header="true", mode="overwrite")

# COMMAND ----------

print "test2"

# COMMAND ----------

print "test3"
def filterData(record):
  flag = True
  if (int(record[-4])<1) or (record[-2] not in (['1','4'])) or (record[-1] != ''): flag = False
  return flag

if __name__ == '__main__':
  sc = SparkContext(appName = 'CF_prod_in_transaction')
  sqlContext = SQLContext(sc)
  in_file = sc.textFile(sys.argv[1])
  data = in_file.map(oritentData).filter(filterData).map(lambda x: [int(i) for i in x[:-3]])
  Record = Row('customer_id','product_id','invoice_id','units')
  data = data.map(lambda x: Record(*x))
  data = sqlContext.createDataFrame(data)
  sqlContext.registerDataFrameAsTable(data,'table1')
  df = sqlContext.sql('select customer_id, product_id, sum(units) as prod_in_transactions from table1 group by customer_id, product_id')
  df.map(lambda x: ','.join([str(r) for r in x])).saveAsTextFile(sys.argv[2])
  sc.stop()



Example #24
data_path,header,train_sample,number,support,confidence,lift,k,testing,testing_split,seed,output_path
write = open('test.csv','w')
wrtr = csv.writer(write)

import csv
read = open('arqiva.csv')
for line in read: wrtr.writerow(line)

from e
envelope = Envelope(
Example #25
lines_with_keyword = textFile.filter(lambda line: "@TMobileHelp" in line)

print "Number of tweets with TMobileHelp: " + str(lines_with_keyword.count())
print lines_with_keyword.lookup("text")
#print lines_with_keyword
#schemaTweets = sqlContext.createDataFrame(lines_with_keyword)
#schemaTweets.registerTempTable("tweets")
#row = Row("text")
#lines_with_keyword.map(row).toDF()
#lines_with_keyword.printSchema()
#print tweets.take(5)
#print keyword_onlytext.take(5)


df = sqlContext.read.json(currentFile)
#df.printSchema()
#df_distinctlang = df.select("lang").distinct()
#df_lang = sql.read.json("TwitterLanguages.json")
#df_join =df_distinctlang.join(df_lang, df_distinctlang.lang == df_lang.code).drop(df_lang.code).drop(df_lang.status)
#df_join.show()
df.registerTempTable("df")
df_text = df.select("text")
#df_text.printSchema()
df_sql = sqlContext.sql("SELECT text FROM df WHERE text like '%TMobileHelp%'")


print df_sql.collect()

#df_sql.rdd.map(lambda x: ",".join(map(str, x))).coalesce(1).saveAsTextFile("file.csv")
#df.toPandas().to_csv('mycsv.csv')
Example #26
if __name__ == "__main__":
    file_path = os.path.abspath("../doc/book.txt")
    print file_path

    conf = SparkConf().setAppName("schema_test").setMaster("local")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    lines = sc.textFile(file_path)
    # split each line on commas
    parts = lines.map(lambda lines: lines.split(","))

    # map each line to a Row (define the table structure)
    book = parts.map(lambda book: Row(name=book[0], author=book[1], price=float(book[2]), publish=book[3]))

    # create a DataFrame with the schema and register it as a table
    schemaPeople = sqlContext.createDataFrame(book)
    schemaPeople.registerTempTable("book")

    # define the SQL statement (books priced between 50 and 60)
    book = sqlContext.sql("SELECT * FROM book WHERE price > 50.0 AND price < 60 OR name LIKE '%Spark%'")

    # map the query results
    bookMap = book.map(lambda books: (books.name, books.author, books.price, books.publish))

    for book in bookMap.collect():
        print "|Name: " + book[0], "|Author: " + book[1], "|Price: " + str(book[2]), "|Publish: " + book[3] + "|"

    sc.stop()
Example #27
        print "|Time: " + row[0], "|Name: " + row[1], "|Oid: " + row[2], "|Value: " + str(row[3]) + "|"

        his.registerTempTable("his_data_zadd")
        #sqlContext.cacheTable("his_data_zadd")

        #assets = sqlContext.sql("SELECT his.name, his.oid FROM his_data_zadd as his WHERE his.value > 200 AND his.oid < 3000000")
        #sql_str = "SELECT his.name, his.oid FROM (SELECT MAX(temp_t.value), temp_t.name, temp_t.oid FROM his_data_zadd AS temp_t) his"
        #sql_str = "SELECT his.name, his.oid, his.value FROM his_data_zadd AS his ORDER BY his.value DESC LIMIT 10"
        #sql_str = 'SELECT his.name, his.oid FROM his_data_zadd AS his WHERE EXISTS (SELECT MAX(temp_t.value) FROM his_data_zadd AS temp_t)'
        """
        Spark 1.5 does not support subquery.
        """
        #sql_str = 'SELECT MAX(his_t.value) as max_value FROM his_data_zadd AS his_t GROUP BY his_t.name,his_t.oid,his_t.collect_time,his_t.value'
        sql_str = 'SELECT * FROM his_data_zadd his_t ORDER BY his_t.value DESC LIMIT 1'
        #sql_str = 'SELECT * FROM (SELECT his_t.value,his_t.name FROM his_data_zadd his_t WHERE his_t.value > 100)'
        assets = sqlContext.sql(sql_str)

        # store the result in parquet format
        #assets.saveAsParquetFile("hdfs://localhost:9000/user/parquet_1")

        assets.show()

        # map the query results
        # assetMap = assets.map(lambda asset: (asset.name, asset.value)).foreachPartition(print_fun)
        #assetMap = assets.map(lambda asset: (asset.name, asset.value))
        #assetMap = assets.map(lambda asset: (asset.max_value))
        assetMap = assets.map(lambda asset: (asset.name, asset.oid, asset.value))
        collect_asset = assetMap.collect()

        print_fun(collect_asset)
Example #28
sparkConf = (SparkConf()
             .set(key="carbon.insert.batch.size", value="1000")
             .set(key="spark.driver.allowMultipleContexts",value="true")
             .set(key="spark.executor.extraJavaOptions", value="-Dwso2_custom_conf_dir=/home/supun/Downloads/wso2das-3.1.0/repository/conf"))

#Create a new SparkContext using the above SparkConf.
sparkCtx = SparkContext(conf=sparkConf)

# Check spark master.
print(sparkConf.get("spark.master"));


###################### Get data from DAS table #####################
sqlCtx = SQLContext(sparkCtx)
sqlCtx.sql('CREATE TEMPORARY TABLE table1 ' +
           'USING org.wso2.carbon.analytics.spark.core.sources.AnalyticsRelationProvider ' +
           'OPTIONS (tenantId "-1234", tableName "IRIS_DATA_STREAM")')

df = sqlCtx.sql("SELECT * FROM table1");



##################### Prepare the data #####################
assembler = VectorAssembler(inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
                            outputCol="features")

assembledDf = assembler.transform(df)

assembledDf.show()

transformedDf = assembledDf.select(col("class").alias("label"), col("features"))
Example #29
    df = sqlContext.read.parquet(logFile)
    destDF=df.select('logintype','logtype','hosid','suppid','logtime','usermac','gwid').map(lambda x:trimf(x))
    fields = [
        StructField('logintype', StringType(), True),
        StructField('logtype', StringType(), True),
        StructField('hosid', StringType(), True),
        StructField('suppid', StringType(), True),
        StructField('logtime', LongType(), True),
        StructField('usermac', StringType(), True),
        StructField('gwid', StringType(), True)
        ]
    schema = StructType(fields)
    schemaDest = sqlContext.createDataFrame(destDF, schema)
    schemaDest.registerTempTable("loginflowlog")

    sqlContext.registerFunction("todatestr", lambda x:longTime2str(x),StringType())
    sqlContext.registerFunction("trimx", lambda x:trimx(x),StringType())
    midDF = sqlContext.sql("select count(1) userlogintimes,count(distinct(usermac)) userlogincount,hosid,gwid,todatestr(logtime) day from loginflowlog "
                           "where logtype like '5-%-arrive' and gwid!='' group by hosid,gwid,todatestr(logtime)")

    hosiddayList=midDF.rdd.map(lambda x:(x[2],x[3],x[4],x[2],x[3],x[4])).collect()
    resultList=midDF.rdd.collect()

    dao=MysqlDao()
    dao.insertMany('INSERT INTO `bblink_data`.`bblink_data_hos_subject` (`hosid`,`gwid`,`day`)VALUES(%s,%s,%s) ON DUPLICATE KEY UPDATE hosid=%s,gwid=%s,day=%s',hosiddayList)


    dao.insertMany("update `bblink_data`.`bblink_data_hos_subject` set userlogintimes=%s,userlogincount=%s where hosid=%s and gwid=%s and day=%s",resultList);

Example #30
def main():
     conf = SparkConf().setAppName('Scenario')
     sc = SparkContext(conf=conf)
     assert sc.version >= '1.5.1'
     sqlContext = SQLContext(sc)
     read_parq=sqlContext.read.parquet(inputs).cache()

     read_parq.registerTempTable('data')
     scenario_1=sqlContext.sql("""
     SELECT ArtistName , MAX(ArtistFamiliarity) AS MaxFamiliarity
     FROM data
     where ArtistFamiliarity!='nan' and ArtistLocation is NOT NULL
     GROUP BY ArtistName
     ORDER BY MAX(ArtistFamiliarity) DESC
     LIMIT 100
     """)
     #scenario_1.show()
     scenario_1.registerTempTable('scenario1')

     joined_artist=sqlContext.sql("""
     SELECT A.ArtistName , B.ArtistFamiliarity,B.SongID,B.SongHotttnesss,B.ArtistLocation,B.ArtistLatitude,B.ArtistLongitude
     FROM scenario1 A
     INNER JOIN data B ON A.ArtistName=B.ArtistName
     """)

     #joined_artist.show()
     joined_artist.registerTempTable('joined_artist')

     AvgFamiliarity=sqlContext.sql("""
     SELECT first(ArtistName) AS Artist_Name, MIN(ArtistFamiliarity) AS MinFamiliarity,MAX(ArtistFamiliarity) AS MaxFamiliarity,ArtistLocation AS Location, first(ArtistLatitude) AS Latitude, first(ArtistLongitude) AS Longitude
     FROM joined_artist
     where ArtistFamiliarity!='nan' and ArtistLatitude <> '' and ArtistLatitude is not NULL
     GROUP BY ArtistLocation
     ORDER BY MAX(ArtistFamiliarity) DESC
     """)


     #AvgFamiliarity.show()

     AvgFamiliarity.rdd.map(tuple).coalesce(1).saveAsTextFile(output+'/scenario1')


     scenario_2=sqlContext.sql("""
     SELECT ArtistName , MAX(SongHotttnesss) as MaxHotttness
     FROM data
     where SongHotttnesss!='nan'
     GROUP BY ArtistName
     ORDER BY MAX(SongHotttnesss) DESC
     LIMIT 10
     """)
     scenario_2.registerTempTable('scenario2')

     joined_artist_hotness=sqlContext.sql("""
     SELECT B.Title, A.ArtistName , B.SongHotttnesss,B.ArtistFamiliarity
     FROM scenario2 A
     INNER JOIN data B ON A.ArtistName=B.ArtistName and A.MaxHotttness=B.SongHotttnesss
     """)
     #joined_artist_hotness.show()
     joined_artist_hotness.rdd.map(tuple).coalesce(1).saveAsTextFile(output+'/scenario2')

     #Of a particular artist
     scenario_3=sqlContext.sql("""
     SELECT ArtistName , Year, AVG(ArtistFamiliarity) AS AvgFamiliarity,COUNT(SongID) AS Total_Songs
     FROM data
     where ArtistName='Britney Spears' and Year!=0
     GROUP BY Year,ArtistName
     ORDER BY Year
     """)


     scenario_3.rdd.map(tuple).coalesce(1).saveAsTextFile(output+'/scenario3')

     scenario_4=sqlContext.sql("""
     SELECT ArtistName , MAX(ArtistFamiliarity)-MIN(ArtistFamiliarity) AS Difference
     FROM data
     GROUP BY ArtistName
     """)
     scenario_4.registerTempTable('scenario4')

     top_diff=sqlContext.sql("""
     SELECT ArtistName , MAX(Difference) as MAXDiff
     FROM scenario4
     GROUP BY ArtistName
     ORDER BY MAXDiff DESC
     LIMIT 10
     """)
Example #31
def main():
    conf = SparkConf().setAppName("climate")
    sc = SparkContext(conf=conf)

    sqlContext = SQLContext(sc)
    climateSchema = StructType(
        [
            StructField("station", StringType(), False),
            StructField("date", IntegerType(), False),
            StructField("element", StringType(), False),
            StructField("value", IntegerType(), True),
            StructField("mflag", StringType(), True),
            StructField("qflag", StringType(), True),
            StructField("sflag", StringType(), True),
            StructField("obstime", StringType(), True),
        ]
    )
    info = sqlContext.read.format("com.databricks.spark.csv").options(header="false").schema(climateSchema).load(inputs)
    info.registerTempTable("info")
    stationinfo = sqlContext.sql("SELECT station, date, element, value, FLOOR(date/10000) as yy FROM info ")
    stationinfo.registerTempTable("stationinfo")
    stationinfo.cache()

    prcpTable = sqlContext.sql("SELECT station, date, value as prcp, yy FROM stationinfo WHERE element='PRCP' ")
    prcpTable.registerTempTable("prcpTable")
    prcpTable.cache()
    # prcpTable.show()

    # create 3 tables that hold the monthly average of min, max temperature and prcp
    yearlyprcp = sqlContext.sql(
        "SELECT station, yy, ROUND(Avg(prcp),0) as avg_prcp FROM prcpTable GROUP BY station, yy "
    )
    yearlyprcp.registerTempTable("prcpMean")
    # yearlyprcp.show()

    # get information about stations from stations.txt

    def getdata(line):
        line = line.split("  ")
        values = [x.strip() for x in line]
        return values

    stations = sc.textFile(input2)
    stations = stations.map(getdata)
    stations = stations.map(lambda (a, b, c): Row(station=a, latitude=float(b), longitude=float(c))).cache()
    stationDF = sqlContext.createDataFrame(stations)
    stationDF.registerTempTable("StationTable")
    stationDF.cache()

    # param = sqlContext.sql("SELECT MAX(latitude) as max_lat, Min(latitude) as min_lat, MAX(longitude) as max_long, Min(longitude) as min_long FROM StationTable")
    # param.show()

    # Join to station file to add latitude and longitude and stationID
    result = (
        stationDF.join(yearlyprcp)
        .where(stationDF.station == yearlyprcp.station)
        .select(yearlyprcp.avg_prcp, yearlyprcp.station, yearlyprcp.yy, stationDF.latitude, stationDF.longitude)
    )

    # save into parquet file
    result.write.format("parquet").save(output)
Example #32
productsFile.write.mode("overwrite").format("parquet").option("path", "<s3 path>").saveAsTable("staging.products")

# Load orders data from S3 into a DataFrame
ordersFile = sqlContext.read.parquet("<s3 path>")

ordersFile.registerTempTable("orders")

#ordersFile.write.mode("overwrite").saveAsTable("retail.orders")

ordersFile.write.mode("overwrite").format("parquet").option("path", "<s3 path>").saveAsTable("staging.orders")

# Load customers data from S3 into a DataFrame
customersFile = sqlContext.read.parquet("<s3 path>")

customersFile.registerTempTable("customers")

#ordersFile.write.mode("overwrite").saveAsTable("retail.orders")

customersFile.write.mode("overwrite").format("parquet").option("path", "<s3 path>").saveAsTable("staging.customers")

# Join orders and products to get the sales rollup
sales_breakup_sql = sqlContext.sql("SELECT sum(orders.price) total_sales, products.sku, products.product_category "
                                   " FROM orders join products where orders.sku = products.sku group by products.sku, products.product_category")

#products_all = products_sql.map(lambda p: "Counts: {0} Ipsum Comment: {1}".format(p.name, p.comment_col))
sales_breakup_sql.show(n=2)

# Write output back to s3 under processed
sales_breakup_sql.write.mode('overwrite').format("parquet").option("path", "<s3 path>").saveAsTable("processed.sales")
Example #33
from pyspark import SQLContext
from pyspark import SparkContext, SparkConf

conf = SparkConf().setAppName("Problem 13")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
employee = sqlContext.read.json("/user/smartlin1/data/employee.json")
# Register a temp table using registerTempTable
employee.registerTempTable("Employee")
# Query and print data
employeeInfo = sqlContext.sql("SELECT * FROM Employee")
for row in employeeInfo.collect():
    print(row)

Example #34
#setting the configurations for the SparkConf object here
conf = (SparkConf()
         .setMaster("local[4]")
         .setAppName("convert.py")
        .set("spark.executor.memory", "1g"))

#creating the SparkContext here
sc = SparkContext(conf = conf)

#creating the sqlContext that will be used
sqlContext = SQLContext(sc)

#reading the parquet file

#Change this line to be the directory where the parquet file exists
parquetFile = sqlContext.read.parquet('data/test2') 


parquetFile.registerTempTable("parquetFile")

#Queries are made from base + command.

#base SELECTs the columns of interest, up to the WHERE clause
base = "SELECT * FROM parquetFile WHERE"

#command is the filter condition appended to base.
command = ' ip_len >= 1500'
test = sqlContext.sql(base + command)
print test.show()
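
Another illustrative `command` under the same base + command pattern (the threshold is an example value, not from the original):

command = ' ip_len < 64'   # hypothetical predicate on the same column
test_small = sqlContext.sql(base + command)
print test_small.show()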
Example #35
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

df_wordgame = sqlContext.read.format("csv").option("header",
                                                   "true").csv('wordgame.csv')

df_wordgame.registerTempTable("wordgame")

print("-----------------------------wordgame---------------------------------")

df_wordgame.show()

# removing duplicates
after_duplicate = sqlContext.sql(
    'select author,word1 as id,word2,source,sourceID from wordgame group by  author,word1,word2,source,sourceID limit 1000'
)
after_duplicate.registerTempTable("newwordgame")

vertices = sqlContext.sql(
    'select author,id,word2,source,sourceID from newwordgame group by  author,id,word2,source,sourceID'
)
edges = sqlContext.sql(
    'select id as src,word2 as dst, source as source from newwordgame group by id,word2,source'
)

# importing graph frames
from graphframes import *
graph = GraphFrame(vertices, edges)
pageRank = graph.pageRank(resetProbability=0.15, maxIter=1)
print("---------------------------vertices----------------------")
Example #36
        root
        |-- lpep_pickup_datetime: string (nullable = true)
        |-- Pickup_longitude: double (nullable = true)
        |-- Pickup_latitude: double (nullable = true)
        |-- Dropoff_longitude: double (nullable = true)
        |-- Dropoff_latitude: double (nullable = true)
        |-- Passenger_count: integer (nullable = true)
        |-- Total_amount: double (nullable = true)
        |-- Trip_type: integer (nullable = true)
    """
    tripsData.registerTempTable("tripsData")

    # The trips whose pickup time is between 2am -- 6am
    GIS_Points = sqlContext.sql("select lpep_pickup_datetime, Pickup_longitude, Pickup_latitude from tripsData " +
                                "where lpep_pickup_datetime like '%2015-__-__ 02:%' "
                                "or lpep_pickup_datetime like '%2015-__-__ 03:%' "
                                "or lpep_pickup_datetime like '%2015-__-__ 04:%' "
                                "or lpep_pickup_datetime like '%2015-__-__ 05:%' "
                                ).cache()
    sqlContext.dropTempTable("tripsData")

    # This is a list
    shapeRecs = nyc_shapefile.shapeRecords()

    def point_in_poly(x, y, poly):

        n = len(poly)
        inside = False

        p1x, p1y = poly[0]
        for i in range(n + 1):
            p2x, p2y = poly[i % n]
if __name__ == "__main__":
    json_path = os.path.abspath("../doc/book.json")
    txt_path = os.path.abspath("../doc/book.txt")

    conf = SparkConf().setAppName("mul_sources").setMaster("local[2]")
    sc = SparkContext(conf=conf)

    sqlContext = SQLContext(sc)

    # read the JSON file and register it as a table
    json_df = sqlContext.read.json(json_path)
    json_df.registerTempTable("json_book")

    # read the txt file and register it as a table
    lines = sc.textFile(txt_path)
    parts = lines.map(lambda lines: lines.split(","))
    book = parts.map(lambda book: Row(name=book[0], author=book[1], price=float(book[2]), publish=book[3]))
    schemaPeople = sqlContext.createDataFrame(book)
    schemaPeople.registerTempTable("txt_book")

    # sql_book = sqlContext.sql("SELECT * FROM json_book AS jbook LEFT JOIN txt_book AS tbook ON tbook.name=jbook.name")
    sql_book = sqlContext.sql("SELECT * FROM json_book AS jbook , txt_book AS tbook "
                              "WHERE jbook.name=tbook.name ORDER BY tbook.price")

    bookMap = sql_book.map(lambda books: (books.name, books.author, books.price, books.publish, books.place))

    for book in bookMap.collect():
        print book[0], book[1], book[2], book[3], book[4]

    sc.stop()
Example #38
    spark_home = '/opt/cloud/spark'
    os.environ['SPARK_HOME'] = spark_home
    conf = (SparkConf()
            .setMaster(master)
            .setAppName(appName)
            .set("spark.sql.parquet.binaryAsString","true")
            )
    sc = SparkContext(conf = conf)
    sql_context = SQLContext(sc)
    sql_context.registerFunction("to_mac", lambda x: normal_mac(x), StringType())

    parquet_df = sql_context.read.parquet(input)
    sql_context.registerDataFrameAsTable(parquet_df, "loginflowlog")
    #_sql = "select to_mac(upper(usermac)),count(distinct dat) days from loginflowlog group by to_mac(upper(usermac))"
    _sql = "select to_mac(upper(usermac)),count(distinct logtime) days from loginflowlog group by to_mac(upper(usermac))"
    rs_df = sql_context.sql(_sql)
    rs = rs_df.collect()
    logger.info("---->" + str(len(rs)))

    lists = []
    for r in rs:
        usermac = r[0]
        days = r[1]
        t = (usermac,days)
        lists.append(t)
        #logger.debug(t)

    dao = MysqlDao()

    _sql = "TRUNCATE TABLE user_days"
    dao.insert(_sql)
Example #39
File: MP3_PartC.py  Project: samvalenp/mp8
# Spark SQL - DataFrame API

####
# 3. Filtering (10 points) Count the number of appearances of word 'ATTRIBUTE'
####

schema = StructType([StructField("word", StringType(), True), 
	StructField("count1", IntegerType(), True),
	StructField("count2", IntegerType(), True),
	StructField("count3", IntegerType(), True)])


df = sqlContext.read.csv('gbooks', schema=schema, sep='\t')


#df.select("word", df.word.like("ATTRIBUTE")).show(10)
#df.word.like("ATTRIBUTE").show(10)

df.createOrReplaceTempView("tableTemp")
sqlContext.sql("SELECT count(*) from tableTemp where word ='ATTRIBUTE'").show()
# Spark SQL

# +--------+                                                                      
# |count(1)|
# +--------+
# |     201|
# +--------+


Example #40
from pyspark import SparkContext, SparkConf
from pyspark import SQLContext
import itertools

def print_fun(collect):
    for item in itertools.chain(collect):
        print "|Name: " + item[0], "|Value: " + str(item[1]), "|Attribute: " + item[2]

if __name__ == "__main__":
    conf = SparkConf().setAppName("json_ops").setMaster("local[3]")
    sc = SparkContext(conf=conf)

    sqlContext = SQLContext(sc)

    # parallelize the JSON strings directly into an RDD
    equipmentRDD = sc.parallelize(['{"name":"asset1","value":1,"attribute":"属性1"}',
                                   '{"name":"asset2","value":2,"attribute":"属性2"}',
                                   '{"name":"asset3","value":3,"attribute":"属性3"}'])

    equipment = sqlContext.jsonRDD(equipmentRDD)
    equipment.registerTempTable("equipment")

    assets = sqlContext.sql("SELECT * FROM equipment as eq WHERE eq.value >= 1 AND eq.value <= 2")

    assets.show()

    # map the query results
    assetMap = assets.map(lambda asset: (asset.name, asset.value, asset.attribute)).foreachPartition(print_fun)

    sc.stop()
Example #41
# ## Filtering
#
# ### Select structures that either have Ki or Kd values(s) and are protein-serine/threonine kinases (EC 2.7.1.*)
#
#
# #### A. By using dataset operations

# In[5]:

ds = ds.filter("(Ki IS NOT NULL OR Kd IS NOT NULL) AND ecNo LIKE '2.7.11.%'")

ds.show(10)

# #### B. By creating a temporary query and running SQL

# In[6]:

ds.createOrReplaceTempView("table")

ds = sqlContext.sql(
    "SELECT * from table WHERE (Ki IS NOT NULL OR Kd IS NOT NULL) AND ecNo LIKE '2.7.11.%'"
)

ds.show(10)

# ## Terminate Spark

# In[7]:

sc.stop()
Example #42
conf = SparkConf().setAppName('bag_words')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

with open('clusterFinal.pickle', 'rb') as f:
	cluster=pickle.load(f)


schema = StructType([
    StructField('reviewText', StringType(), False),StructField('overall', FloatType(), False),StructField('reviewTime', StringType(), False)
])

df = sqlContext.read.json(input, schema=schema)
df.registerTempTable('review_table')
sd=sqlContext.sql("""
    SELECT reviewText FROM review_table
""")
fin=sd.rdd.map(lambda x: str(x.reviewText)).map(clean_words)
sparse_vectors=fin.map(get_sparseVector)
time=sqlContext.sql("""
    SELECT reviewTime FROM review_table
""")
time_split=time.rdd.map(lambda x: str(x.reviewTime)).map(lambda line: line.split(', '))
year_list=time_split.map(lambda (x,y):y).collect()

score=sqlContext.sql("""
    SELECT overall FROM review_table
""")
score_list=score.rdd.map(lambda x:str(x.overall)).collect()
sparse_list=sparse_vectors.collect()
zip_list=zip(sparse_list, year_list, score_list)
Example #43
from pyspark import SQLContext
from pyspark import SparkContext

sc = SparkContext()
sqlContext = SQLContext(sc)

df = sqlContext.read.format('com.databricks.spark.csv').options(
    header='true', inferschema='true').load(
        '/Users/mohiulalamprince/work/python/cricstat/player-info-[0-9]*')
df.registerTempTable('players')
sqlContext.sql("select count(distinct Player) from players").show()
    print("------- ******* Task 1 ******* -------")
    columns = ['id', 'price', 'date', 'zip', 'type', 'new', 'duration', 'PAON',
               'SAON', 'street', 'locality', 'town', 'district', 'county', 'ppd',
               'status']

    rdd = sc.textFile(datadir + "prop-prices.csv")
    df = pyspark_csv.csvToDataFrame(sqlContext, rdd, columns=columns)
    df.registerTempTable("properties")
    df.persist()

    # Task 2: let's do some basic analysis on the data.
    # Find how many records we have per year, and print them out sorted by year.
    print("------- ******* Task 2 ******* -------")
    year_count = sqlContext.sql(
        """select   year(date) as year, count(*) as count
        from     properties
        group by year(date)
        order by year(date)""").collect()
    print(year_count)

    # Task 3: Everyone knows that properties in London are expensive.
    # Find the average property price by county,
    # and print the top 10 most expensive counties
    print("------- ******* Task 3 ******* -------")
    county_prices = sqlContext.sql(
        """select   county, avg(price) as avg_price
        from     properties
        group by county
        order by avg(price) desc
        limit    10""").collect()
    print(county_prices)
Example #45
from pyspark import SparkContext
from pyspark import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

if __name__ == '__main__':

    sc = SparkContext.getOrCreate()

    sqlContext = SQLContext(sc)

    spark = SparkSession.builder.getOrCreate()
    df = spark.read.csv(
        'linkToS3Bucket',
        inferSchema=True,
        header=True,
    )
    cleaningdf = df.dropna(thresh=3)

    cleaningdf = cleaningdf.fillna(0)

    cleaningdf.registerTempTable('df_table')

    sqlContext.sql('select * from df_table').show()

    cleaningdf.write.csv('linkToS3Bucket')
Example #46
    batch_no user_type supp_id
    '''
    user_login = parts.map(lambda p: (p[1].strip(), p[2].strip(),p[17].strip(),p[3].strip(),p[16].strip(),
                                  p[4].strip(),p[5].strip(),p[6].strip(),p[7].strip(),p[8].strip(),
                                  p[9].strip(),p[10].strip(),p[11].strip(),p[12].strip(),p[13].strip(),
                                  p[14].strip(),p[15].strip()))
    schema_string = "id gw_id supp_id user_id user_type " \
                   "user_name login_time logout_time mac ip " \
                   "user_agent download_flow upload_flow os browser " \
                   "ratio batch_no"

    fields = [StructField(field_name, StringType(), True) for field_name in schema_string.split(' ')]
    schema = StructType(fields)

    df = sql_context.createDataFrame(user_login, schema)
    df.registerTempTable("tb_user_login_info")

    #_sql="select distinct mac,gw_id,'%s' as day,'1' as flag from tb_user_login_info" % date
    _sql="select distinct user_name,gw_id,'%s' as day,'1' as flag from tb_user_login_info" % date
    rs = sql_context.sql(_sql)
    re_rdd = rs.map(lambda r:(r.user_name+"_"+r.gw_id,r.day+sep+r.flag))\
        .reduceByKey(lambda a,b:a,1)

    list =[]
    for t in re_rdd.collect():
        line = t[0]+sep+t[1]
        list.append(line)

    BaseService()._write_file(list, output)

    sc.stop()
Example #47
# callDf.toPandas().to_csv('time.csv', index=False)

# convert string to timestamp
# method 1
# callDf1 = callDf.withColumn("start_time_new", callDf['start_time'].cast(TimestampType()))

# convert string to timestamp
# method 2
# callDf_time = callDf.select('*', to_timestamp('start_time', 'HH:mm:ss').cast(TimestampType()).alias('new_start_time'))
# callDf_time = callDf_time.select('*', to_timestamp('end_time', 'HH:mm:ss').cast(TimestampType()).alias('new_end_time'))
# callDf_time.show()

# average number of calls per day
callNumber = sparksql.sql(
    "SELECT calling_nbr, round(COUNT(Call.calling_nbr)/29, 2) AS num FROM Call GROUP BY calling_nbr"
)
callNumber.toPandas().to_csv('../myresult/callNumber.csv', index=False)

# share of each operator's calls under different call types
# local calls
local = sparksql.sql(
    "SELECT called_optr, round(COUNT(Call.called_nbr) * 1.000/(select COUNT(Call.called_nbr) from Call WHERE call_type == 1) ,3) AS num FROM Call WHERE call_type == 1 GROUP BY called_optr"
)
local.toPandas().to_csv('../myresult/local.csv', index=False)

# long-distance calls
longDistance = sparksql.sql(
    "SELECT called_optr, round(COUNT(Call.called_nbr) * 1.000/(select COUNT(Call.called_nbr) from Call WHERE call_type == 2) ,3) AS num FROM Call WHERE call_type == 2 GROUP BY called_optr"
)
longDistance.toPandas().to_csv('../myresult/longDistance.csv', index=False)
Example #48
        {'uid': '3', 'adid': 'c','guuid':'aa','guuidctime':1,'url':'','referer':'','hosid':'132','gwid':'','ua':'','ip':'','createtime':1450823550766},
        {'uid': '4', 'adid': 'd','guuid':'bb','guuidctime':1,'url':'','referer':'','hosid':'133','gwid':'','ua':'','ip':'','createtime':1450823268766},
    ]).registerAsTable("adload")
    _adPlayDF=sqlContext.createDataFrame([
        {'uid': '1', 'adid': 'a','guuid':'aa','createtime':1450823568766},
        {'uid': '2', 'adid': 'b','guuid':'aa','createtime':1450823569766},
        {'uid': '4', 'adid': 'd','guuid':'bb','createtime':1450823268766},
    ]).registerAsTable("adplay")
    _adClickDF =sqlContext.createDataFrame([
        {'uid': '1', 'adid': 'a','guuid':'aa','createtime':1450823580766},
    ]).registerAsTable("adclick")
    '''
    sqlContext.registerFunction("dateformat", lambda x:longTime2str(x),StringType())

    adLoadDf=sqlContext.sql('select hosid,dateformat(createtime) day,adid,count(guuid) pv,count(distinct guuid) uv '
                            'from adload where createtime is not null and dateformat(createtime)=%s '
                            'group by adid,hosid,dateformat(createtime)' % (lastdate)).registerAsTable("radload")

    adPlayDf=sqlContext.sql('select gh.hosid,dateformat(ap.createtime) day,adid,count(ap.guuid) pv,count(distinct ap.guuid) uv '
                            'from adplay ap left join ghid gh on ap.guuid=gh.guuid where dateformat(ap.createtime)=%s '
                            'group by ap.adid,gh.hosid,dateformat(ap.createtime)' % (lastdate)).registerAsTable("radplay")

    # sqlContext.sql('select sum(pv) from radplay').foreach(printx)
    adClick=sqlContext.sql('select gh.hosid,dateformat(ac.createtime) day,ac.adid,count(ac.guuid) pv,count(distinct ac.guuid) uv '
                            'from adclick ac left join ghid gh on ac.guuid=gh.guuid where dateformat(ac.createtime)=%s '
                            'group by ac.adid,gh.hosid,dateformat(ac.createtime)' % (lastdate)).registerAsTable("radclick")



    _df=sqlContext.sql('select A.hosid,A.day,A.adid,A.pv,A.uv,B.pv,B.uv,C.pv,C.uv from radload A '
                       'left join radplay B on A.hosid=B.hosid and A.adid=B.adid '
            payload['data']=[]
            if(len(pub_data[key]['data'])>batch_size):
                total_len=len(pub_data[key]['data'])
                # step through the rows in batch_size chunks; the slice bound
                # handles the final partial chunk automatically
                for index in range(0,total_len,batch_size):
                    payload['data']=pub_data[key]['data'][index:index+batch_size]
                    pushToRedis(payload)
            else:
                payload['data']=pub_data[key]['data']
                pushToRedis(payload)
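            # A reusable sketch of the same chunking idea (hypothetical helper,
            # left as a comment so it does not change this job):
            #
            # def iter_batches(rows, batch_size):
            #     for start in range(0, len(rows), batch_size):
            #         yield rows[start:start + batch_size]
            #
            # for chunk in iter_batches(pub_data[key]['data'], batch_size):
            #     payload['data'] = chunk
            #     pushToRedis(payload)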
                    
    # creating publisher-related data from the daily data
    pub_dfp_df=sqlContext.sql('select publisher_id,id5 as user_identifier, collect_list(device_finger_print) as dfps , collect_list(c_dfp) as dfp_counts from (select publisher_id ,id5, device_finger_print ,count(device_finger_print) as c_dfp  from testtable where id5!="" group by publisher_id,id5,device_finger_print)  group by publisher_id,id5')
    pub_ip_df=sqlContext.sql('select publisher_id,id5 as user_identifier, collect_list(ip) as ips , collect_list(c_ip) as ip_counts from (select publisher_id ,id5, ip ,count(ip) as c_ip  from testtable where id5!="" group by publisher_id,id5,ip) group by publisher_id,id5')
    pub_session_df=sqlContext.sql('select publisher_id,id5 as user_identifier, collect_list(sessionId) as sessions , collect_list(c_session) as session_counts from (select publisher_id ,id5, sessionId ,count(sessionId) as c_session  from testtable where id5!="" group by publisher_id,id5,sessionId) group by publisher_id,id5')
    pub_df=pub_dfp_df.join(pub_ip_df,['publisher_id','user_identifier'],'outer')
    pub_df=pub_df.join(pub_session_df,['publisher_id','user_identifier'],'outer')
    pub_daily_df=sqlContext.read.json(daily_fetch_path)

    if(len(pub_daily_df.columns)==0):
        for dtype in pub_df.dtypes:
            pub_daily_df=pub_daily_df.withColumn(dtype[0],lit(None).cast(dtype[1]))
            
    pub_daily_df=pub_daily_df.select(pub_df.columns)
    pub_df=pub_df.union(pub_daily_df)


    pub_daily_df=pub_daily_df.select(pub_df.columns)
예제 #50
0
import os
import sys

from pyspark.sql import SQLContext
from pyspark import SparkContext

#os.environ["SPARK_HOME"] = "/opt/spark-1.6.1-bin-hadoop2.6"
#os.environ["HADOOP_HOME"] = "/opt/hadoop"
#os.environ["HADOOP_PREFIX"] = "/opt/hadoop"

#os.environ["HIVE_HOME"] = "/opt/hive"


sc = SparkContext('local[1]')
sql_context = SQLContext(sc)
sql_context.setConf( "spark.sql.shuffle.partitions", "1")
sql_context.sql(""" use fex_test """)
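# Note (hedged): with a plain SQLContext a statement such as "use fex_test"
# generally only works against a Hive metastore. A minimal sketch of the
# Hive-enabled variant for this Spark 1.6-era setup, assuming the database
# fex_test exists:
#
# from pyspark.sql import HiveContext
#
# hive_context = HiveContext(sc)
# hive_context.sql("use fex_test")
# hive_context.sql("show tables").show()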
예제 #51
0
class DataHandler:
    def __init__(self):
        self.conf = (SparkConf()
                     .setAppName("BandCard")
                     .set("spark.cores.max", "2")
                     .set('spark.executor.extraClassPath', '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar'))
        self.sc = SparkContext(conf=self.conf)
        self.sqlctx = SQLContext(self.sc)

        self.mysql_helper = MySQLHelper('core', host='10.9.29.212')

    def load_from_mysql(self, table, database='core'):
        url = "jdbc:mysql://10.9.29.212:3306/%s?user=root&characterEncoding=UTF-8" % database
        df = self.sqlctx.read.format("jdbc").options(url=url, dbtable=table, driver="com.mysql.jdbc.Driver").load()
        return df
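
    def save_to_mysql(self, df, table, database='core', mode='append'):
        # Hypothetical companion to load_from_mysql (not part of the original
        # class): a minimal sketch that writes a DataFrame back over JDBC.
        url = "jdbc:mysql://10.9.29.212:3306/%s?characterEncoding=UTF-8" % database
        df.write.jdbc(url=url, table=table, mode=mode,
                      properties={'user': 'root', 'driver': 'com.mysql.jdbc.Driver'})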




    def prepare_life_cycle(self, year, season):
        '''
        Prepare the life-cycle data:
        fetch the daily AUM figures from t_CMMS_ASSLIB_ASSET and derive
        the fields below.

        saum1 (last season sum aum)
        saum2 (current season sum aum)
        aum_now
        account_age (months)
        last_tr_date (days)

        :param year:
        :param season: 1,2,3,4
        :return:
        '''

        # work out the months of the current and previous season
        print('----------------------生命周期-Start----------------------')

        print('开始准备生命周期数据...')
        print('开始计算月份')

        if season == 1:
            # date1: months of the current season
            date1 = [str(year) + '-01', str(year) + '-02', str(year) + '-03']

            # date2: months of the previous season
            date2 = [str(year - 1) + '-10', str(year - 1) + '-11', str(year - 1) + '-12']

        elif season == 4:
            date1 = [str(year) + '-10', str(year) + '-11', str(year) + '-12']
            date2 = [str(year) + '-07', str(year) + '-08', str(year) + '-09']

        else:
            date1 = [str(year) + '-0' + str(3 * season - 2), str(year) + '-0' + str(3 * season - 1),
                     str(year) + '-0' + str(3 * season)]
            date2 = [str(year) + '-0' + str(3 * season - 5), str(year) + '-0' + str(3 * season - 4),
                     str(year) + '-0' + str(3 * season - 3)]

        print('当前季度月份 new:', date1)
        print('上一季度月份 old:', date2)
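        # A compact sketch of the same month arithmetic (hypothetical helper,
        # left as a comment), using zero-padded formatting throughout:
        #
        # def quarter_months(y, s):
        #     return ['%d-%02d' % (y, 3 * s - i) for i in (2, 1, 0)]
        #
        # date1 = quarter_months(year, season)
        # date2 = quarter_months(year - 1, 4) if season == 1 else quarter_months(year, season - 1)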

        # load the AUM table
        aum = self.load_from_mysql('t_CMMS_ASSLIB_ASSET_c').cache()

        # stitch together the three monthly snapshots of each season
        season_new = aum.filter(aum.STAT_DAT == date1[0]).unionAll(aum.filter(aum.STAT_DAT == date1[1])).unionAll(
            aum.filter(aum.STAT_DAT == date1[2]))
        season_old = aum.filter(aum.STAT_DAT == date2[0]).unionAll(aum.filter(aum.STAT_DAT == date2[1])).unionAll(
            aum.filter(aum.STAT_DAT == date2[2]))

        # aggregate AUM for each season
        aum_season_old = season_old.select('CUST_NO', season_old.AUM.alias('AUM1')).groupBy('CUST_NO').sum('AUM1')
        aum_season_new = season_new.select('CUST_NO', season_new.AUM.alias('AUM2')).groupBy('CUST_NO').sum('AUM2')

        # outer-join the two seasons
        '''
        +-----------+---------+---------+
        |    CUST_NO|sum(AUM2)|sum(AUM1)|
        +-----------+---------+---------+
        |81005329523|     null|294844.59|
        |81011793167|     null|   365.20|
        |81015319088|     null|  9640.96|
        +-----------+---------+---------+
        '''
        union_season = aum_season_old.join(aum_season_new, 'CUST_NO', 'outer')

        # pick out the current AUM (latest STAT_DAT per customer)
        temp_result = aum.select('CUST_NO', 'AUM', 'STAT_DAT').groupBy('CUST_NO', 'STAT_DAT').sum('AUM').sort(
            'CUST_NO').sort(aum.STAT_DAT.desc())
        temp_result.select('CUST_NO', temp_result['sum(AUM)'].alias('AUM'), 'STAT_DAT').registerTempTable('group_in')

        aum_now_sql = "select CUST_NO,first(AUM) as AUM_NOW from group_in group by CUST_NO"

        aum_now = self.sqlctx.sql(aum_now_sql)
        # drop the temporary table
        self.sqlctx.dropTempTable('group_in')
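        # Alternative sketch (assumes window-function support, i.e. Spark 2.x
        # or a HiveContext on 1.x): latest AUM per customer without a temp table.
        #
        # from pyspark.sql import Window
        # from pyspark.sql import functions as F
        #
        # w = Window.partitionBy('CUST_NO').orderBy(F.desc('STAT_DAT'))
        # aum_now = (temp_result
        #            .withColumn('rn', F.row_number().over(w))
        #            .filter('rn = 1')
        #            .select('CUST_NO', F.col('sum(AUM)').alias('AUM_NOW')))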

        # join with the seasonal AUM
        union_season_aumnow = union_season.join(aum_now, 'CUST_NO', 'outer')

        # months since the customer opened the account
        # load the account table
        account = self.load_from_mysql('t_CMMS_ACCOUNT_LIST').cache()
        account.select('CUST_NO', 'OPEN_DAT').registerTempTable('account')
        account_age_aql = "select  CUST_NO, first(ACCOUNT_AGE) as ACCOUNT_AGE  from " \
                          "(select CUST_NO, round(datediff(now(), OPEN_DAT) / 30) as ACCOUNT_AGE " \
                          "from account order by CUST_NO, ACCOUNT_AGE desc ) as t group by CUST_NO"

        account_age = self.sqlctx.sql(account_age_aql)

        # calculate last tran date
        account_1 = account.select('CUST_NO', 'ACC_NO15')
        detail = self.load_from_mysql('t_CMMS_ACCOUNT_DETAIL').select('ACC_NO15', 'TRAN_DAT')
        a_d = account_1.join(detail, 'ACC_NO15', 'outer')
        a_d.filter(a_d.CUST_NO != '').registerTempTable('adtable')

        last_tr_date_sql = "select CUST_NO,first(TRAN_DAT) as LAST_TR_DATE from (select CUST_NO,TRAN_DAT from adtable order by TRAN_DAT desc) as t group by CUST_NO"

        last_tr_date = self.sqlctx.sql(last_tr_date_sql)

        # join the seasonal AUM, aum_now, account_age and last_tr_date
        unions = union_season_aumnow.join(account_age, 'CUST_NO', 'outer').join(last_tr_date, 'CUST_NO', 'outer')

        # drop the temporary tables
        self.sqlctx.dropTempTable('account')
        self.sqlctx.dropTempTable('adtable')
        self.sqlctx.clearCache()

        # insert the results into the staging table
        print('结果插入临时表:t_CMMS_TEMP_LIFECYCLE...')
        insert_lifecycle_sql = "replace into t_CMMS_TEMP_LIFECYCLE(CUST_NO,SAUM1,SAUM2,INCREASE,ACCOUNT_AGE,AUM_NOW,LAST_TR_DATE) values(%s,%s,%s,%s,%s,%s,%s)"

        # write buffer
        temp = []
        for row in unions.collect():
            row_dic = row.asDict()

            if len(temp) >= 1000:  # flush the buffer to the database in batches
                self.mysql_helper.executemany(insert_lifecycle_sql, temp)
                temp.clear()

            # stage the row in the buffer

            try:
                # growth rate of AUM between the two seasons
                increase = (row_dic['sum(AUM2)'] - row_dic['sum(AUM1)']) / row_dic['sum(AUM1)']
            except Exception:
                increase = 0

            # account age in months; if missing, treat as more than 6 months
            if row_dic['ACCOUNT_AGE'] is None:
                row_dic['ACCOUNT_AGE'] = 7

            # date of the last transaction
            ltd = row_dic['LAST_TR_DATE']
            if ltd is not None:
                try:
                    ltd = datetime.datetime.strptime(ltd, '%Y-%m-%d')
                except Exception:
                    ltd = ltd[:4] + '-' + ltd[4:6] + '-' + ltd[6:]
                    ltd = datetime.datetime.strptime(ltd, '%Y-%m-%d')

                days = (datetime.datetime.now() - ltd).days
            else:
                days = 366

            temp.append((row_dic['CUST_NO'], row_dic['sum(AUM1)'], row_dic['sum(AUM2)'], increase,
                         row_dic['ACCOUNT_AGE'], row_dic['AUM_NOW'], days))

        if len(temp) != 0:
            self.mysql_helper.executemany(insert_lifecycle_sql, temp)
            temp.clear()








    def calculate_life_cycle(self):

        '''
        Calculate the life-cycle stage from the change in AUM
        :return:
        '''

        print('开始计算生命周期...')
        life_cycle = self.load_from_mysql('t_CMMS_TEMP_LIFECYCLE').cache()

        def clcmap(line):
            cust_no = line['CUST_NO']
            account_age = line['ACCOUNT_AGE']
            last_tr_date = line['LAST_TR_DATE']
            aum_now = line['AUM_NOW']
            increase = line['INCREASE']

            period = 0
            if aum_now is None:
                period = 9  # unknown
            elif aum_now < 1000 and last_tr_date > 365:
                period = 3  # attrition / churn stage
            else:
                if increase > 20 or account_age < 6:
                    period = 0  # growth stage
                elif increase >= -20 and increase <= 20:
                    period = 1  # mature stage
                else:
                    period = 2  # stable stage

            return period, cust_no


        map_result = life_cycle.map(clcmap).collect()

        # clear the life_cycle cache
        self.sqlctx.clearCache()

        temp = []
        print('结果更新到临时表:t_CMMS_TEMP_LIFECYCLE...')
        update_life_period_sql = "update t_CMMS_TEMP_LIFECYCLE set PERIOD = %s where CUST_NO = %s"
        for row in map_result:

            if len(temp) >= 1000:
                self.mysql_helper.executemany(update_life_period_sql, temp)
                temp.clear()
            temp.append(row)

        if len(temp) != 0:  # flush any remaining rows
            self.mysql_helper.executemany(update_life_period_sql, temp)
            temp.clear()







    def lifecycle_to_real_table(self, year, season):
        '''
        Write the life-cycle data from the temporary table into the real table
        :return:
        '''

        print('开始将生命周期数据写入正式表中...')
        life_cycle = self.load_from_mysql('t_CMMS_TEMP_LIFECYCLE').select('CUST_NO', 'PERIOD')
        cust_info = self.load_from_mysql('t_CMMS_INFO_CUSTOMER').select('CUST_NO', 'CUST_ID', 'CUST_NAM')

        union = life_cycle.join(cust_info, 'CUST_NO', 'left_outer').cache()

        temp = []
        sql = "replace into t_CMMS_ANALYSE_LIFE(CUST_NO,CUST_ID,CUST_NM,LIFE_CYC,QUARTER,UPDATE_TIME) values(%s,%s,%s,%s,%s,now())"
        quarter = str(year) + '-' + str(season)
        for row in union.collect():

            if len(temp) >= 1000:
                self.mysql_helper.executemany(sql, temp)
                temp.clear()

            cust_id = row['CUST_ID'] if row['CUST_ID'] is not None else '0'
            temp.append((row['CUST_NO'], cust_id, row['CUST_NAM'], row['PERIOD'], quarter))

        if len(temp) != 0:  # flush any remaining rows
            self.mysql_helper.executemany(sql, temp)
            temp.clear()

        self.sqlctx.clearCache()





    def run_life_cycle(self,year,season):
        '''
        Run the full life-cycle pipeline:
        1. prepare the life-cycle data and compute AUM and its rate of change
        2. derive the life-cycle stage from that change
        3. move the data from the staging table into the real table
        :param year:
        :param season:
        :return:
        '''
        self.prepare_life_cycle(year,season)
        self.calculate_life_cycle()
        self.lifecycle_to_real_table(year,season)




#------------------------------------------------------------------------ end of life-cycle section ------------------------------------------------------------------------#














    def customer_value(self, year, half_year):
        '''
        Calculate customer value
        :param year: which year to calculate
        :param half_year: 0 for month 1-6 , 1 for month 7-12
        :return:
        '''

        print('---------------------------客户价值-Start--------------------------')
        cust_info = self.load_from_mysql('t_CMMS_INFO_CUSTOMER').select('CUST_NO', 'CUST_ID', 'CUST_NAM').cache()
        aum = self.load_from_mysql('t_CMMS_ASSLIB_ASSET_c').select('CUST_NO', 'STAT_DAT', 'AUM', 'ASS_TYPE').cache()

        base = half_year * 6

        aum_slot_filter = None

        for i in range(1, 7):
            i = base + i
            if i < 10:
                i = '0' + str(i)
            else:
                i = str(i)
            slot = str(year) + '-' + i

            slot_filter = aum.filter(aum.STAT_DAT == slot)
            if aum_slot_filter is None:
                aum_slot_filter = slot_filter
            else:
                aum_slot_filter = aum_slot_filter.unionAll(slot_filter)

        # CUST_NO sum(AUM)
        huoqi_aum = aum_slot_filter.select('CUST_NO', 'ASS_TYPE', aum_slot_filter['AUM'].alias('AUM_HQ')).filter(
            aum_slot_filter.ASS_TYPE == '1').groupBy('CUST_NO').sum('AUM_HQ')
        dingqi_aum = aum_slot_filter.select('CUST_NO', 'ASS_TYPE', (aum_slot_filter.AUM * 0.8).alias('AUM_DQ')).filter(
            aum_slot_filter.ASS_TYPE == '2').groupBy('CUST_NO').sum('AUM_DQ')

        # demand and time deposits are now aggregated: sum(AUM_HQ), sum(AUM_DQ)
        j = huoqi_aum.join(dingqi_aum, 'CUST_NO', 'outer')
        # j.show()


        # wipe the existing data
        self.mysql_helper.execute('truncate core.t_CMMS_ANALYSE_VALUE')

        # join with the customer info table
        all_col = j.join(cust_info, 'CUST_NO', 'outer')

        print(j.count(), cust_info.count())

        # all_col.show()

        # map customer value to a customer rank
        def calculate_rank(value):
            if value < 1000:
                return 0
            elif value < 10000:
                return 1
            elif value < 100000:
                return 2
            elif value < 500000:
                return 3
            elif value < 2000000:
                return 4
            elif value < 5000000:
                return 5
            else:
                return 6
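        # Equivalent table-driven sketch (optional): the same thresholds
        # expressed with bisect, which keeps the boundaries in one place.
        #
        # import bisect
        # RANK_BOUNDS = [1000, 10000, 100000, 500000, 2000000, 5000000]
        # def calculate_rank(value):
        #     return bisect.bisect_right(RANK_BOUNDS, value)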

        temp = []
        print('将数据replace到正式表...')
        update_value_sql = "replace into t_CMMS_ANALYSE_VALUE(CUST_ID,CUST_NO,CUST_NM,CUST_VALUE,CUST_RANK,SLOT,UPDATE_TIME) values(%s,%s,%s,%s,%s,%s,now())"
        for row in all_col.collect():

            if len(temp) >= 1000:
                self.mysql_helper.executemany(update_value_sql, temp)
                temp.clear()

            val_dq = row['sum(AUM_DQ)'] if row['sum(AUM_DQ)'] is not None else 0
            val_hq = row['sum(AUM_HQ)'] if row['sum(AUM_HQ)'] is not None else 0

            cust_val = float(val_dq) + float(val_hq)

            cust_rank = calculate_rank(cust_val)

            slot = str(year) + '-' + str(half_year)
            cust_id = row['CUST_ID'] if row['CUST_ID'] is not None else 1
            temp.append((cust_id, row['CUST_NO'], row['CUST_NAM'], cust_val, cust_rank, slot))

        if len(temp) != 0:  # flush any remaining rows
            self.mysql_helper.executemany(update_value_sql, temp)
            temp.clear()





    def aum_total(self):
        '''
        Compute total AUM
        data for t_CMMS_ASSLIB_ASSTOT
        :return:
        '''
        print('---------------------------总资产-Start--------------------------')
        # TODO: switch t_CMMS_ASSLIB_ASSET_c to the production table t_CMMS_ASSLIB_ASSET
        df_asset = self.load_from_mysql('t_CMMS_ASSLIB_ASSET_c').select('CUST_NO', 'CUST_ID', 'STAT_DAT', 'AUM', 'CUR',
                                                                      'ACC_NAM').cache()
        # print(df_asset.count(), df_asset.columns)

        other_col = df_asset.select('CUST_NO', 'CUST_ID', 'CUR', 'ACC_NAM').distinct()
        # print(other_col.count(),other_col.columns)

        aum = df_asset.select('CUST_NO', 'STAT_DAT', 'AUM')
        # print(aum.count(), aum.columns)

        aum = aum.select('CUST_NO', 'STAT_DAT', 'AUM').groupBy(['CUST_NO', 'STAT_DAT']).sum('AUM').sort(
            ['CUST_NO', aum.STAT_DAT.desc()]) \
            .groupBy('CUST_NO').agg({'sum(AUM)': 'first', 'STAT_DAT': 'first'})
        # print(aum.count(), aum.columns)


        total = aum.select('CUST_NO', aum['first(sum(AUM))'].alias('AUM'), aum['first(STAT_DAT)'].alias('STAT_DAT')). \
            join(other_col, 'CUST_NO', 'left_outer').distinct()

        # total.filter(total.STAT_DAT == '2016-06-') .show()

        # prepare params
        def list_map(line):
            return line['CUST_ID'], line['CUST_NO'], line['ACC_NAM'], line['STAT_DAT'], line['CUR'], line['AUM']

        df = total.map(list_map)

        # clear old data
        self.mysql_helper.execute('truncate t_CMMS_ASSLIB_ASSTOT')
        sql = "insert into t_CMMS_ASSLIB_ASSTOT(CUST_ID,CUST_NO,ACC_NAM,STAT_DAT,CUR,AUM) values(%s,%s,%s,%s,%s,%s)"

        # execute sql
        self.mysql_helper.batch_operate(sql, df, 100)

    def debt_total(self):
        '''
        prepare data for total debt
        :return:
        '''

        print('---------------------------总负债-Start--------------------------')
        df_debt = self.load_from_mysql('t_CMMS_ASSLIB_DEBT').select('LOAN_ACC', 'CUST_NO', 'CUST_ID', 'CUST_NAM',
                                                                    'BAL_AMT', 'GRANT_AMT', 'CUR')
        df_debt = df_debt.filter(df_debt.LOAN_ACC != '')

        df_sum = df_debt.groupBy('CUST_NO').sum('GRANT_AMT', 'BAL_AMT')
        df_other = df_debt.groupBy('CUST_NO').agg({'CUST_ID': 'first', 'CUST_NAM': 'first', 'CUR': 'first'})

        df_total = df_sum.join(df_other, 'CUST_NO', 'left_outer').distinct()

        stat_dat = datetime.datetime.now().strftime('%Y%m%d')

        def m(line):
            return line['CUST_NO'], line['first(CUST_ID)'], line['first(CUST_NAM)'], line['first(CUR)'], line[
                'sum(GRANT_AMT)'], line['sum(BAL_AMT)'], stat_dat

        df = df_total.map(m)

        sql = "replace into t_CMMS_ASSLIB_DEBTOT(CUST_NO,CUST_ID,ACC_NAM,CUR,LOAN_AMT,BAL_AMT,STAT_DAT) values(%s,%s,%s,%s,%s,%s,%s)"

        self.mysql_helper.batch_operate(sql, df)




    def run(self):
        # life cycle: year, season (1-4)
        self.run_life_cycle(2016, 2)

        # customer value: 0 = first half of the year, 1 = second half
        self.customer_value(2016, 0)

        # total assets
        self.aum_total()

        # total liabilities
        self.debt_total()
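
# A hedged usage sketch (the original snippet does not show the driver code):
# instantiate DataHandler and run the whole pipeline.
if __name__ == '__main__':
    dh = DataHandler()
    dh.run()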
예제 #52
0
    info = lines.map(lambda lines: lines.split("----")). \
        map(lambda info: Row(email=info[0], username=info[1], realname=info[2],
                             idcard=info[3], password=info[4], phone=info[5]))

    schemaInfo = sqlContext.createDataFrame(info)
    schemaInfo.registerTempTable("information")
    # optionally cache the table
    #sqlContext.cacheTable("information")
    #sqlContext.uncacheTable("information")

    """
    : email analysis and statistics
    """
    email_str = "SELECT analysis_email(email) AS email FROM information"
    emailSQL = sqlContext.sql(email_str)
    # total number of records
    count = emailSQL.count()
    # counts grouped by email category
    emailCollect = emailSQL.groupBy("email").count().collect()
    # report the email analysis results
    result_email(emailCollect, count)

    """
    : username and real-name analysis and statistics
    """
    # username length distribution
    username_len_str = "SELECT LENGTH(username) AS username_len FROM information"
    usernameSQL = sqlContext.sql(username_len_str)
    usernameLenCollect = usernameSQL.groupBy("username_len").count().collect()
    result_username_len(usernameLenCollect, count)
예제 #53
0
from pyspark import SparkConf, SparkContext, SQLContext
import sys

inputfile = sys.argv[2]
outputfile = sys.argv[3]

masterip = sys.argv[1]
conf = SparkConf().setMaster(masterip).setAppName("TableData")
sc = SQLContext(SparkContext(conf = conf))

data = sc.read.csv(inputfile, header='true')
data.createGlobalTempView('Table')

sc.sql('''SELECT * FROM global_temp.Table ORDER BY cca2, timestamp''').coalesce(1).write.csv(outputfile)
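
# Hedged variant: a session-scoped view is usually enough here and avoids the
# global_temp prefix (Spark 2.x, the same requirement as createGlobalTempView):
#
# data.createOrReplaceTempView('TableView')
# sc.sql('SELECT * FROM TableView ORDER BY cca2, timestamp').coalesce(1).write.csv(outputfile)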
        StructField('suppid', StringType(), True),
        StructField('logtime', LongType(), True),
        StructField('usermac', StringType(), True)
    ]
    schema = StructType(fields)

    rdd1 = rdd.map(convert_logtype).filter(lambda tup: tup != None)
    # rdd1.foreach(printx)
    # sc.stop()

    ret_df = sqlContext.createDataFrame(rdd1, schema)
    ret_df.registerTempTable("loginflowlog_overall")
    _sql = "SELECT count(usermac) pv,count(distinct usermac) uv,logtype " \
           "from loginflowlog_overall " \
           "group by logtype"
    rs_df = sqlContext.sql(_sql)

    service = LoginflowlogMysqlService()
    ret_overall_list = service.getRetOverall(rs_df.collect(), day)
    _sql_delete = "delete from login_flow_global_count where date ='%s'" % day
    _sql_insert = "insert into login_flow_global_count(date," \
                  "prelogin_num,prelogin_pnum,login_num,login_pnum," \
                  "login_click_num,login_click_pnum,forward_num,forward_pnum," \
                  "preArrive_num,preArrive_pnum,arrive_num,arrive_pnum) " \
                  "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    service.write_mysql(ret_overall_list, _sql_delete, _sql_insert)
    logger.info(len(ret_overall_list))

    # detail
    rdd2 = rdd.map(convert_list).filter(lambda tup: tup != None) \
        .map(convert_copy_logtype).values().flatMap(list)
예제 #55
0
if __name__ == "__main__":
    conf = SparkConf().setMaster("local[2]").setAppName("sql_udf")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    json_path = os.path.abspath("../doc/book.json")

    # read the JSON file and register it as a table
    json_df = sqlContext.read.json(json_path)
    json_df.registerTempTable("json_book")

    # register a user-defined function (UDF)
    sqlContext.registerFunction("name_place", name_place)

    evalRDD = sqlContext.sql("SELECT name_place(name, place, price,evaluation) AS book_eval FROM json_book")

    #bookMap = lengthRDD.map(lambda books: (books.name, books.author, books.price, books.publish, books.place))

    evalRDD.show()

    # map the query results
    bookMap = evalRDD.map(lambda books: (books.book_eval))

    general_list = []
    good_list = []

    for book in bookMap.collect():
        book = book.encode("utf-8").split(',')
        print book[0], book[1]
        if book[1] == "general":
예제 #56
0
def main():
    conf = SparkConf().setAppName('housingprice')
    sc = SparkContext(conf=conf)

    sqlContext = SQLContext(sc)
    taxreportSchema = StructType([
        StructField('PID', StringType(), False),
        StructField('Legal_Type', StringType(), False),
        StructField('FOLIO', StringType(), False),
        StructField('Coordinates', StringType(), True),
        StructField('ZoneName', StringType(), True),
        StructField('ZoneCat', StringType(), True),
        StructField('LOT', StringType(), True),
        StructField('Block', StringType(), True),
        StructField('plan', StringType(), True),
        StructField('DisLot', StringType(), True),
        StructField('FCiviNum', StringType(), True),
        StructField('TCiviNum', StringType(), True),
        StructField('StreetName', StringType(), True),
        StructField('PostalCode', StringType(), True),
        StructField('NLegalName1', StringType(), True),
        StructField('NLegalName2', StringType(), True),
        StructField('NLegalName3', StringType(), True),
        StructField('NLegalName4', StringType(), True),
        StructField('NLegalName5', StringType(), True),
        StructField('CurVal', StringType(), True),
        StructField('CurImpVal', StringType(), True),
        StructField('Taxassess', StringType(), True),
        StructField('prevVal', StringType(), True),
        StructField('prevImpVal', StringType(), True),
        StructField('YearBuilt', StringType(), True),
        StructField('BigImpYear', StringType(), True),
        StructField('Tax_levy', StringType(), True),
        StructField('NeighbourhoodCode', StringType(), True),
    ])
    conversionSchema = StructType([
        StructField('date', StringType(), False),
        StructField('USD', StringType(), False),
        StructField('rate', StringType(), False),
        StructField('reciprate', StringType(), False),
    ])
    crudeoilSchema = StructType([
        StructField('date', DateType(), False),
        StructField('oilprice', StringType(), False),
    ])
    def fixdate(convVal):
        a = convVal.split(" ")
        dates = a[0].split("/")
        alldate = "20"+dates[2]+'/'+dates[0]
        return (alldate,a[1])
    def filterYear(dates):
        # drop rows from 2016 (dates are formatted as 'YYYY/M' by fixdate)
        a = dates.split('/')
        if (a[0]=='2016'):
            return False
        else:
            return True
    def processDate(df):
        def splitMonth(cols):
            a = cols.split('/')
            return a[1]

        def splitYear(cols):
            a = cols.split('/')
            return a[0]

        fUDF = udf(splitMonth, StringType())
        df1 =  df.withColumn("month", fUDF('year'))
        fUDFyear = udf(splitYear, StringType())
        return df1.withColumn("year", fUDFyear('year'))
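    # Equivalent sketch without Python UDFs (hypothetical alternative): the
    # built-in split function stays inside the JVM and is usually faster.
    #
    # from pyspark.sql.functions import split
    # parts = split(df['year'], '/')
    # df2 = df.withColumn('month', parts.getItem(1)).withColumn('year', parts.getItem(0))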
    #Reading the Tax Report Dataset
    taxreportinfo = sqlContext.read.format('com.databricks.spark.csv').options(header='true').schema(taxreportSchema).load(inputs+"taxreport/test")
    taxreportinfo.registerTempTable("taxreport")
    #Selecting the price,TaxAssessment Year and Postalcode of each property
    propertyVal = sqlContext.sql("SELECT CurVal, Taxassess, PostalCode FROM taxreport")
    propertyVal.registerTempTable("propertyVal")
    #Reading the CAN to USD conversion dataset
    conversion = sqlContext.read.format('com.databricks.spark.csv').options(header='true').schema(conversionSchema).load(inputs+"conversion")
    conversion.registerTempTable("Conversion")
    #Selecting only the date and rate
    conversionrate = sqlContext.sql("SELECT date,rate FROM Conversion WHERE rate regexp '^[0-9]+'")
    conversionRDD = conversionrate.repartition(40).rdd.map(lambda w: (w.date+" "+w.rate))
    conversiondates = conversionRDD.map(fixdate).filter(lambda (w,x):filterYear(w)).map(lambda l: Row(date=l[0], rate=l[1]))
    schemaConv = sqlContext.inferSchema(conversiondates)
    schemaConv.registerTempTable("ConversionDate")
    ConverDF = sqlContext.sql(" SELECT date,CAST(AVG(rate) AS DECIMAL(4,2)) as conversionrate FROM ConversionDate WHERE rate IS NOT NULL GROUP BY date")
    ConverDF.cache()
    #Reading the Canada Crude oil price dataset
    crudeoil = sc.textFile(inputs+"crudeoil")
    crudeoilRDD = crudeoil.map(lambda l: l.split()).map(lambda l: Row(date=l[0], oilprice=l[1]))
    crudeoilDF = sqlContext.inferSchema(crudeoilRDD)
    crudeoilDF.registerTempTable("crudeoil")
    #Selecting the date on M/Y format and oilprice
    oilprice = sqlContext.sql("SELECT DATE_FORMAT(date,'Y/M') as date,oilprice FROM crudeoil")
    oilprice.registerTempTable('oilprice')
    #Reading the interestrate of BC Dataset
    interestRate = sqlContext.read.format('com.databricks.spark.csv').options(header='true').load(inputs+"interestrate")
    interestRate.registerTempTable("interest")
    #Selecting the date and 5-year fixed mortgage price from the dataset
    interestDF = sqlContext.sql("SELECT DATE_FORMAT(date,'Y/M') as date,CAST(`5y-fixed-posted` AS DECIMAL(4,2)) AS interestrate FROM interest WHERE date >='2006-01' AND date <= '2015-12'")
    interestDF.registerTempTable("allrates")
    #Getting the average of each month on days whose value is not null.
    avgInterest = sqlContext.sql(" SELECT date,AVG(interestrate) as interestrates FROM allrates WHERE interestrate IS NOT NULL GROUP BY date")
    avgInterest.cache()
    joinedTable = avgInterest.join(oilprice,(avgInterest['date']==oilprice['date'])).select(avgInterest['date'],avgInterest['interestrates'],oilprice['oilprice'])
    JoinedConversion = joinedTable.join(ConverDF,(joinedTable['date']==ConverDF['date'])).select(joinedTable['date'].alias('year'),joinedTable['interestrates'],joinedTable['oilprice'],ConverDF['conversionrate'])
    JoinedConversion.registerTempTable("joinedConversion")
    ls = processDate(JoinedConversion)
    ls.show()