Example No. 1
import os

from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.types import (StructType, StructField, IntegerType,
                               FloatType, LongType)


class MainApp(object):
    def __init__(self):
        pass
    
    def init(self):
        os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
        # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
        # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
        conf = SparkConf()
        conf.setMaster("local[10]")
        conf.setAppName("PySparkShell")
        conf.set("spark.executor.memory", "2g")
        conf.set("spark.driver.memory", "1g")
        self.sc = SparkContext(conf=conf)
        self.sqlContext = SQLContext(self.sc)        

    def loadData(self):
        # parse userid, movieid, rating, timestamp (split each line only once)
        category_list = self.sc.textFile(
            "/Users/abhinavrungta/Desktop/uf-study/snc/github/SNC-WEB/src/yahoo/ydata-ymovies-user-movie-ratings-train-v1_0.txt"
        ).map(lambda line: line.split(',')).map(
            lambda f: (int(f[0]), int(f[1]), float(f[2]), int(f[3])))
        category_schema = StructType([
            StructField("userid", IntegerType(), True),
            StructField("movieid", IntegerType(), True),
            StructField("rating", FloatType(), True),
            StructField("time", LongType(), True)
        ])
        category_list = self.sqlContext.createDataFrame(category_list, category_schema)
        category_list.registerTempTable("data")
        movie_list = self.sqlContext.sql("SELECT movieid, COUNT(movieid) AS ct FROM data GROUP BY movieid")
        movie_list.registerTempTable("movie")
        movieid = movie_list.sort(movie_list.ct.desc()).first().movieid
        # movieid = category_list.first().movieid
        category_list = self.sqlContext.sql("SELECT * FROM data WHERE movieid = {0}".format(movieid))
        category_list.registerTempTable("data")
        user_list = self.sqlContext.sql("SELECT DISTINCT userid FROM data LIMIT 50")
        print(user_list.count())
        user_list.show()
        user_list.registerTempTable("users")
        category_list = self.sqlContext.sql("SELECT d.userid AS userid, d.movieid AS movieid, d.rating AS rating, d.time AS time FROM data d, users u WHERE d.userid = u.userid").repartition(1)
        #category_list = self.sqlContext.createDataFrame(category_list, category_schema)
        category_list = category_list.map(lambda line: str(line.userid) + "," + str(line.movieid) + "," + str(line.rating) + "," + str(line.time))
        category_list = category_list.repartition(1)
        category_list.saveAsTextFile("data.txt")
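
# A minimal driver sketch for the class above (assumed usage; the original
# example only shows the class definition):
if __name__ == "__main__":
    app = MainApp()
    app.init()
    app.loadData()
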
def read_existing_parquet():
    sc = get_configured_context()
    sql_context = SQLContext(sparkContext=sc)

    # Load the Parquet file stored in AWS S3 / MinIO into a DataFrame
    parquet_file = sql_context.read.parquet("s3a://testparquet/nation.parquet")
    parquet_file.registerTempTable("parquet_file")

    # Run standard SQL queries against temporary table
    nations_all_sql = sql_context.sql("SELECT * FROM parquet_file")
    nations_all = nations_all_sql.rdd.map(
        lambda p: "Country: {0:15} Ipsum Comment: {1}".format(
            p.N_NAME, p.N_COMMENT))

    for idx, nation in enumerate(nations_all.collect()):
        if idx == 0:
            print("All Nations and Comments -- `SELECT * FROM parquet_file`")
            print_horizontal()
        print(nation)
    else:
        print_horizontal()

    # Use standard SQL to filter
    nations_filtered_sql = sql_context.sql(
        "SELECT N_NAME FROM parquet_file WHERE N_NAME LIKE '%IND%'")
    nations_filtered = nations_filtered_sql.rdd.map(
        lambda p: "Country: {0:20}".format(p.N_NAME))

    for idx, nation in enumerate(nations_filtered.collect()):
        if idx == 0:
            print(
                "Nations Filtered -- `SELECT name FROM parquet_file WHERE name LIKE '%IND%'`"
            )
            print_horizontal()
        print(nation)
    else:
        print_horizontal()
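

# get_configured_context() and print_horizontal() are helpers that are not
# shown in this example; a minimal sketch of what they are assumed to look
# like (assuming `from pyspark import SparkConf, SparkContext` is available):
def get_configured_context():
    conf = SparkConf().setAppName("parquet-reader")
    return SparkContext(conf=conf)


def print_horizontal():
    print("-" * 60)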
# For each district and time of day, find the crime category with the maximum count
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext

input = sys.argv[1]
conf = SparkConf().setAppName('Crime Analysis')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# loading data from the parquet file
crime = sqlContext.read.parquet(input).cache()
crime.registerTempTable('Crime')

Crime_Month = sqlContext.sql('''select Month,count(Category)
                        from Crime   group by Month
                    ''')

Crime_Time = sqlContext.sql('''select substr(Time,1,2),count(Category)
                        from Crime group by  substr(Time,1,2)
                    ''')

# caching the output as it will be required
Crime_Time_Category = sqlContext.sql(
    '''select substr(Time,1,2) as Hour,PdDistrict,Category,count(Category) as cat_cnt
                        from Crime where Category<>'OTHER OFFENSES' group by  substr(Time,1,2),PdDistrict,Category
                    ''').cache()

Crime_Time_Category.registerTempTable('Crime_Time_Category')

Crime_Time_Category_Max = sqlContext.sql(
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext

input = sys.argv[1]
conf = SparkConf().setAppName('Crime Analysis')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

#loading crime data from parquet files

crime = sqlContext.read.parquet(input)
crime.registerTempTable('Crime')

# loading distinct districts and their distinct addresses from the crime data

Crime_District = sqlContext.sql('''select distinct PdDistrict,Address
                                 from Crime
                              ''')

Crime_District.registerTempTable('Crime_District')

# assigning the literal '1' to each distinct address and counting the crimes that occurred at that address and district

Crime_Add_Cnt = sqlContext.sql(
    '''select PdDistrict,Address,'1' as one ,count(Category) as Crime_count from Crime group by PdDistrict,Address order by PdDistrict'''
)
Crime_Add_Cnt.registerTempTable('Crime_Address_count')

Crime_Address_count = sqlContext.sql(
    '''select PdDistrict,sum(one) as x,sum(Crime_count) as y from Crime_Address_count where Crime_count<>'1' and PdDistrict in ('NORTHERN','SOUTHERN','CENTRAL') group by PdDistrict '''
)
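
# An illustrative (assumed) follow-up, where x is the number of distinct
# addresses and y the total crime count per district:
Crime_Address_count.selectExpr(
    "PdDistrict", "y / x as avg_crimes_per_address").show()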
Example No. 5
import os
from functools import partial

from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, FloatType, ArrayType

# getLocationsOfUser and getCentersOfUser are project helpers used below; they
# are not shown in this example.


class MainApp(object):
    def __init__(self):
        pass

    def init(self):
        os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
        # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
        # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
        conf = SparkConf()
        conf.setMaster("local")
        conf.setAppName("PySparkShell")
        conf.set("spark.executor.memory", "2g")
        # conf.set("spark.driver.memory", "1g")
        self.sc = SparkContext(conf=conf)
        self.sqlContext = SQLContext(self.sc)

    def loadData(self):
        self.df_review = self.sqlContext.read.json(
            "../yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json"
        ).cache()
        # self.df_review = self.sqlContext.read.json("s3n://ds-emr-spark/data/yelp_academic_dataset_review.json").cache()
        self.df_business = self.sqlContext.read.json(
            "../yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json"
        ).cache()
        # self.df_business = self.sqlContext.read.json("s3n://ds-emr-spark/data/yelp_academic_dataset_business.json").cache()
        self.df_review.registerTempTable("reviews")
        self.df_business.registerTempTable("business")

    def createCheckInDataPerUser(self):
        review_user = self.sqlContext.sql("SELECT business_id, user_id FROM reviews")
        business_loc = self.sqlContext.sql("SELECT business_id, latitude, longitude FROM business")
        review_user.registerTempTable("reviews_user")
        business_loc.registerTempTable("business_loc")

        self.df_join_reviewAndBusiness = self.sqlContext.sql(
            "SELECT r.user_id, b.latitude, b.longitude FROM reviews_user r JOIN business_loc b ON r.business_id = b.business_id"
        ).cache()
        self.df_join_reviewAndBusiness.registerTempTable("userBusiness")

        self.df_unique_users = self.sqlContext.sql(
            'SELECT DISTINCT user_id FROM userBusiness where user_id = "SIfJLNMv7vBwo-fSipxNgg"'
        )
        self.df_unique_users.registerTempTable("users")

        pd = self.df_join_reviewAndBusiness.toPandas()
        global_db = self.sc.broadcast(pd)

        schema = StructType([StructField("latitude", FloatType()), StructField("longitude", FloatType())])
        partialFunc = partial(getLocationsOfUser, business_db=global_db.value)

        self.get_locations = udf(partialFunc, ArrayType(schema))
        self.get_centers = udf(getCentersOfUser, ArrayType(schema))

        self.df_unique_users = self.df_unique_users.withColumn(
            "user_locations", self.get_locations(self.df_unique_users["user_id"])
        )
        self.df_unique_users.registerTempTable("users")

        self.df_unique_users.repartition(1).write.save("user.json", "json", "overwrite")

        print(getCentersOfUser(self.df_unique_users.toPandas().iloc[0]["user_locations"]))

        self.df_unique_users = self.df_unique_users.withColumn(
            "user_centers", self.get_centers(self.df_unique_users["user_locations"])
        )
        self.df_unique_users.registerTempTable("users")

        self.df_unique_users.repartition(1).write.save("center.json", "json", "overwrite")
        self.df_unique_users.show()

    def distanceCalc(self):
        self.df_unique_users = self.sqlContext.read.json(
            "user.json/part-r-00000-23a1b514-f5fe-4f61-9a64-01ebbc88c146"
        ).cache()
        print(len(getCentersOfUser(self.df_unique_users.toPandas().iloc[0]["user_locations"])))
Example No. 6
    # register udf to calculate the hash value of a column using the md5 algorithm (for use in SQL)
    sql_context.udf.register("get_hash_value", get_hash_value, StringType())

    # register udf to check whether a column value is null
    sql_context.udf.register("is_null", isNull, StringType())

    # register dataframe to perform sql DSL
    type_2_dataframe.registerTempTable("type2_data")

    # remove null value from column
    #null_value_query = "select _c3,is_null(_c3) as new_c3 from type2_data"
    #null_value_query_df = sql_context.sql(null_value_query)
    #null_value_query_df.show()

    # hash of columns _c1,_c2, a full-row hash, and parsed effective/expiration dates
    hash_record_query = """
        select
            get_hash_value(CONCAT(_c1,_c2)) as record_hashvalue,
            *,
            get_hash_value(CONCAT(_c0,_c1,_c2,_c3,_c4,_c8,_c9,_c10)) as record_hash_row,
            from_unixtime(unix_timestamp(_c5, 'MMM dd yyyy hh:mm:ss.SSSaaa')) as init_effective_date,
            from_unixtime(unix_timestamp(_c6, 'MMM dd yyyy hh:mm:ss.SSSaaa')) as init_expiration_date
        from type2_data"""
    hash_record_query_df = sql_context.sql(hash_record_query)
    hash_record_query_df.show()

    # populate effective_date based on init_expiration_date and init_effective_date
    effective_date_df = hash_record_query_df.withColumn(
        "effective_date",
        when(
            hash_record_query_df.init_expiration_date == '1991-01-01 00:00:00',
            hash_record_query_df.init_effective_date).otherwise(
                hash_record_query_df.init_expiration_date))
    effective_date_df.show()

    # register dataframe to perform sql DSL
    effective_date_df.registerTempTable("date_table")
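

# get_hash_value and isNull are project helpers registered above but not shown
# in this example; a minimal sketch of what they are assumed to do:
import hashlib


def get_hash_value(value):
    # md5 hash of the concatenated column values
    return hashlib.md5(value.encode("utf-8")).hexdigest() if value is not None else None


def isNull(value):
    # flag null / empty column values
    return "Y" if value is None or value == "" else "N"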
Example No. 7
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext, HiveContext
from pyspark.sql.functions import *

inputs = sys.argv[1]
conf = SparkConf().setAppName('Month Wise Top 3 Crime')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
hiveContext = HiveContext(sc)

crime = sqlContext.read.parquet(inputs)
crime.registerTempTable('Crime')

Crime_month = sqlContext.sql('''select  Month,Category,count(Category) as cnt
    from Crime group by Month,Category order by Month
    ''')
Crime_month = Crime_month.na.replace(
    ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'], [
        'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct',
        'Nov', 'Dec'
    ], 'Month')

Crime_month.registerTempTable('Crime_month')
# saving only the aggregated records to CSV so that Hive has to query far fewer records
Crime_month.coalesce(1).write.format('com.databricks.spark.csv').save(
    'MonthCategory')
# Using Hive and creating table from csv
hiveContext.sql("DROP TABLE IF EXISTS Crime_month")
hiveContext.sql(
    "CREATE TABLE Crime_month (Month STRING, Category STRING, counts int) row format delimited fields terminated by ',' stored as textfile"
Example No. 8
graphedges_rdd = textinput.map(lambda line: get_graphedges(line)).filter(
    lambda x: x is not None).flatMap(lambda x: x).coalesce(1)
graphedges = graphedges_rdd.toDF(['source', 'destination']).cache()
graphedges.registerTempTable('SourceDestTable')

initial_node = source_node
initial_row = KnownRow(initial_node, initial_node, 0)
knownpaths = sqlContext.createDataFrame([initial_row], schema=schema)
part_knownpaths = knownpaths

for i in range(6):
    part_knownpaths.registerTempTable('PartKnownPathTable')

    newpaths = sqlContext.sql("""
    SELECT destination AS node, t1.source AS source, (distance+1) AS distance FROM
    SourceDestTable t1
    JOIN
    PartKnownPathTable t2
    ON (t1.source = t2.node)
    """).show()

    newpaths.registerTempTable('NewPathTable')
    knownpaths.registerTempTable('KnowPathTable')
    duplicate_df = sqlContext.sql("""
    SELECT t1.node AS node, t1.source as source, t1.distance as distance FROM
    NewPathTable t1
    JOIN
    KnowPathTable t2
    ON (t1.node = t2.node)
    """).show()

    if duplicate_df.count() != 0:
# for front-end purposes: displaying week / month / year crime counts for 2016
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext


input = sys.argv[1]
conf = SparkConf().setAppName('Crime Analysis')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
crime=sqlContext.read.parquet(input)
crime.registerTempTable('Crime')

District_WeekCrime=sqlContext.sql('select Year,count(Category) w_cnt from Crime  where Year=2016 and Month=11 and Day>=3 group by Year order by w_cnt desc')
District_WeekCrime.registerTempTable('Week_2016')

#District_WeekCrime=sqlContext.sql('select Year,max(cnt) cnt from Week_2016 Group by Year ')
District_WeekCrime.show()

District_MonthCrime=sqlContext.sql('select Year,count(Category) m_cnt from Crime  where Year=2016 and Month=11 Group by Year order by m_cnt desc')
District_MonthCrime.registerTempTable('Month_2016')
District_MonthCrime.show()

District_YearCrime=sqlContext.sql('select Year,count(Category) y_cnt from Crime  where Year=2016  Group by Year order by y_cnt desc')
District_YearCrime.registerTempTable('Year_2016')
District_YearCrime.show()

District_Overall=sqlContext.sql('select w_cnt as week_crime_count,m_cnt as month_crime_count,y_cnt as Year_crime_count from Week_2016 W,Month_2016 M,Year_2016 Y where W.Year=M.Year and M.Year=Y.Year')
District_Overall.coalesce(1).write.mode('append').json("Crimes_Overall")

Example No. 10
    .option("mode", "DROPMALFORMED") \
    .load("/home/mandar/Downloads/YouTube-Spam-Collection-v1/Youtube05-Shakira.csv")
    
    you_tube_dataframe.printSchema()
    
    print "--------------------------Total number of record -------------------------"
    print you_tube_dataframe.count()
    
    you_tube_dataframe.select('CLASS').show(10)
    
    you_tube_dataframe.registerTempTable("you_tube_data")
    
       
    # Number of comments per user
    comment_per_user = "******"
    comment_per_user_df = sql_context.sql(comment_per_user)
    comment_per_user_df.show()
    
    # select rows for author 'Sabina Pearson-Smith', adding a derived date_value column
    select_author = "select *,CONCAT(YEAR(DATE), '-', MONTH(DATE),'-',DAY(DATE),DAYOFMONTH(DATE),'-',DAYOFYEAR(DATE))  as date_value from you_tube_data where AUTHOR = 'Sabina Pearson-Smith'"
    select_author_df = sql_context.sql(select_author)
    select_author_df.show()
    
    # count of comments per date (kept commented out in the original)
#     select_comment_per_date = "select count(*) as total_comment_count,CONCAT(YEAR(DATE), '-', MONTH(DATE),'-',DAY(DATE))  as comment_date from you_tube_data group by CONCAT(YEAR(DATE), '-', MONTH(DATE),'-',DAY(DATE))"
#     select_comment_per_date_df = sql_context.sql(select_comment_per_date)
#     select_comment_per_date_df.show()
    
    #     from_unixtime(Column ut, java.lang.String f)
    #2015-05-29T02:30:18.971000
    
def f(x):
    rel = {}
    rel['name'] = x[0]
    rel['age'] = x[1]
    return rel


peopleRDD = sc.textFile(
    "file:///usr/local/my_soft/spark-2.1.0/examples/src/main/resources/people.txt"
)

# define a schema string
schemaString = "name age"

# build the schema from the schema string
fields = list(
    map(lambda fieldName: StructField(fieldName, StringType(), nullable=True),
        schemaString.split(" ")))
schema = StructType(fields)
# as the above shows, the schema describes the structure: it contains the two fields name and age

rowRDD = peopleRDD.map(lambda line: line.split(',')).map(
    lambda attributes: Row(attributes[0], attributes[1]))

peopleDF = spark.createDataFrame(rowRDD, schema)

# the DataFrame must be registered as a temporary view before it can be queried below
peopleDF.createOrReplaceTempView("people")

results = spark.sql("SELECT * FROM people")
results.rdd.map(lambda attributes: "name: " + attributes[0] + "," + "age:" +
                attributes[1]).foreach(print)
Example No. 12
# a) Load each file in the dataset into separate DataFrames.
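# The loading step itself is cut off in this example; a minimal sketch of what
# it presumably looks like (file names and CSV options are assumptions):
businessDF = sqlContext.read.csv("businesses.csv", sep="\t", header=False, inferSchema=True)
reviewersDF = sqlContext.read.csv("reviewers.csv", sep="\t", header=False, inferSchema=True)
friendshipDF = sqlContext.read.csv("friendship_graph.csv", sep="\t", header=False, inferSchema=True)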

businessDF = businessDF.toDF("business_id", "name", "address", "city", "state",
                             "postal_code", "latitude", "longitude", "stars",
                             "review_count", "categories")
reviewersDF = reviewersDF.toDF("review_id", "user_id", "business_id",
                               "review_text", "review_date")
friendshipDF = friendshipDF.toDF("src_user_id", "dst_user_id")

businessDF.createOrReplaceTempView("businesses")
reviewersDF.createOrReplaceTempView("reviewers")
friendshipDF.createOrReplaceTempView("friendshipGraph")

# Just to show that task 5 a) works:
b = sqlContext.sql("select count(*) from businesses")
r = sqlContext.sql("select count(*) from reviewers")
f = sqlContext.sql("select count(*) from friendshipGraph")
print("done [x]")

# Task 6

print("-------- Task (6) --------")

# a) Inner join review table and business table on business_id
# b) The result is saved in a temporary variable, "innerJoin"
innerJoin = businessDF.join(reviewersDF, "business_id")
# innerJoin.show()

# c) Number of reviews for each user in the review table for top 20 users with the most number of reviews
reviews = reviewersDF.groupBy("user_id").count().sort(desc("count")).limit(20)
Example No. 13
graphedges_rdd = textinput.map(lambda line: get_graphedges(line)).filter(lambda x: x is not None).flatMap(lambda x: x).coalesce(1)
graphedges = graphedges_rdd.toDF(['source', 'destination']).cache()
graphedges.registerTempTable('SourceDestTable')

initial_node = source_node
initial_row = KnownRow(initial_node, initial_node, 0)
knownpaths = sqlContext.createDataFrame([initial_row], schema=schema)
part_knownpaths = knownpaths

for i in range(6):
    part_knownpaths.registerTempTable('PartKnownPathTable')

    newpaths = sqlContext.sql("""
    SELECT destination AS node, t1.source AS source, (distance+1) AS distance FROM
    SourceDestTable t1
    JOIN
    PartKnownPathTable t2
    ON (t1.source = t2.node)
    """)

    newpaths.registerTempTable('NewPathTable')
    knownpaths.registerTempTable('KnowPathTable')
    duplicate_df = sqlContext.sql("""
    SELECT t1.node AS node, t1.source as source, t1.distance as distance FROM
    NewPathTable t1
    JOIN
    KnowPathTable t2
    ON (t1.node = t2.node)
    """)

    if duplicate_df.count() != 0:
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext, HiveContext
from pyspark.sql.functions import *

inputs = sys.argv[1]
conf = SparkConf().setAppName('Time-wise Top 3 Crimes')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
hiveContext = HiveContext(sc)

crime = sqlContext.read.parquet(inputs)
crime.registerTempTable('Crime')

Crime_Time = sqlContext.sql(
    '''select  SUBSTR(Time,1,2) as hour,Category,count(Category) as cnt
    from Crime group by SUBSTR(Time,1,2),Category order by SUBSTR(Time,1,2)
    ''')

Crime_Time.registerTempTable('Crime_Time')
# saving only the aggregated records to CSV so that Hive has to query far fewer records
Crime_Time.coalesce(1).write.format('com.databricks.spark.csv').save(
    'TimeCategory')

# Using Hive and creating table from csv
hiveContext.sql("DROP TABLE IF EXISTS TimeCategory")
hiveContext.sql(
    "CREATE TABLE TimeCategory (Hour STRING, Category STRING, counts int) row format delimited fields terminated by ',' stored as textfile"
)
# loading the CSV contents into the Hive table
hiveContext.sql(
    "LOAD DATA INPATH '/user/chandras/TimeCategory' INTO TABLE TimeCategory")
Example No. 15
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext

input = sys.argv[1]
conf = SparkConf().setAppName('Crime Analysis')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# loading Crime data from parquet
time = sqlContext.read.parquet(input).cache()
time.registerTempTable('Time_table')

# crime counts per day of week and hour
Time_table = sqlContext.sql(
    '''select DayOfWeek,SUBSTR(Time,1,2) as hour, count(Category) as cnt from Time_table group by DayOfWeek,SUBSTR(Time,1,2)'''
).cache()

Time_table.registerTempTable('Time_cnt')

Time_Max = sqlContext.sql(
    '''select DayOfWeek,max(cnt) as cnt from Time_cnt group by DayOfWeek ''')
Time_Max.registerTempTable('Time_max')

# Day wise - hour wise - Max crime calculation
Time_week_Max = sqlContext.sql(
    '''select TM.DayOfWeek,TC.hour,TM.cnt from Time_cnt TC,Time_max TM where TC.DayOfWeek=TM.DayOfWeek
and TC.cnt=TM.cnt ''')

# coalescing the final output is safe here: it is a small aggregated result, so it barely affects parallelism
Time_week_Max.coalesce(1).write.format('com.databricks.spark.csv').option(
Example No. 16
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging

from pyspark import SparkContext, SparkConf
from pyspark.sql.context import SQLContext
from pyspark.sql.types import Row

__author__ = 'guotengfei'
__time__ = '2019/11/26'
"""
Module comment
"""

LOGGER = logging.getLogger(__name__)

if __name__ == '__main__':
    conf = SparkConf().setAppName('vcom')
    sc = SparkContext(conf=conf)
    sql = SQLContext(sc)

    lines = sc.textFile('../data/users.txt')
    user = lines.map(lambda l: l.split(",")).map(lambda p: (p[0], p[1]))

    sql.createDataFrame(user, ['id', 'name']).registerTempTable('user')
    df = sql.sql("select id,name from user")

    df.write.save("../data/result", format='json')
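
    # Optional sanity check of the saved output (assumed follow-up, not part
    # of the original example):
    sql.read.json("../data/result").show()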
# sub-crimes associated with a particular crime (LARCENY-THEFT)
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext

input = sys.argv[1]
conf = SparkConf().setAppName('Crime Analysis')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

crime = sqlContext.read.parquet(input).cache()
crime.registerTempTable('Crime')

Crime = sqlContext.sql('''select IncidntNum,count(IncidntNum) as inc_num_cnt
                        from Crime group by IncidntNum having count(IncidntNum)>1 
                    ''').cache()
Crime.registerTempTable('Crime_Frequency')

Crime_Frequency = sqlContext.sql(
    '''select distinct CF.IncidntNum from Crime_Frequency CF,Crime C
                                  Where CF.IncidntNum=C.IncidntNum and C.Category='LARCENY-THEFT'  '''
)
Crime_Frequency.registerTempTable('Crime_Vehicle_Theft')

# Extracting crimes that are associated with LARCENY-THEFT
Crime_Vehicle_Theft = sqlContext.sql(
    '''select  CV.IncidntNum,C.Category,count(Category) as cnt from Crime_Vehicle_Theft CV,Crime C
                                      where CV.IncidntNum=C.IncidntNum and C.Category not in ('LARCENY-THEFT','OTHER OFFENSES')
                                      group by CV.IncidntNum,C.Category having count(Category)>1'''
)
Crime_Vehicle_Theft.registerTempTable('Crime_VT_Frequency')
Example No. 18
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext, HiveContext
from pyspark.sql.functions import *

inputs=sys.argv[1]
conf = SparkConf().setAppName('District wise Analysis')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
hiveContext = HiveContext(sc)



crime=sqlContext.read.parquet(inputs)
crime.registerTempTable('Crime')

Crime_District=sqlContext.sql('''select  PdDistrict,Category,count(Category) as cnt
    from Crime group by PdDistrict,Category order by PdDistrict
    ''')

Crime_District.registerTempTable('Crime_District')

Crime_count=sqlContext.sql('''select * from Crime_District where cnt<>1''')
# saving only the aggregated records to CSV so that Hive has to query far fewer records
Crime_count.coalesce(1).write.format('com.databricks.spark.csv').save('District_top')

# Using Hive and creating table from csv
hiveContext.sql("DROP TABLE IF EXISTS district")
hiveContext.sql("CREATE TABLE district (PdDistrict STRING, Category STRING, counts int) row format delimited fields terminated by ',' stored as textfile")
# loading the CSV contents into the Hive table
hiveContext.sql("LOAD DATA INPATH '/user/chandras/District_top' INTO TABLE district")

District=hiveContext.sql("SELECT PdDistrict,Category,counts FROM (SELECT PdDistrict,Category,counts,dense_rank() OVER (PARTITION BY PdDistrict ORDER BY counts DESC) as rank FROM district) tmp WHERE rank <= 3");
# Blue-collar vs. white-collar crime
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext

input = sys.argv[1]
conf = SparkConf().setAppName('Crime Analysis')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

#loading crime data from parquet files
crime = sqlContext.read.parquet(input).cache()
crime.registerTempTable('Crime')

Crime_Total = sqlContext.sql(
    '''select Year,PdDistrict,count(Category) as cnt from Crime group by Year,PdDistrict'''
)
Crime_Total.registerTempTable('Crime_Total')

Blue_Crime = sqlContext.sql(
    '''select Year,PdDistrict,count(Category) as blue_count from Crime 
where Category in ('VANDALISM','VEHICLE THEFT','LARCENY-THEFT','ROBBERY','BURGLARY',
'STOLEN PROPERTY','DISORDERLY CONDUCT','ASSAULT','KIDNAPPING','RECOVERED VEHICLE') group by Year,PdDistrict order by Year,blue_count desc'''
).cache()

Blue_Crime.registerTempTable('Blue_Crime')

White_Crime = sqlContext.sql(
    '''select Year,PdDistrict ,count(Category) as white_count from Crime 
where Category in ('FRAUD','FORGERY-COUNTERFEITING','GAMBLING','EMBEZZLEMENT','SECONDARY CODES',
'EXTORTION','BRIBERY','BAD CHECKS','KIDNAPPING','RECOVERED VEHICLE') group by Year,PdDistrict order by Year,white_count desc'''
)
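
# The example is truncated here; presumably White_Crime is registered and the
# blue- and white-collar counts compared per year and district, e.g. with a
# join like this (assumed continuation, not part of the original):
White_Crime.registerTempTable('White_Crime')
Crime_Compare = sqlContext.sql(
    '''select B.Year, B.PdDistrict, B.blue_count, W.white_count
       from Blue_Crime B join White_Crime W
       on B.Year = W.Year and B.PdDistrict = W.PdDistrict''')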
Example No. 20
# (imports as in Example No. 5 above)
class MainApp(object):
    def __init__(self):
        pass

    def init(self):
        os.environ[
            "SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
        # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
        # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
        conf = SparkConf()
        conf.setMaster("local")
        conf.setAppName("PySparkShell")
        conf.set("spark.executor.memory", "2g")
        # conf.set("spark.driver.memory", "1g")
        self.sc = SparkContext(conf=conf)
        self.sqlContext = SQLContext(self.sc)

    def loadData(self):
        self.df_review = self.sqlContext.read.json(
            "../yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json"
        ).cache()
        # self.df_review = self.sqlContext.read.json("s3n://ds-emr-spark/data/yelp_academic_dataset_review.json").cache()
        self.df_business = self.sqlContext.read.json(
            "../yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json"
        ).cache()
        # self.df_business = self.sqlContext.read.json("s3n://ds-emr-spark/data/yelp_academic_dataset_business.json").cache()
        self.df_review.registerTempTable("reviews")
        self.df_business.registerTempTable("business")

    def createCheckInDataPerUser(self):
        review_user = self.sqlContext.sql(
            "SELECT business_id, user_id FROM reviews")
        business_loc = self.sqlContext.sql(
            "SELECT business_id, latitude, longitude FROM business")
        review_user.registerTempTable("reviews_user")
        business_loc.registerTempTable("business_loc")

        self.df_join_reviewAndBusiness = self.sqlContext.sql(
            "SELECT r.user_id, b.latitude, b.longitude FROM reviews_user r JOIN business_loc b ON r.business_id = b.business_id"
        ).cache()
        self.df_join_reviewAndBusiness.registerTempTable("userBusiness")

        self.df_unique_users = self.sqlContext.sql(
            "SELECT DISTINCT user_id FROM userBusiness where user_id = \"SIfJLNMv7vBwo-fSipxNgg\""
        )
        self.df_unique_users.registerTempTable("users")

        pd = self.df_join_reviewAndBusiness.toPandas()
        global_db = self.sc.broadcast(pd)

        schema = StructType([
            StructField("latitude", FloatType()),
            StructField("longitude", FloatType())
        ])
        partialFunc = partial(getLocationsOfUser, business_db=global_db.value)

        self.get_locations = udf(partialFunc, ArrayType(schema))
        self.get_centers = udf(getCentersOfUser, ArrayType(schema))

        self.df_unique_users = self.df_unique_users.withColumn(
            "user_locations",
            self.get_locations(self.df_unique_users["user_id"]))
        self.df_unique_users.registerTempTable("users")

        self.df_unique_users.repartition(1).write.save("user.json", "json",
                                                       "overwrite")

        print(
            getCentersOfUser(
                self.df_unique_users.toPandas().iloc[0]["user_locations"]))

        self.df_unique_users = self.df_unique_users.withColumn(
            "user_centers",
            self.get_centers(self.df_unique_users["user_locations"]))
        self.df_unique_users.registerTempTable("users")

        self.df_unique_users.repartition(1).write.save("center.json", "json",
                                                       "overwrite")
        self.df_unique_users.show()

    def distanceCalc(self):
        self.df_unique_users = self.sqlContext.read.json(
            "user.json/part-r-00000-23a1b514-f5fe-4f61-9a64-01ebbc88c146"
        ).cache()
        print(
            len(
                getCentersOfUser(self.df_unique_users.toPandas().iloc[0]
                                 ["user_locations"])))
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql.readwriter import DataFrameReader, DataFrameWriter
from pyspark.sql.types import Row
from pyspark.sql.window import Window, WindowSpec

conf = SparkConf().setMaster("local").setAppName("MyApp")
sc = SparkContext(conf=conf)
spark = SQLContext(sc)


def stopSpark():
    sc.stop()


def f(x):
    rel = {}
    rel['name'] = x[0]
    rel['age'] = x[1]
    return rel

people = sc.textFile("file:///usr/local/my_soft/spark-2.1.0/examples/src/main/resources/people.txt")\
    .map(lambda line : line.split(','))\
    .map(lambda p: Row(name=p[0], age=int(p[1])))
peopleDF = spark.createDataFrame(people)

# peopleDF = sc.textFile("file:///usr/local/my_soft/spark-2.1.0/examples/src/main/resources/people.txt")\
#     .map(lambda line : line.split(','))\
#     .map(lambda x: Row(**f(x))).toDF()

peopleDF.createOrReplaceTempView("people")  # must be registered as a temporary view before the query below can use it
personsDF = spark.sql("select name,age from people where age>20")
personsDF.rdd.map(lambda t: "Name:" + t[0] + "," + "Age:" + str(t[1])).foreach(
    print)
Example No. 22
import os

import requests
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext

# mapUsers is a project helper used below; it is not shown in this example.


class MainApp(object):
    def __init__(self):
        pass

    def init(self):
        os.environ[
            "SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
        # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
        # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
        conf = SparkConf()
        conf.setMaster("local")
        conf.setAppName("My application")
        conf.set("spark.executor.memory", "2g")
        self.sc = SparkContext(conf=conf)
        self.sqlContext = SQLContext(self.sc)
        self.df_user = self.sqlContext.read.json("dataset/user.json").cache()
        self.df_review = self.sqlContext.read.json(
            "dataset/review.json").cache()
        self.df_business = self.sqlContext.read.json(
            "dataset/business.json").cache()
        self.df_user.registerTempTable("user")

    def getS3File(self, s3FilePath, destinationPathOnLocal):
        r = requests.get(s3FilePath)
        fileOb = open(destinationPathOnLocal, 'w')
        fileOb.write(r.text)
        fileOb.close()

    def writeToS3File(self, s3FilePath, sourcePathOnLocal):
        fileOb = open(sourcePathOnLocal, 'r')
        payload = fileOb.read()
        fileOb.close()

        headers = {"x-amz-acl": "public-read-write"}
        return requests.put(s3FilePath, headers=headers, data=payload)

    def reads3spark(self, path):
        # path = "s3n://b-datasets/flight_data/*"
        x = self.sc.textFile(path)  # we can just specify all the files.
        return x

    def writes3spark(self, x, path):
        x.saveAsTextFile(path)

    def createFeatures(self):
        userData = self.sqlContext.sql(
            "SELECT user_id, name, review_count, votes, fans, yelping_since, elite FROM user"
        )
        userData = userData.map(mapUsers).coalesce(1)
        res = self.sqlContext.createDataFrame(userData)

        review_user = self.df_review.select(self.df_review.business_id,
                                            self.df_review.user_id)
        business_loc = self.df_business.select(self.df_business.business_id,
                                               self.df_business.city,
                                               self.df_business.state)
        df_join_reviewAndBusiness = review_user.join(
            business_loc,
            review_user.business_id == business_loc.business_id).select(
                "user_id", "city", "state")
        df_grouped = df_join_reviewAndBusiness.groupBy(
            ["user_id", "city", "state"]).count()
        # A Spark DataFrame cannot be iterated like a pandas groupby object, so
        # collect the (already aggregated) counts to the driver and increment
        # the 'k' feature for every user with more than 10 reviews in a single
        # city. This assumes mapUsers produces 'user_id' and a numeric 'k'
        # column, as the original indexing implies.
        df_panda = res.toPandas().set_index("user_id")
        for row in df_grouped.collect():
            if row["count"] > 10:
                df_panda.loc[row["user_id"], "k"] += 1

        res = self.sqlContext.createDataFrame(df_panda.reset_index())
        res.toJSON().saveAsTextFile('user_features.json')
Example No. 23
    sqlContext.sql(
        "select name, eyeColor from swimmers where eyeColor like 'b%'").show()

    # real-world data
    flightPerfFilePath = "C:\\Users\\sangjiaqi\\Desktop\\软件\\Scala\\learningPySpark-master\\Chapter03\\flight-data\\departuredelays.csv"
    airportsFilePath = "C:\\Users\\sangjiaqi\\Desktop\\软件\\Scala\\learningPySpark-master\\Chapter03\\flight-data\\airport-codes-na.txt"

    airports = sqlContext.read.csv(airportsFilePath,
                                   header='true',
                                   inferSchema='true',
                                   sep='\t')
    airports.createOrReplaceTempView("airports")

    flightPerf = sqlContext.read.csv(flightPerfFilePath, header='true')
    flightPerf.createOrReplaceTempView("FlightPerformance")

    flightPerf.cache()

    sqlContext.sql("""
    select a.City,
    f.origin,
    sum(f.delay) as Delays
    from FlightPerformace f
    join airports a
    on a.IATA = f.orifin
    where a.State = 'WA'
    group by a.City, f.origin
    order by sum(f.delay) desc
    """).show
Example No. 24
# -*- coding: utf-8 -*-
'''
Created on 2018-01-05

@author: root
'''
from com.bjsxt.python.test import SparkUtil
from pyspark.sql.context import SQLContext
conf = SparkUtil.initSparkConf(True, "DataFrameOpsFromFile")
sc = SparkUtil.initSparkContext(conf)
sqlContext = SQLContext(sc)
df = sqlContext.read.json("../data/people.json")
df.registerTempTable("people")
sqlContext.sql("select * from people where age > 20").show()
sc.stop()
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext

input = sys.argv[1]
conf = SparkConf().setAppName('Crime Analysis')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

#loading crime data from parquet files
crime = sqlContext.read.parquet(input).cache()
crime.registerTempTable('Crime')

# counting resolutions per district, excluding crimes whose resolution is pending / in progress / none
Crime_Resolution = sqlContext.sql(
    '''select PdDistrict,count(Resolution) as cnt from Crime Where Resolution<>'NONE'  Group by PdDistrict order by PdDistrict'''
)
Crime_Resolution.registerTempTable('Crime_Resolution')

# ignoring bad data
Crime_Res = sqlContext.sql(
    '''select * from Crime_Resolution where cnt<>1 ''').cache()
Crime_Res.registerTempTable('Crime_Res')

# counting resolutions per district, including crimes whose resolution is pending / in progress / none
Crime_Resolution_ALL = sqlContext.sql(
    '''select PdDistrict,count(Resolution) as cnt from Crime Group by PdDistrict order by PdDistrict'''
)
Crime_Resolution_ALL.registerTempTable('Crime_Resolution_ALL')

Crime_Res_ALL = sqlContext.sql(