def get_hierarchy_landownership(company_id: str):
    """
    Build a DataFrame with company relations and landownership details.

    Parameters:
        company_id: name of the company whose hierarchy and landownership is to be returned.

    Returns:
        DataFrame of ownership and company relation results.
    """
    # read company relations; path is hardcoded for now
    df_cr = extract_company_relations(
        "dbfs:/FileStore/tables/company_relations.txt")
    # read landownership; path is hardcoded for now
    df_lo = extract_land_ownership("dbfs:/FileStore/tables/land_ownership.txt")

    # create a graphframe for company relations and landownership
    gf = create_graph_company_relations_land(df_cr, df_lo)

    # use the shortest-path algorithm to find distances between all nodes and
    # the one provided
    results = gf.shortestPaths(landmarks=[company_id])

    # create a hierarchy level based on distances in the MapType field
    # returned by the shortest-path run
    results = results.withColumn("hier_level", map_values("distances")[0])

    # keep only the results where the hierarchy level is 0 or higher,
    # i.e. drop all the NULL results
    results = results.where("hier_level >= 0")

    return results
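# Hypothetical usage sketch: the company id value is illustrative, and it
# assumes the extract_* / create_graph_* helpers above are defined and the
# DBFS paths exist. GraphFrames' shortestPaths() returns the vertex DataFrame,
# so `id` is available alongside the derived `hier_level` column.
hierarchy_df = get_hierarchy_landownership("Acme Holdings Ltd")
hierarchy_df.select("id", "hier_level").show(truncate=False)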
def expand(df, c, n, sep='_'):
    """
    Expand the array or map column `c` of `df` into separate columns,
    one per index/key in `n` (an int upper bound or an explicit iterable).
    """
    t = df.schema[c].dataType
    if isinstance(t, T.ArrayType):
        ce = lambda i: F.col(c)[i]
    elif isinstance(t, T.MapType):
        ce = lambda i: F.map_values(c)[i]
    else:
        ce = lambda i: F.col(c)
    keys = n if isinstance(n, (list, tuple, range)) else range(n)
    sel = lambda c: [ce(i).alias(f'{c}{sep}{str(i)}') for i in keys]
    cols = [sel(c) if x == c else [x] for x in df.columns]
    cols = [item for sublist in cols for item in sublist]
    return df.select(*cols)
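# A minimal usage sketch for expand(): it assumes `from pyspark.sql import
# functions as F, types as T` and an active SparkSession named `spark`
# (the column names and sample data are made up).
df = spark.createDataFrame([("a", [1, 2, 3]), ("b", [4, 5, 6])],
                           ["id", "scores"])
expand(df, "scores", 3).show()
# expected columns: id, scores_0, scores_1, scores_2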
def transform(self):
    """
    Transform the data with the given `self.schema` into a format suitable
    for the SQL queries.
    :return: PySpark DataFrame
    """
    self.df = self.df\
        .withColumn('items', F.explode('items')) \
        .withColumn('ID', F.map_keys("items")[0]) \
        .withColumn('qp', F.map_values('items')[0]) \
        .withColumn('quantity', F.col('qp').getItem('quantity').astype('int')) \
        .withColumn('price', F.col('qp').getItem('price')) \
        .select(['user', 'timestamp', 'ID', 'quantity', 'price'])
    return self.df
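# A sketch of the input shape transform() appears to assume: `items` is an
# array of single-entry maps keyed by item ID, whose value is a struct with
# `quantity` and `price`. The sample values and an active SparkSession named
# `spark` are assumptions, not part of the original.
sample_df = spark.createDataFrame(
    [("u1", "2021-01-01T00:00:00",
      [{"item-1": {"quantity": "2", "price": 9.99}}])],
    "user string, timestamp string, "
    "items array<map<string, struct<quantity: string, price: double>>>",
)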
def parquet_revalue(vcf, indel_com):
    # Full outer join of the indel calls with the VCF on chromosome and position.
    temp = indel_com.join(vcf, ["#CHROM", "POS"], "full")
    sample_name = temp.columns[-1]
    sample_w = Window.partitionBy(F.col("#CHROM")).orderBy(
        F.col("POS")).rangeBetween(Window.unboundedPreceding, Window.currentRow)
    # Forward-fill the sample column within each chromosome, ordered by position.
    temp = temp.withColumn(
        sample_name,
        F.last(sample_name, ignorenulls=True).over(sample_w)).withColumnRenamed(
            "#CHROM", "CHROM")
    # Rows whose map keys do not match FORMAT are reordered with the
    # `index2dict` Scala UDF (registered elsewhere).
    null_not_value = temp.filter(F.map_keys(F.col(sample_name)) != F.col("FORMAT"))\
        .selectExpr("CHROM", "POS",
                    "index2dict({}, FORMAT) as {}".format(sample_name, sample_name))\
        .withColumn(sample_name,
                    F.concat(F.lit("./.:"), F.array_join(F.col(sample_name), ":")))
    # Rows whose map keys already match FORMAT only need their values joined.
    null_value = temp.filter(F.map_keys(F.col(sample_name)) == F.col("FORMAT")).drop("FORMAT")\
        .withColumn(sample_name,
                    F.concat(F.lit("./.:"),
                             F.array_join(F.map_values(F.col(sample_name)), ":")))
    value_union = null_not_value.union(null_value).withColumnRenamed(
        "CHROM", "#CHROM")
    return value_union
def get_spark_commits(date_str):
    from datetime import datetime, timedelta

    # 2.1: Change the github_api_url so that it queries with the input date
    # Convert the date string into a date
    fromDate = datetime.strptime(date_str, '%Y%m%d').date()
    toDate = fromDate + timedelta(days=1)

    # Construct the GitHub URL to fetch the JSON object(s)
    request = 'https://api.github.com/repos/apache/spark/commits?since=' + str(
        fromDate) + 'T00:00:00Z&until=' + str(toDate) + 'T00:00:00Z'
    print('Beginning file download from: ' + request)

    import urllib.request, urllib.error
    try:
        # Get the JSON object(s) from the GitHub URL
        response = urllib.request.urlopen(request)
    except urllib.error.HTTPError as e:
        # Return code error (e.g. 404, 501, ...)
        print('HTTPError: {}'.format(e.code))
    except urllib.error.URLError as e:
        # Not an HTTP-specific error (e.g. connection refused)
        print('URLError: {}'.format(e.reason))
    else:
        # 200
        sourceJSON = response.read()

        import json
        import pandas as pd
        # response.read() returns a bytes object, which is just a sequence of bytes.
        # Decode it first so json.loads receives a string.
        jsonData = json.loads(sourceJSON.decode('utf-8'))

        from pyspark import SparkContext
        from pyspark.sql import SparkSession
        # Create the Spark context directly by passing the config parameters
        sc = SparkContext("local[*]", "PySpark Electronic Arts Test")
        spark = SparkSession(sc)

        # Create a Spark DataFrame from a pandas DataFrame using Arrow.
        # A pandas DataFrame is not distributed; it lives on the driver node only.
        # To achieve parallelism the data needs to be distributed across the
        # cluster, which the Spark DataFrame does.
        source_df = spark.createDataFrame(pd.DataFrame(jsonData))
        source_df.printSchema()
        source_df.show()

        from pyspark.sql import functions as F
        from pyspark.sql.types import DateType, IntegerType
        # Create a new DataFrame by selecting only a few key/value pairs
        # from the original JSON object(s)
        jsonDF = source_df.select(source_df.sha.alias('sha'),
                                  source_df.author.login.alias('login_name'),
                                  source_df.committer.id.cast(IntegerType()).alias('commiter_id'),
                                  F.concat_ws(' ', F.map_values(source_df.commit.message)).alias('message'),
                                  source_df.commit.author.date.cast(DateType()).alias('commit_date'),
                                  source_df.commit.author.email.alias('email'),
                                  F.substring_index(source_df.commit.author.email, '@', -1).alias('email_company'),
                                  source_df.url.alias('url'))

        # Cache this DataFrame in memory as it will be used multiple times below
        jsonDF.cache()
        jsonDF.printSchema()
        jsonDF.show()

        # Set parameters for the PostgreSQL database connection
        url_connect = "jdbc:postgresql://pa1postgreserver.postgres.database.azure.com:5432/postgres?"
commitTable = "F_SPARK_COMMITS" authorTable = "F_SPARK_AUTHORS" mode = "append" db_properties = { "user": "******", "password": "******", "driver": "org.postgresql.Driver" } # Read the Authors Table from PostgreSQL DB into a Spark DataFrame Object readAuthorTableDF = spark.read.jdbc(url=url_connect, table=authorTable, properties=db_properties) # Check if the Authors table is empty or not # If the table in the db is empty then insert the authors dataframe directly # If the table is not empty join the 2 author tables and filter the existing authors in db_properties # Insert only the new author records into DB table if len(readAuthorTableDF.head(1)) > 0: authDF = jsonDF.join(readAuthorTableDF, jsonDF.login_name == readAuthorTableDF.login_name, how='left') \ .filter(readAuthorTableDF.login_name.isNull()) \ .select(jsonDF.login_name \ , jsonDF.commiter_id \ , jsonDF.email \ , jsonDF.email_company) else: authDF = jsonDF.select(jsonDF.login_name \ , jsonDF.commiter_id \ , jsonDF.email \ , jsonDF.email_company) authDF.write.jdbc(url=url_connect, table=authorTable, mode="append", properties=db_properties) authDF.show() # Read the Authors table after insearting the new authors readAuthorTableDF = spark.read.jdbc(url=url_connect, table=authorTable, properties=db_properties) # Read the Commits Table from PostgreSQL DB into a Spark DataFrame Object before Update readCommitTableDF = spark.read.jdbc(url=url_connect, table=commitTable, properties=db_properties) # Create DataFrame by joining the DataFrame which is createded from the source JSON with the authors table contents # Do a InnerJoin with authors dbtable data frame to fetch only the records that have a commit_id in authors table commitDF = jsonDF.join(readAuthorTableDF, jsonDF.commiter_id == readAuthorTableDF.commiter_id, how='inner') from pyspark.sql import Row # Check if the Commits table is empty or not # If the table in the db is empty then insert the commits dataframe directly # If the table is not empty then check the last executed date in the commits db table # Now filter all the records with the current date as last executed datetime # The above step will make sure the process is idempotent. # Insert only the new author records into Commits DB table if len(readCommitTableDF.head(1)) > 0: maxDate = readCommitTableDF.orderBy( readCommitTableDF.creation_date.desc()).head(1)[0].creation_date commitDF = commitDF.filter(F.current_timestamp().cast(DateType()) != maxDate).select(jsonDF.sha \ , jsonDF.url \ , jsonDF.message \ , jsonDF.commit_date \ , readAuthorTableDF.author_id \ , readAuthorTableDF.creation_date) else: commitDF = commitDF.select(jsonDF.sha\ , jsonDF.url\ , jsonDF.message , jsonDF.commit_date\ , readAuthorTableDF.author_id\ , readAuthorTableDF.creation_date) commitDF.show() commitDF.write.jdbc(url=url_connect, table=commitTable, mode="append", properties=db_properties)
def task_2(data_io, product_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    salesRank_column = 'salesRank'
    categories_column = 'categories'
    asin_column = 'asin'
    # Outputs:
    category_column = 'category'
    bestSalesCategory_column = 'bestSalesCategory'
    bestSalesRank_column = 'bestSalesRank'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    ### category: first element of the nested categories array, empty strings mapped to NULL
    data = product_data.withColumn(
        category_column,
        F.when(F.col(categories_column)[0][0] == '',
               None).otherwise(F.col(categories_column)[0][0]))
    category_nulls = data.filter(F.col(category_column).isNull()).count()
    category_distinct = data.agg(F.countDistinct(
        F.col(category_column))).head()[0]

    ### salesRank and salesCategory: first key/value pair of the salesRank map
    key_and_values = data.select(
        asin_column, category_column,
        F.map_keys(salesRank_column)[0].alias(bestSalesCategory_column),
        F.map_values(salesRank_column)[0].alias(bestSalesRank_column))
    mean_of_salesRank = key_and_values.select(
        F.avg(F.col(bestSalesRank_column))).head()[0]
    variance_of_salesRank = key_and_values.select(
        F.variance(F.col(bestSalesRank_column))).head()[0]
    salesCategory_nulls = key_and_values.filter(
        F.col(bestSalesCategory_column).isNull()).count()
    salesCategory_distinct = key_and_values.agg(
        F.countDistinct(F.col(bestSalesCategory_column))).head()[0]
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'mean_bestSalesRank': None,
        'variance_bestSalesRank': None,
        'numNulls_category': None,
        'countDistinct_category': None,
        'numNulls_bestSalesCategory': None,
        'countDistinct_bestSalesCategory': None
    }
    # Modify res:
    res['count_total'] = data.count()
    res['mean_bestSalesRank'] = mean_of_salesRank
    res['variance_bestSalesRank'] = variance_of_salesRank
    res['numNulls_category'] = category_nulls
    res['countDistinct_category'] = category_distinct
    res['numNulls_bestSalesCategory'] = salesCategory_nulls
    res['countDistinct_bestSalesCategory'] = salesCategory_distinct
    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_2')
    return res
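# A sketch of the `product_data` shape the task_2 implementations in this
# section appear to assume (inferred from the column accesses; the sample rows
# and an active SparkSession named `spark` are assumptions, not part of the
# original):
sample_product_data = spark.createDataFrame(
    [("B000001", {"Books": 12345}, [["Books", "Fiction"]]),
     ("B000002", None, [[""]])],
    "asin string, salesRank map<string, int>, categories array<array<string>>",
)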
def task_2(data_io, product_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    salesRank_column = 'salesRank'
    categories_column = 'categories'
    asin_column = 'asin'
    # Outputs:
    category_column = 'category'
    bestSalesCategory_column = 'bestSalesCategory'
    bestSalesRank_column = 'bestSalesRank'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    res_df = product_data.select(
        product_data.categories[0][0].alias(category_column),
        F.map_keys(product_data.salesRank)[0].alias(bestSalesCategory_column),
        F.map_values(
            product_data.salesRank)[0].alias(bestSalesRank_column)).replace(
                {'': None},
                subset=[
                    category_column, bestSalesCategory_column,
                    bestSalesRank_column
                ])

    stats = res_df.agg(
        F.count("*").alias('count_total'),
        F.avg(bestSalesRank_column).alias('mean_bestSalesRank'),
        F.variance(bestSalesRank_column).alias('variance_bestSalesRank'),
        F.sum(
            F.isnull(category_column).cast('int')).alias('numNulls_category'),
        F.countDistinct(res_df.category).alias('countDistinct_category'),
        F.sum(F.isnull(bestSalesCategory_column).cast('int')).alias(
            'numNulls_bestSalesCategory'),
        F.countDistinct(res_df.bestSalesCategory).alias(
            'countDistinct_bestSalesCategory')).head()
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'mean_bestSalesRank': None,
        'variance_bestSalesRank': None,
        'numNulls_category': None,
        'countDistinct_category': None,
        'numNulls_bestSalesCategory': None,
        'countDistinct_bestSalesCategory': None
    }
    # Modify res:
    res['count_total'] = int(stats[0])
    res['mean_bestSalesRank'] = float(stats[1])
    res['variance_bestSalesRank'] = float(stats[2])
    res['numNulls_category'] = int(stats[3])
    res['countDistinct_category'] = int(stats[4])
    res['numNulls_bestSalesCategory'] = int(stats[5])
    res['countDistinct_bestSalesCategory'] = int(stats[6])
    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_2')
    return res
# Convert the MapType `properties` column into separate columns via the RDD API
df3 = df.rdd.map(lambda x:
                 (x.name, x.properties["hair"], x.properties["eye"])) \
    .toDF(["name", "hair", "eye"])
df3.printSchema()
df3.show()

# Same result with getItem()
df.withColumn("hair", df.properties.getItem("hair")) \
  .withColumn("eye", df.properties.getItem("eye")) \
  .drop("properties") \
  .show()

# Same result with bracket notation
df.withColumn("hair", df.properties["hair"]) \
  .withColumn("eye", df.properties["eye"]) \
  .drop("properties") \
  .show()

# Explode the map into one key/value row per entry
from pyspark.sql.functions import explode
df.select(df.name, explode(df.properties)).show()

# Extract only the map keys / only the map values
from pyspark.sql.functions import map_keys
df.select(df.name, map_keys(df.properties)).show()

from pyspark.sql.functions import map_values
df.select(df.name, map_values(df.properties)).show()

# Collect the distinct map keys to build columns dynamically
#from pyspark.sql.functions import explode,map_keys
#keysDF = df.select(explode(map_keys(df.properties))).distinct()
#keysList = keysDF.rdd.map(lambda x:x[0]).collect()
#print(keysList)
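# For reference, a minimal construction of the `df` used above (assumed, not
# part of the original snippet): a `name` column plus a MapType `properties`
# column.
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, MapType

spark = SparkSession.builder.getOrCreate()
schema = StructType([
    StructField("name", StringType(), True),
    StructField("properties", MapType(StringType(), StringType()), True),
])
df = spark.createDataFrame(
    [("James", {"hair": "black", "eye": "brown"}),
     ("Anna", {"hair": "brown", "eye": None})],
    schema=schema)
df.printSchema()
df.show(truncate=False)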
def task_2(data_io, product_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    salesRank_column = 'salesRank'
    categories_column = 'categories'
    asin_column = 'asin'
    # Outputs:
    category_column = 'category'
    bestSalesCategory_column = 'bestSalesCategory'
    bestSalesRank_column = 'bestSalesRank'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    # category: first element of the nested categories array, empty strings mapped to NULL
    first_item_ = product_data['categories'][0][0]
    processed_ = product_data.withColumn(category_column, first_item_)
    null_ = F.when(processed_.category == '', None).otherwise(processed_.category)
    processed_ = processed_.withColumn(category_column, null_)

    # bestSalesCategory / bestSalesRank: first key/value pair of the salesRank map
    map_key = F.map_keys('salesRank')[0]
    processed_ = processed_.withColumn('bestSalesCategory', map_key)
    map_value = F.map_values('salesRank')[0]
    processed_ = processed_.withColumn('bestSalesRank', map_value)

    count_total, mean_bestSalesRank, variance_bestSalesRank = processed_.agg(
        F.count('asin'), F.mean('bestSalesRank'),
        F.variance('bestSalesRank')).collect()[0]

    countDistinct_category = processed_.filter(processed_["category"] != '') \
        .groupBy("category") \
        .agg(F.countDistinct("category")) \
        .count()

    sales = processed_.select('bestSalesCategory').filter(
        processed_.bestSalesCategory.isNotNull())

    numNulls_bestSalesCategory = processed_.agg(
        F.sum(F.isnull(processed_[bestSalesCategory_column]).cast("int"))).collect()[0][0]
    numNulls_category = processed_.agg(
        F.sum(F.isnull(processed_[category_column]).cast("int"))).collect()[0][0]
    countDistinct_bestSalesCategory = sales.agg(
        F.countDistinct(processed_.bestSalesCategory)).collect()[0][0]
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'mean_bestSalesRank': None,
        'variance_bestSalesRank': None,
        'numNulls_category': None,
        'countDistinct_category': None,
        'numNulls_bestSalesCategory': None,
        'countDistinct_bestSalesCategory': None
    }
    # Modify res:
    res['count_total'] = count_total
    res['mean_bestSalesRank'] = mean_bestSalesRank
    res['variance_bestSalesRank'] = variance_bestSalesRank
    res['numNulls_category'] = numNulls_category
    res['countDistinct_category'] = countDistinct_category
    res['numNulls_bestSalesCategory'] = numNulls_bestSalesCategory
    res['countDistinct_bestSalesCategory'] = countDistinct_bestSalesCategory
    print(res)
    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_2')
    return res