Example #1
def get_hierarchy_landownership(company_id: str):
    """
    Print DataFrame with company relations and landownserhip details.

    parameters:
    1. company_id: String name of the company whoes hierarchy and
    landownership is to be printed

    returns:
    dataframe of ownership and company relations results.
    """
    # Read company relations (the path is hardcoded for now).
    df_cr = extract_company_relations(
        "dbfs:/FileStore/tables/company_relations.txt")

    # Read landownership (the path is hardcoded for now).
    df_lo = extract_land_ownership("dbfs:/FileStore/tables/land_ownership.txt")

    # Build a GraphFrame combining company relations and landownership.
    gf = create_graph_company_relations_land(df_cr, df_lo)

    # Run the shortest-paths algorithm to get the distance from every node
    # to the company provided.
    results = gf.shortestPaths(landmarks=[company_id])

    # Derive a hierarchy level from the MapType 'distances' column
    # returned by shortestPaths.
    results = results.withColumn("hier_level", map_values("distances")[0])

    # Keep only rows whose hierarchy level is 0 or higher, i.e. drop rows
    # with no path to the company (NULL distances).
    results = results.where("hier_level >= 0")
    return results
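
# A minimal, self-contained sketch of the pattern above. Assumes the optional
# `graphframes` package is installed and an active SparkSession named `spark`;
# the vertex/edge data below are invented for illustration only.
from graphframes import GraphFrame
from pyspark.sql.functions import map_values

vertices = spark.createDataFrame(
    [("c1", "Acme Holdings"), ("c2", "Acme Ltd"), ("c3", "Acme Land LLC")],
    ["id", "name"])
edges = spark.createDataFrame(
    [("c1", "c2", "owns"), ("c2", "c3", "owns")],
    ["src", "dst", "relationship"])

gf = GraphFrame(vertices, edges)

# `distances` is a MapType column: {landmark_id -> number of hops}.
results = gf.shortestPaths(landmarks=["c3"])

# Extract the single landmark's distance as the hierarchy level and drop
# vertices that cannot reach it (missing map entry -> NULL).
results = (results
           .withColumn("hier_level", map_values("distances")[0])
           .where("hier_level >= 0"))
results.show()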
Example #2
from pyspark.sql import functions as F, types as T


def expand(df, c, n, sep='_'):
    """Expand the array/map column `c` into numbered scalar columns."""
    t = df.schema[c].dataType

    # Pick an extractor for the i-th element depending on the column type.
    if isinstance(t, T.ArrayType):
        ce = lambda i: F.col(c)[i]
    elif isinstance(t, T.MapType):
        ce = lambda i: F.map_values(c)[i]
    else:
        # Scalar column: every "expanded" column is just a copy of the original.
        ce = lambda i: F.col(c)

    # `n` may be an explicit collection of indices or a count.
    keys = n if isinstance(n, (list, tuple, range)) else range(n)
    sel = lambda name: [ce(i).alias(f'{name}{sep}{i}') for i in keys]
    # Replace column `c` with its expanded columns; keep all other columns as-is.
    cols = [sel(c) if x == c else [x] for x in df.columns]
    cols = [item for sublist in cols for item in sublist]
    return df.select(*cols)
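
# Quick usage sketch for `expand`, assuming a SparkSession named `spark`;
# the sample data is invented.
df = spark.createDataFrame(
    [("alice", [1, 2, 3], {"a": 10, "b": 20})],
    ["name", "scores", "props"])

# Array column -> scores_0, scores_1, scores_2
expand(df, "scores", 3).show()

# Map column -> props_0, props_1 (values only; map ordering is not guaranteed)
expand(df, "props", 2).show()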
Example #3
    def transform(self):
        """
        transforms the data with the given `self.schema` to the format suitable for the SQL queries

        :return: Pyspark DF
        """
        self.df = self.df\
            .withColumn('items', F.explode('items')) \
            .withColumn('ID', F.map_keys("items")[0]) \
            .withColumn('qp', F.map_values('items')[0]) \
            .withColumn('quantity', F.col('qp').getItem('quantity').astype('int')) \
            .withColumn('price', F.col('qp').getItem('price')) \
            .select(['user', 'timestamp', 'ID', 'quantity', 'price'])

        return self.df
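
# For context: the same transformation applied to hand-made data. The
# SparkSession `spark` and the input shape below are assumptions for
# illustration, not part of the original class.
from pyspark.sql import functions as F

# One row per user; `items` is an array of single-entry maps
# {ID -> {quantity, price}}.
df = spark.createDataFrame(
    [("u1", "2021-01-01 12:00:00",
      [{"item42": {"quantity": "2", "price": "9.99"}},
       {"item7": {"quantity": "1", "price": "4.50"}}])],
    ["user", "timestamp", "items"])

out = (df
       .withColumn("items", F.explode("items"))
       .withColumn("ID", F.map_keys("items")[0])
       .withColumn("qp", F.map_values("items")[0])
       .withColumn("quantity", F.col("qp").getItem("quantity").astype("int"))
       .withColumn("price", F.col("qp").getItem("price"))
       .select("user", "timestamp", "ID", "quantity", "price"))
out.show()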
Example #4
from pyspark.sql import functions as F
from pyspark.sql.window import Window


def parquet_revalue(vcf, indel_com):
    # Full outer join on chromosome and position; the last column is the sample column.
    temp = indel_com.join(vcf, ["#CHROM", "POS"], "full")
    sample_name = temp.columns[-1]

    # Forward-fill NULL sample values within each chromosome, ordered by position.
    sample_w = Window.partitionBy(F.col("#CHROM")).orderBy(
        F.col("POS")).rangeBetween(Window.unboundedPreceding,
                                   Window.currentRow)
    temp = temp.withColumn(
        sample_name,
        F.last(sample_name,
               ignorenulls=True).over(sample_w)).withColumnRenamed(
                   "#CHROM", "CHROM")

    # Rows whose sample-map keys do not match FORMAT: rebuild the value list
    # with the Scala UDF `index2dict` (registered elsewhere).
    null_not_value = temp.filter(F.map_keys(F.col(sample_name)) != F.col("FORMAT"))\
                         .selectExpr("CHROM", "POS","index2dict({}, FORMAT) as {}".format(sample_name, sample_name))\
                         .withColumn(sample_name,  F.concat(F.lit("./.:"), F.array_join(F.col(sample_name), ":")))

    # Rows whose sample-map keys already match FORMAT: just join the map values.
    null_value = temp.filter(F.map_keys(F.col(sample_name)) == F.col("FORMAT")).drop("FORMAT")\
                     .withColumn(sample_name, F.concat(F.lit("./.:"), F.array_join(F.map_values(F.col(sample_name)), ":")))

    value_union = null_not_value.union(null_value).withColumnRenamed(
        "CHROM", "#CHROM")
    return value_union
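
# The core trick above is a window-based forward fill. A standalone sketch of
# just that pattern, with invented data and an assumed SparkSession `spark`.
from pyspark.sql import functions as F
from pyspark.sql.window import Window

df = spark.createDataFrame(
    [("chr1", 100, "0/1:12"), ("chr1", 150, None),
     ("chr1", 200, None), ("chr1", 250, "1/1:30")],
    ["CHROM", "POS", "SAMPLE"])

w = (Window.partitionBy("CHROM")
     .orderBy("POS")
     .rangeBetween(Window.unboundedPreceding, Window.currentRow))

# F.last(..., ignorenulls=True) carries the most recent non-NULL value forward.
df.withColumn("SAMPLE", F.last("SAMPLE", ignorenulls=True).over(w)).show()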
Example #5
def get_spark_commits(date_str):
    from datetime import datetime, timedelta

    # 2.1: Change the github_api_url so that it queries with the input date
    # Convert the input date string into a date and compute the following day
    fromDate = datetime.strptime(date_str, '%Y%m%d').date()
    toDate = fromDate + timedelta(days=1)

    # Construct the GitHub API URL to fetch the commit JSON object(s)
    request = 'https://api.github.com/repos/apache/spark/commits?since=' + str(
        fromDate) + 'T00:00:00Z&until=' + str(toDate) + 'T00:00:00Z'
    print('Beginning file download from: ' + request)

    import urllib.request, urllib.error
    try:
        # Fetch the JSON object(s) from the GitHub API
        response = urllib.request.urlopen(request)
    except urllib.error.HTTPError as e:
        # Return code error (e.g. 404, 501, ...)
        print('HTTPError: {}'.format(e.code))
        return None
    except urllib.error.URLError as e:
        # Not an HTTP-specific error (e.g. connection refused)
        print('URLError: {}'.format(e.reason))
        return None
    else:
        # 200 OK
        sourceJSON = response.read()

    import json
    import pandas as pd

    # response.read() returns a bytes object (a raw sequence of bytes), so it
    # must be decoded before json.loads can parse it.
    jsonData = json.loads(sourceJSON.decode('utf-8'))

    from pyspark import SparkContext
    # Create a SparkContext directly by passing the config parameters
    sc = SparkContext("local[*]", "PySpark Electronic Arts Test")

    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F
    spark = SparkSession(sc)
    # Create a Spark DataFrame from the Pandas DataFrame (Arrow is used when enabled).
    # A Pandas DataFrame is not distributed; it lives on the driver node only.
    # To achieve parallelism the data must be distributed across the cluster,
    # which is exactly what a Spark DataFrame does.
    source_df = spark.createDataFrame(pd.DataFrame(jsonData))

    source_df.printSchema()
    source_df.show()

    from pyspark.sql.types import DateType, IntegerType

    # Create a new DataFrame by selecting only a few key/value pairs from the original JSON object(s)
    jsonDF = source_df.select(source_df.sha.alias('sha') \
                              , source_df.author.login.alias('login_name') \
                              , source_df.committer.id.cast(IntegerType()).alias('commiter_id') \
                              , F.concat_ws(' ', F.map_values(source_df.commit.message)).alias('message') \
                              , source_df.commit.author.date.cast(DateType()).alias('commit_date') \
                              , source_df.commit.author.email.alias('email') \
                              , F.substring_index(source_df.commit.author.email, '@', -1).alias('email_company') \
                              , source_df.url.alias('url'))

    # Save this DataFrame in memory as it will be used multiple times in the future
    jsonDF.cache()
    jsonDF.printSchema()
    jsonDF.show()

    # Set Parameters for PostgreSQL Database Connection
    url_connect = "jdbc:postgresql://pa1postgreserver.postgres.database.azure.com:5432/postgres?"
    commitTable = "F_SPARK_COMMITS"
    authorTable = "F_SPARK_AUTHORS"
    mode = "append"
    db_properties = {
        "user": "******",
        "password": "******",
        "driver": "org.postgresql.Driver"
    }

    # Read the Authors Table from PostgreSQL DB into a Spark DataFrame Object
    readAuthorTableDF = spark.read.jdbc(url=url_connect,
                                        table=authorTable,
                                        properties=db_properties)

    # Check whether the Authors table is empty.
    # If the DB table is empty, insert the authors DataFrame directly.
    # If it is not empty, join the two author sets and filter out the authors
    # that already exist in the DB table, so only new author records are inserted.
    if len(readAuthorTableDF.head(1)) > 0:
        authDF = jsonDF.join(readAuthorTableDF, jsonDF.login_name == readAuthorTableDF.login_name, how='left') \
            .filter(readAuthorTableDF.login_name.isNull()) \
            .select(jsonDF.login_name \
                    , jsonDF.commiter_id \
                    , jsonDF.email \
                    , jsonDF.email_company)
    else:
        authDF = jsonDF.select(jsonDF.login_name \
                               , jsonDF.commiter_id \
                               , jsonDF.email \
                               , jsonDF.email_company)
    authDF.write.jdbc(url=url_connect,
                      table=authorTable,
                      mode="append",
                      properties=db_properties)
    authDF.show()

    # Read the Authors table again after inserting the new authors
    readAuthorTableDF = spark.read.jdbc(url=url_connect,
                                        table=authorTable,
                                        properties=db_properties)
    # Read the Commits Table from PostgreSQL DB into a Spark DataFrame Object before Update
    readCommitTableDF = spark.read.jdbc(url=url_connect,
                                        table=commitTable,
                                        properties=db_properties)
    # Join the DataFrame created from the source JSON with the authors table contents.
    # An inner join keeps only the records whose committer id exists in the authors table.
    commitDF = jsonDF.join(readAuthorTableDF,
                           jsonDF.commiter_id == readAuthorTableDF.commiter_id,
                           how='inner')

    from pyspark.sql import Row

    # Check whether the Commits table is empty.
    # If the DB table is empty, insert the commits DataFrame directly.
    # If it is not empty, look up the last execution date recorded in the table
    # and filter out records already loaded on the current date; this keeps the
    # process idempotent. Only new commit records are inserted into the DB table.
    if len(readCommitTableDF.head(1)) > 0:
        maxDate = readCommitTableDF.orderBy(
            readCommitTableDF.creation_date.desc()).head(1)[0].creation_date
        commitDF = commitDF.filter(
            F.current_timestamp().cast(DateType()) != maxDate).select(
                jsonDF.sha, jsonDF.url, jsonDF.message, jsonDF.commit_date,
                readAuthorTableDF.author_id, readAuthorTableDF.creation_date)
    else:
        commitDF = commitDF.select(jsonDF.sha, jsonDF.url, jsonDF.message,
                                   jsonDF.commit_date,
                                   readAuthorTableDF.author_id,
                                   readAuthorTableDF.creation_date)
    commitDF.show()
    commitDF.write.jdbc(url=url_connect,
                        table=commitTable,
                        mode="append",
                        properties=db_properties)
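
# The "insert only new authors" step above is an anti-join expressed as a left
# join plus an isNull filter. A compact sketch of the same idea with invented
# data, assuming a SparkSession named `spark`:
new_authors = spark.createDataFrame(
    [("alice", 1), ("bob", 2), ("carol", 3)], ["login_name", "commiter_id"])
existing = spark.createDataFrame([("alice", 1)], ["login_name", "commiter_id"])

# "left_anti" keeps the rows of new_authors that have no match in existing --
# the same effect as the left join + isNull filter used above.
to_insert = new_authors.join(existing, on="login_name", how="left_anti")
to_insert.show()  # only bob and carol remain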
Example #6
def task_2(data_io, product_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    salesRank_column = 'salesRank'
    categories_column = 'categories'
    asin_column = 'asin'
    # Outputs:
    category_column = 'category'
    bestSalesCategory_column = 'bestSalesCategory'
    bestSalesRank_column = 'bestSalesRank'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------

    ### category
    data = product_data.withColumn(
        category_column,
        F.when(F.col(categories_column)[0][0] == '',
               None).otherwise(F.col(categories_column)[0][0]))
    category_nulls = data.filter(F.col(category_column).isNull()).count()
    category_distinct = data.agg(F.countDistinct(
        F.col(category_column))).head()[0]

    ### salesRank and salesCategory
    key_and_values = data.select(
        asin_column, category_column,
        F.map_keys(salesRank_column)[0].alias(bestSalesCategory_column),
        F.map_values(salesRank_column)[0].alias(bestSalesRank_column))

    mean_of_salesRank = key_and_values.select(
        F.avg(F.col(bestSalesRank_column))).head()[0]
    variance_of_salesRank = key_and_values.select(
        F.variance(F.col(bestSalesRank_column))).head()[0]

    salesCategory_nulls = key_and_values.filter(
        F.col(bestSalesCategory_column).isNull()).count()
    salesCategory_distinct = key_and_values.agg(
        F.countDistinct(F.col(bestSalesCategory_column))).head()[0]

    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'mean_bestSalesRank': None,
        'variance_bestSalesRank': None,
        'numNulls_category': None,
        'countDistinct_category': None,
        'numNulls_bestSalesCategory': None,
        'countDistinct_bestSalesCategory': None
    }
    # Modify res:

    res['count_total'] = data.count()
    res['mean_bestSalesRank'] = mean_of_salesRank
    res['variance_bestSalesRank'] = variance_of_salesRank
    res['numNulls_category'] = category_nulls
    res['countDistinct_category'] = category_distinct
    res['numNulls_bestSalesCategory'] = salesCategory_nulls
    res['countDistinct_bestSalesCategory'] = salesCategory_distinct

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_2')
    return res
def task_2(data_io, product_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    salesRank_column = 'salesRank'
    categories_column = 'categories'
    asin_column = 'asin'
    # Outputs:
    category_column = 'category'
    bestSalesCategory_column = 'bestSalesCategory'
    bestSalesRank_column = 'bestSalesRank'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    res_df = product_data.select(
        product_data.categories[0][0].alias(category_column),
        F.map_keys(product_data.salesRank)[0].alias(bestSalesCategory_column),
        F.map_values(
            product_data.salesRank)[0].alias(bestSalesRank_column)).replace(
                {'': None},
                subset=[
                    category_column, bestSalesCategory_column,
                    bestSalesRank_column
                ])

    stats = res_df.agg(
        F.count("*").alias('count_total'),
        F.avg(bestSalesRank_column).alias('mean_bestSalesRank'),
        F.variance(bestSalesRank_column).alias('variance_bestSalesRank'),
        F.sum(
            F.isnull(category_column).cast('int')).alias('numNulls_category'),
        F.countDistinct(res_df.category).alias('countDistinct_category'),
        F.sum(F.isnull(bestSalesCategory_column).cast('int')).alias(
            'numNulls_bestSalesCategory'),
        F.countDistinct(res_df.bestSalesCategory).alias(
            'countDistinct_bestSalesCategory')).head()

    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'mean_bestSalesRank': None,
        'variance_bestSalesRank': None,
        'numNulls_category': None,
        'countDistinct_category': None,
        'numNulls_bestSalesCategory': None,
        'countDistinct_bestSalesCategory': None
    }
    # Modify res:

    res['count_total'] = int(stats[0])
    res['mean_bestSalesRank'] = float(stats[1])
    res['variance_bestSalesRank'] = float(stats[2])
    res['numNulls_category'] = int(stats[3])
    res['countDistinct_category'] = int(stats[4])
    res['numNulls_bestSalesCategory'] = int(stats[5])
    res['countDistinct_bestSalesCategory'] = int(stats[6])
    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_2')
    return res
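
# To make the two implementations above concrete, a toy `product_data` frame
# with the assumed `categories`/`salesRank` shapes (invented values, and a
# SparkSession named `spark` is assumed):
from pyspark.sql import functions as F

product_data = spark.createDataFrame(
    [("B001", [["Books", "Fiction"]], {"Books": 120}),
     ("B002", [[""]], {}),
     ("B003", [["Electronics"]], None)],
    ["asin", "categories", "salesRank"])

product_data.select(
    product_data.categories[0][0].alias("category"),
    F.map_keys("salesRank")[0].alias("bestSalesCategory"),
    F.map_values("salesRank")[0].alias("bestSalesRank")).show()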
df3 = df.rdd.map(lambda x:
                 (x.name, x.properties["hair"], x.properties["eye"])) \
    .toDF(["name", "hair", "eye"])
df3.printSchema()
df3.show()

df.withColumn("hair",df.properties.getItem("hair")) \
  .withColumn("eye",df.properties.getItem("eye")) \
  .drop("properties") \
  .show()

df.withColumn("hair",df.properties["hair"]) \
  .withColumn("eye",df.properties["eye"]) \
  .drop("properties") \
  .show()

from pyspark.sql.functions import explode
df.select(df.name,explode(df.properties)).show()

from pyspark.sql.functions import map_keys
df.select(df.name,map_keys(df.properties)).show()

from pyspark.sql.functions import map_values
df.select(df.name,map_values(df.properties)).show()

#from pyspark.sql.functions import explode,map_keys
#keysDF = df.select(explode(map_keys(df.properties))).distinct()
#keysList = keysDF.rdd.map(lambda x:x[0]).collect()
#print(keysList)
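
# Building on the commented-out snippet above: once the distinct map keys are
# collected, each key can be promoted to its own column. A minimal sketch,
# assuming `df` has the MapType column `properties` used throughout this example.
from pyspark.sql.functions import explode, map_keys, col

keysDF = df.select(explode(map_keys(df.properties))).distinct()
keysList = [row[0] for row in keysDF.collect()]

df.select(df.name,
          *[col("properties").getItem(k).alias(k) for k in keysList]).show()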

def task_2(data_io, product_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    salesRank_column = 'salesRank'
    categories_column = 'categories'
    asin_column = 'asin'
    # Outputs:
    category_column = 'category'
    bestSalesCategory_column = 'bestSalesCategory'
    bestSalesRank_column = 'bestSalesRank'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    first_item_ = product_data['categories'][0][0]
    procesed_ = product_data.withColumn(category_column, first_item_)
    null_ = F.when(procesed_.category == '',
                   None).otherwise(procesed_.category)
    procesed_ = procesed_.withColumn(category_column, null_)
    map_key = F.map_keys('salesRank')[0]
    procesed_ = procesed_.withColumn('bestSalesCategory', map_key)
    map_value = F.map_values('salesRank')[0]
    procesed_ = procesed_.withColumn('bestSalesRank', map_value)
    count_total, mean_bestSalesRank, variance_bestSalesRank = procesed_.agg(
        F.count('asin'), F.mean('bestSalesRank'),
        F.variance('bestSalesRank')).collect()[0]

    countDistinct_category = (procesed_
                              .filter(procesed_["category"] != '')
                              .groupBy("category")
                              .agg(F.countDistinct("category"))
                              .count())

    sales = procesed_.select('bestSalesCategory').filter( \
        procesed_.bestSalesCategory.isNotNull())

    numNulls_bestSalesCategory = procesed_.agg(
        F.sum(F.isnull(procesed_[bestSalesCategory_column]).cast("int"))).collect()[0][0]
    numNulls_category = procesed_.agg(
        F.sum(F.isnull(procesed_[category_column]).cast("int"))).collect()[0][0]
    countDistinct_bestSalesCategory = sales.agg(
        F.countDistinct(procesed_.bestSalesCategory)).collect()[0][0]

    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'mean_bestSalesRank': None,
        'variance_bestSalesRank': None,
        'numNulls_category': None,
        'countDistinct_category': None,
        'numNulls_bestSalesCategory': None,
        'countDistinct_bestSalesCategory': None
    }
    # Modify res:
    res['count_total'] = count_total
    res['mean_bestSalesRank'] = mean_bestSalesRank
    res['variance_bestSalesRank'] = variance_bestSalesRank
    res['numNulls_category'] = numNulls_category
    res['countDistinct_category'] = countDistinct_category
    res['numNulls_bestSalesCategory'] = numNulls_bestSalesCategory
    res['countDistinct_bestSalesCategory'] = countDistinct_bestSalesCategory

    print(res)

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_2')
    return res