def main():
    print (f"""Getting average yearly prices per region for all""")
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    spark = s.setSparkConfBQ(spark)
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    print("\nStarted at");uf.println(lst)
    wSpecY = Window().partitionBy(F.date_format('Date',"yyyy"), 'regionname')
    house_df = s.loadTableFromBQ(spark,config['GCPVariables']['sourceDataset'],config['GCPVariables']['sourceTable'])
    house_df.printSchema()
    house_df.show(2, False)

    print(f"""\nAnnual House prices per regions in GBP""")
    # Work out yearly average prices
    df2 = house_df. \
                    select( \
                          F.date_format('Date', 'yyyy').cast("Integer").alias('year') \
                        , 'regionname' \
                        , round(F.avg('averageprice').over(wSpecY)).alias('AVGPricePerYear') \
                        , round(F.avg('flatprice').over(wSpecY)).alias('AVGFlatPricePerYear') \
                        , round(F.avg('TerracedPrice').over(wSpecY)).alias('AVGTerracedPricePerYear') \
                        , round(F.avg('SemiDetachedPrice').over(wSpecY)).alias('AVGSemiDetachedPricePerYear') \
                        , round(F.avg('DetachedPrice').over(wSpecY)).alias('AVGDetachedPricePerYear')). \
                    distinct().orderBy('year', ascending=True)
    df2.show(20,False)
    s.writeTableToBQ(df2,"overwrite",config['GCPVariables']['targetDataset'],config['GCPVariables']['yearlyAveragePricesAllTable'])
    print(f"""created {config['GCPVariables']['yearlyAveragePricesAllTable']}""")
    lst = (spark.sql("SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")).collect()
    print("\nFinished at");uf.println(lst)
Example #2
def SendToBigQuery(df, batchId):

    if (len(df.take(1))) > 0:
        #df.printSchema()
        df.persist()
        # read from redis table
        spark_session = s.spark_session(config['common']['appName'])
        spark_session = s.setSparkConfBQ(spark_session)
        # read from BigQuery
        read_df = s.loadTableFromBQ(spark_session,
                                    config['MDVariables']['targetDataset'],
                                    config['MDVariables']['targetTable'])
        #read_df = s.loadTableFromRedis(spark_session, config['RedisVariables']['targetTable'], config['RedisVariables']['keyColumn'])
        # Write data to config['MDVariables']['targetTable'] in BigQuery
        # look for high value tickers
        for row in df.rdd.collect():
            rowkey = row.rowkey
            ticker = row.ticker
            price = row.price
            values = bigQueryAverages(ticker, price, read_df)
            Average = values["average"]
            standardDeviation = values["standardDeviation"]
            lower = values["lower"]
            upper = values["upper"]
            if lower is not None and upper is not None:
                hvTicker = priceComparison(ticker, price, lower, upper)
                if (hvTicker == 1):
                    writeHighValueData(df, rowkey)
        df.unpersist()
    else:
        print("DataFrame is empty")
Example #3
def extractHiveData():
    print(f"""Getting average yearly prices per region for all""")
    # read data through jdbc from Hive
    spark_session = s.spark_session(config['common']['appName'])
    tableName = config['GCPVariables']['sourceTable']
    fullyQualifiedTableName = config['hiveVariables']['DSDB'] + '.' + tableName
    print("reading from Hive table")
    house_df = s.loadTableFromHiveJDBC(spark_session, fullyQualifiedTableName)
    # sample data equally n rows from Kensington and Chelsea and n rows from City of Westminster
    num_rows = int(config['MysqlVariables']['read_df_rows'] / 2)
    house_df = house_df.filter(col(
        "regionname") == "Kensington and Chelsea").limit(num_rows).unionAll(
            house_df.filter(
                col("regionname") == "City of Westminster").limit(num_rows))
    return house_df
Example #4
def main():
    print(f"""Getting average yearly prices per region for all""")
    # read data through jdbc from Oracle

    appName = config['common']['appName']
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nStarted at")
    uf.println(lst)
    wSpecY = Window().partitionBy(F.date_format('datetaken', "yyyy"),
                                  'regionname')
    tableName = config['OracleVariables']['sourceTable']
    fullyQualifiedTableName = config['OracleVariables'][
        'dbschema'] + '.' + tableName
    print("reading from Oracle table")
    house_df = s.loadTableFromOracleJDBC(spark, fullyQualifiedTableName)
    house_df.printSchema()
    house_df.show(5, False)
    print(f"""\nAnnual House prices per regions in GBP""")
    # Work out yearly average prices
    df2 = house_df. \
                    select( \
                          F.date_format('datetaken','yyyy').cast("Integer").alias('YEAR') \
                        , 'REGIONNAME' \
                        , round(F.avg('averageprice').over(wSpecY)).alias('AVGPRICEPERYEAR') \
                        , round(F.avg('flatprice').over(wSpecY)).alias('AVGFLATPRICEPERYEAR') \
                        , round(F.avg('TerracedPrice').over(wSpecY)).alias('AVGTERRACEDPRICEPERYEAR') \
                        , round(F.avg('SemiDetachedPrice').over(wSpecY)).alias('AVGSDPRICEPERYEAR') \
                        , round(F.avg('DetachedPrice').over(wSpecY)).alias('AVGDETACHEDPRICEPERYEAR')). \
                    distinct().orderBy('YEAR', ascending=True)
    df2.printSchema()
    df2.show(20, False)
    # write to Oracle table; column names must be all uppercase (not mixed case) and <= 30 characters in version 12.1
    s.writeTableToOracle(
        df2, "overwrite", config['OracleVariables']['dbschema'],
        config['OracleVariables']['yearlyAveragePricesAllTable'])
    print(
        f"""created {config['OracleVariables']['yearlyAveragePricesAllTable']}"""
    )
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nFinished at")
    uf.println(lst)
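
Given the Oracle 12.1 restriction noted in the comment (identifiers in upper case and no longer than 30 characters), a small guard before writeTableToOracle can fail fast on offending column names. A minimal sketch, assuming df2 as built above:

# hypothetical pre-write check for Oracle 12.1 identifier limits
tooLong = [c for c in df2.columns if len(c) > 30]
if tooLong:
    raise ValueError(f"column names exceed Oracle's 30-character limit: {tooLong}")
# force upper-case names so Oracle does not create quoted mixed-case identifiers
df2 = df2.toDF(*[c.upper() for c in df2.columns])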
Example #5
def sendToControl(dfnewtopic, batchId):
    if (len(dfnewtopic.take(1))) > 0:
        #print(f"""newtopic batchId is {batchId}""")
        dfnewtopic.show(100, False)
        queue = dfnewtopic.select(col("queue")).collect()[0][0]
        status = dfnewtopic.select(col("status")).collect()[0][0]
        #print(f"""{queue}, {status}""")
        if (queue == config['MDVariables']['topic']) and (status == 'false'):
            spark_session = s.spark_session(config['common']['appName'])
            active = spark_session.streams.active
            for e in active:
                #print(e)
                name = e.name
                if (name == config['MDVariables']['topic']):
                    print(f"""Terminating streaming process {name}""")
                    e.stop()
    else:
        print("DataFrame newtopic is empty")
def readSourceData():
    # read source table
    table = ctest['statics']['dbschema'] + '.' + ctest['statics']['sourceTable']
    spark_session = s.spark_session(ctest['common']['appName'])
    # Read the test table
    try:
        read_df = spark_session.read. \
            format("jdbc"). \
            option("url", test_url). \
            option("driver", ctest['statics']['driver']). \
            option("dbtable", table). \
            option("user", ctest['statics']['user']). \
            option("password", ctest['statics']['password']). \
            option("fetchsize", ctest['statics']['fetchsize']). \
            load()
        return read_df
    except Exception as e:
        print(f"""{e}, quitting""")
        sys.exit(1)
def readSavedData():
    # read target table to tally the result
    table = ctest['statics']['dbschema'] + '.' + ctest['statics'][
        'yearlyAveragePricesAllTable']
    spark_session = s.spark_session(ctest['common']['appName'])
    try:
        readSavedData_df = spark_session.read. \
            format("jdbc"). \
            option("url", test_url). \
            option("driver", ctest['statics']['driver']). \
            option("dbtable", table). \
            option("user", ctest['statics']['user']). \
            option("password", ctest['statics']['password']). \
            option("fetchsize", ctest['statics']['fetchsize']). \
            load()
        return readSavedData_df
    except Exception as e:
        print(f"""{e}, quitting""")
        sys.exit(1)
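
Since readSavedData() exists to tally the saved yearly averages against the source, a simple follow-up check is to compare the saved row count with the number of distinct (year, region) pairs in the source. A minimal sketch, assuming the source table carries the Date and regionname columns used elsewhere in these examples:

from pyspark.sql import functions as F

# hypothetical tally: one saved row is expected per (year, region) in the source
src_df = readSourceData()
saved_df = readSavedData()
expected = src_df.select(F.date_format('Date', 'yyyy').alias('year'), 'regionname').distinct().count()
actual = saved_df.count()
assert expected == actual, f"expected {expected} rows, found {actual}"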
Example #8
class S1:
    appName = "app1"
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    df = spark.createDataFrame([("a", 0), ("a", 1), ("b", 30), ("b", -50)],
                               ["group", "power"])

    def below_threshold(threshold, group="group", power="power"):
        @pandas_udf("struct<group: string, below_threshold: boolean>",
                    PandasUDFType.GROUPED_MAP)
        def below_threshold_(df):
            df = pd.DataFrame(
                df.groupby(group).apply(lambda x:
                                        (x[power] < threshold).any()))
            df.reset_index(inplace=True, drop=False)
            return df

        return below_threshold_

    df.groupBy("group").apply(below_threshold(-40)).show()
Example #9
def main():
    regionname = sys.argv[1]  ## parameter passed
    short = regionname.replace(" ", "").lower()
    print(f"""Creating Yearly percentage tables for {regionname}""")
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    # Get data from BigQuery table
    tableName = "yearlyaveragepricesAllTable"
    start_date = "2010"
    end_date = "2020"
    yearTable = f"""{config['GCPVariables']['percentYearlyHousePriceChange']}_{short}"""
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nStarted at")
    uf.println(lst)
    spark = s.setSparkConfBQ(spark)
    read_df = s.loadTableFromBQ(
        spark, config['GCPVariables']['targetDataset'],
        config['GCPVariables']['yearlyAveragePricesAllTable'])
    house_df = read_df.filter(
        (col("Year").between(f'{start_date}', f'{end_date}'))
        & (lower(col("regionname")) == f'{regionname}'.lower()))
    wSpecPY = Window().orderBy('regionname', 'Year')
    df_lagY = house_df.withColumn(
        "prev_year_value",
        F.lag(house_df['AVGPricePerYear']).over(wSpecPY))
    resultY = df_lagY.withColumn('percent_change', F.when(F.isnull(house_df.AVGPricePerYear - df_lagY.prev_year_value), 0). \
                             otherwise(F.round(((house_df.AVGPricePerYear - df_lagY.prev_year_value) * 100.) / df_lagY.prev_year_value, 1)))
    print(f"""\nYear House price changes in {regionname} in GBP""")
    rsY = resultY.select('Year', 'AVGPricePerYear', 'prev_year_value',
                         'percent_change')
    rsY.show(36, False)
    s.writeTableToBQ(rsY, "overwrite", config['GCPVariables']['targetDataset'],
                     yearTable)
    print(f"""Created {yearTable}""")
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nFinished at")
    uf.println(lst)
Example #10
def main():
    print(f"""Getting average yearly prices per region for all""")
    regionname = sys.argv[1]  ## parameter passed
    short = regionname.replace(" ", "").lower()
    print(f"""Getting Yearly percentages tables for {regionname}""")
    appName = "ukhouseprices"
    spark = s.spark_session(appName)
    # Get data from BigQuery table
    tableName = f"""{config['GCPVariables']['percentYearlyHousePriceChange']}_{short}"""
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    spark = s.setSparkConfBQ(spark)
    print("\nStarted at")
    uf.println(lst)
    read_df = s.loadTableFromBQ(spark, config['GCPVariables']['targetDataset'],
                                tableName)
    summary_df = read_df.select(
        col("Year"),
        col("percent_change").alias("PercentYearlyChange"))
    p_df = summary_df.toPandas()
    print(p_df)
    p_df.plot(kind='bar', stacked=False, x='Year', y=['PercentYearlyChange'])
    plt.xlabel("Year", fontdict=config['plot_fonts']['font'])
    plt.ylabel("Annual Percent Property Price change",
               fontdict=config['plot_fonts']['font'])
    plt.title(
        f"""Property price fluctuations in {regionname} for the past 10 years """,
        fontdict=config['plot_fonts']['font'])
    plt.margins(0.15)
    plt.subplots_adjust(bottom=0.25)
    plt.show()
    plt.close()
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nFinished at")
    uf.println(lst)
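
toPandas() pulls the whole result onto the driver, so it pays to keep the selection narrow and to enable Arrow-based conversion, which later examples here switch on as well. A minimal sketch, assuming the same spark session and summary_df:

# hypothetical: enable Arrow before converting to pandas for plotting
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
p_df = summary_df.select("Year", "PercentYearlyChange").toPandas()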
Example #11
from pyspark.sql.functions import lag
from sparkutils import sparkstuff as s
from misc import usedFunctions as uf
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
import pandas as pd
from pandas.plotting import scatter_matrix
import locale
locale.setlocale(locale.LC_ALL, 'en_GB')
try:
    import variables as v
except ModuleNotFoundError:
    from conf import parameters as v

appName = "ukhouseprices"
spark = s.spark_session(appName)
spark.sparkContext._conf.setAll(v.settings)
sc = s.sparkcontext()
#
# Get data from Hive table
regionname = "Kensington and Chelsea"
tableName = "ukhouseprices"
fullyQualifiedTableName = v.DSDB + '.' + tableName
summaryTableName = v.DSDB + '.' + 'summary'
start_date = "2010-01-01"
end_date = "2020-01-01"
lst = (spark.sql(
    "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
       ).collect()
print("\nStarted at")
uf.println(lst)
Example #12
def main():
    regionname = sys.argv[1]  ## parameter passed
    short = regionname.replace(" ", "").lower()
    print(f"""Getting plots for {regionname}""")
    appName = "ukhouseprices"
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    #
    # Get data from BigQuery table
    summaryTableName = v.fullyQualifiedoutputTableId
    start_date = "201001"
    end_date = "202001"
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nStarted at")
    uf.println(lst)
    # Model predictions
    spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
    # read data from the Bigquery table summary
    print("\nreading data from " + v.fullyQualifiedoutputTableId)

    summary_df = spark.read. \
                  format("bigquery"). \
                  option("credentialsFile",v.jsonKeyFile). \
                  option("project", v.projectId). \
                  option("parentProject", v.projectId). \
                  option("dataset", v.targetDataset). \
                  option("table", v.targetTable). \
        load()
    df_10 = summary_df.filter(F.col("Date").between(f'{start_date}', f'{end_date}')). \
        select(F.date_format('Date',"yyyyMM").cast("Integer").alias("date"), 'flatprice', 'terracedprice', 'semidetachedprice', 'detachedprice')
    df_10.printSchema()
    print(df_10.toPandas().columns.tolist())
    p_dfm = df_10.toPandas()  # converting spark DF to Pandas DF
    # Non-Linear Least-Squares Minimization and Curve Fitting

    # Define model to be Lorentzian and deploy it
    model = LorentzianModel()
    n = len(p_dfm.columns)
    for i in range(n):
        if p_dfm.columns[i] != "date":  # yyyyMM is x axis in integer
            # it goes through the loop and plots individual average curves one by one and then prints a report for each y value
            vcolumn = p_dfm.columns[i]
            print(vcolumn)
            params = model.guess(p_dfm[vcolumn], x=p_dfm['date'])
            result = model.fit(p_dfm[vcolumn], params, x=p_dfm['date'])
            result.plot_fit()
            plt.margins(0.15)
            plt.subplots_adjust(bottom=0.25)
            plt.xticks(rotation=90)
            plt.xlabel("year/month", fontdict=v.font)
            plt.text(0.35,
                     0.45,
                     "Best-fit based on Non-Linear Lorentzian Model",
                     transform=plt.gca().transAxes,
                     color="grey",
                     fontsize=9)
            plt.xlim(left=200900)
            plt.xlim(right=202100)
            if vcolumn == "flatprice": property = "Flat"
            if vcolumn == "terracedprice": property = "Terraced"
            if vcolumn == "semidetachedprice": property = "semi-detached"
            if vcolumn == "detachedprice": property = "detached"
            plt.ylabel(f"""{property} house prices in millions/GBP""",
                       fontdict=v.font)
            plt.title(
                f"""Monthly {property} prices fluctuations in {regionname}""",
                fontdict=v.font)
            print(result.fit_report())
            plt.show()
            plt.close()

    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nFinished at")
    uf.println(lst)
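
The guess/fit/plot_fit/fit_report calls above come from lmfit. A self-contained sketch of the same flow on synthetic data shows the shape of the API without the BigQuery dependency; the synthetic series below is purely an assumption for illustration:

import numpy as np
import matplotlib.pyplot as plt
from lmfit.models import LorentzianModel

# synthetic Lorentzian-shaped data with a little noise, for illustration only
x = np.linspace(-10, 10, 201)
y = 3.0 / (1 + ((x - 1.5) / 2.0) ** 2) + np.random.normal(0, 0.05, x.size)

model = LorentzianModel()
params = model.guess(y, x=x)        # initial parameter estimates
result = model.fit(y, params, x=x)  # non-linear least-squares fit
print(result.fit_report())
result.plot_fit()
plt.show()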
Example #13
def main():
        appName = "ukhouseprices"
        spark = s.spark_session(appName)
        spark.sparkContext._conf.setAll(v.settings)
        sc = s.sparkcontext()
        #
        # Get data from Hive table
        regionname = "Kensington and Chelsea"
        tableName = "ukhouseprices"
        fullyQualifiedTableName = v.DSDB + "." + tableName
        summaryTableName = v.DSDB + "." + "summary"
        start_date = "2010"
        end_date = "2020"
        lst = (spark.sql(
            "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') "
        )).collect()
        print("\nStarted at")
        uf.println(lst)
        # Model predictions
        spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
        #summary_df = spark.sql(f"""SELECT cast(date_format(datetaken, "yyyyMM") as int) as datetaken, flatprice, terracedprice, semidetachedprice, detachedprice FROM {summaryTableName}""")
        summary_df = spark.sql(
            f"""SELECT cast(Year as int) as year, AVGFlatPricePerYear, AVGTerracedPricePerYear, AVGSemiDetachedPricePerYear, AVGDetachedPricePerYear FROM {v.DSDB}.yearlyhouseprices"""
        )
        df_10 = summary_df.filter(
            col("year").between(f'{start_date}', f'{end_date}'))
        print(df_10.toPandas().columns.tolist())

        # show pandas column list ['Year', 'AVGPricePerYear', 'AVGFlatPricePerYear', 'AVGTerracedPricePerYear', 'AVGSemiDetachedPricePerYear', 'AVGDetachedPricePerYear']
        p_dfm = df_10.toPandas()  # converting spark DF to Pandas DF
        data = p_dfm.values

        # Non-Linear Least-Squares Minimization and Curve Fitting
        model = LorentzianModel()
        n = len(p_dfm.columns)
        for i in range(n):
            if p_dfm.columns[i] != 'year':  # year is x axis in integer
                # it goes through the loop and plots individual average curves one by one and then prints a report for each y value
                vcolumn = p_dfm.columns[i]
                print(vcolumn)
                params = model.guess(p_dfm[vcolumn], x=p_dfm['year'])
                result = model.fit(p_dfm[vcolumn], params, x=p_dfm['year'])
                result.plot_fit()

                # do linear regression here
                # Prepare data for Machine Learning. We need two columns only: features and label (p_dfm.columns[i])
                inputCols = ['year']
                vectorAssembler = VectorAssembler(inputCols=inputCols,
                                                  outputCol='features')
                vhouse_df = vectorAssembler.transform(df_10)
                vhouse_df = vhouse_df.select(
                    ['features', 'AVGFlatPricePerYear'])
                vhouse_df.show(20)
                if vcolumn == "AVGFlatPricePerYear":
                    plt.xlabel("Year", fontdict=v.font)
                    plt.ylabel("Flat house prices in millions/GBP",
                               fontdict=v.font)
                    plt.title(
                        f"""Flat price fluctuations in {regionname} for the past 10 years """,
                        fontdict=v.font)
                    plt.text(0.35,
                             0.45,
                             "Best-fit based on Non-Linear Lorentzian Model",
                             transform=plt.gca().transAxes,
                             color="grey",
                             fontsize=10)
                    print(result.fit_report())
                    plt.xlim(left=2009)
                    plt.xlim(right=2022)
                    plt.show()
                    plt.close()
                elif vcolumn == "AVGTerracedPricePerYear":
                    plt.xlabel("Year", fontdict=v.font)
                    plt.ylabel("Terraced house prices in millions/GBP",
                               fontdict=v.font)
                    plt.title(
                        f"""Terraced house price fluctuations in {regionname} for the past 10 years """,
                        fontdict=v.font)
                    plt.text(0.35,
                             0.45,
                             "Best-fit based on Non-Linear Lorentzian Model",
                             transform=plt.gca().transAxes,
                             color="grey",
                             fontsize=10)
                    print(result.fit_report())
                    plt.show()
                    plt.close()
                elif vcolumn == "AVGSemiDetachedPricePerYear":
                    plt.xlabel("Year", fontdict=v.font)
                    plt.ylabel("semi-detached house prices in millions/GBP",
                               fontdict=v.font)
                    plt.title(
                        f"""semi-detached house price fluctuations in {regionname} for the past 10 years """,
                        fontdict=v.font)
                    plt.text(0.35,
                             0.45,
                             "Best-fit based on Non-Linear Lorentzian Model",
                             transform=plt.gca().transAxes,
                             color="grey",
                             fontsize=10)
                    print(result.fit_report())
                    plt.show()
                    plt.close()
                elif vcolumn == "AVGDetachedPricePerYear":
                    plt.xlabel("Year", fontdict=v.font)
                    plt.ylabel("detached house prices in millions/GBP",
                               fontdict=v.font)
                    plt.title(
                        f"""detached house price fluctuations in {regionname} for the past 10 years """,
                        fontdict=v.font)
                    plt.text(0.35,
                             0.45,
                             "Best-fit based on Non-Linear Lorentzian Model",
                             transform=plt.gca().transAxes,
                             color="grey",
                             fontsize=10)
                    print(result.fit_report())
                    plt.show()
                    plt.close()

        p_df = df_10.select('AVGFlatPricePerYear', 'AVGTerracedPricePerYear',
                            'AVGSemiDetachedPricePerYear',
                            'AVGDetachedPricePerYear').toPandas().describe()
        print(p_df)
        #axs = scatter_matrix(p_df, figsize=(10, 10))
        # Describe returns a DF where count,mean, min, std,max... are values of the index
        y = p_df.loc[['min', 'mean', 'max']]
        #y = p_df.loc[['averageprice', 'flatprice']]
        ax = y.plot(linewidth=2, colormap='jet', marker='.', markersize=20)
        plt.grid(True)
        plt.xlabel("UK House Price Index, January 2020", fontdict=v.font)
        plt.ylabel("Property Prices in millions/GBP", fontdict=v.font)
        plt.title(
            f"""Property price fluctuations in {regionname} for the past 10 years """,
            fontdict=v.font)
        plt.legend(p_df.columns)
        plt.show()
        plt.close()
        lst = (spark.sql(
            "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') "
        )).collect()
        print("\nFinished at")
        uf.println(lst)
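
The loop above assembles a features column but stops at the "do linear regression here" comment without fitting anything. A minimal sketch of taking that next step with Spark ML, assuming vhouse_df as prepared above (features plus AVGFlatPricePerYear); the hyperparameters are placeholders:

from pyspark.ml.regression import LinearRegression

# hypothetical continuation of the "do linear regression here" comment
lr = LinearRegression(featuresCol='features', labelCol='AVGFlatPricePerYear',
                      maxIter=10, regParam=0.3)
lrModel = lr.fit(vhouse_df)
print("coefficients:", lrModel.coefficients, "intercept:", lrModel.intercept)
print("RMSE:", lrModel.summary.rootMeanSquaredError, "r2:", lrModel.summary.r2)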
def main():
    regionname = sys.argv[1]  ## parameter passed
    short = regionname.replace(" ", "").lower()
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    spark = s.setSparkConfBQ(spark)
    # Get data from BigQuery table
    start_date = "201001"
    end_date = "202001"
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nStarted at")
    uf.println(lst)
    # Model predictions
    read_df = s.loadTableFromBQ(spark, config['GCPVariables']['sourceDataset'],
                                config['GCPVariables']['sourceTable'])
    df_10 = read_df.filter(F.date_format('Date',"yyyyMM").cast("Integer").between(f'{start_date}', f'{end_date}') & (lower(col("regionname"))== f'{regionname}'.lower())). \
            select(F.date_format('Date',"yyyyMM").cast("Integer").alias("Date") \
                 , round(col("flatprice")).alias("flatprice") \
                 , round(col("terracedprice")).alias("terracedprice")
                 , round(col("semidetachedprice")).alias("semidetachedprice")
                 , round(col("detachedprice").alias("detachedprice")))
    print(df_10.toPandas().columns.tolist())
    p_dfm = df_10.toPandas()  # converting spark DF to Pandas DF

    # Non-Linear Least-Squares Minimization and Curve Fitting
    # Define model to be Lorentzian and deploy it
    model = LorentzianModel()
    n = len(p_dfm.columns)
    for i in range(n):
        if (p_dfm.columns[i] != 'Date'):  # yyyyMM is x axis in integer
            # it goes through the loop and plots individual average curves one by one and then prints a report for each y value
            vcolumn = p_dfm.columns[i]
            print(vcolumn)
            params = model.guess(p_dfm[vcolumn], x=p_dfm['Date'])
            result = model.fit(p_dfm[vcolumn], params, x=p_dfm['Date'])
            # plot the data points, initial fit and the best fit
            plt.plot(p_dfm['Date'], p_dfm[vcolumn], 'bo', label='data')
            plt.plot(p_dfm['Date'],
                     result.init_fit,
                     'k--',
                     label='initial fit')
            plt.plot(p_dfm['Date'], result.best_fit, 'r-', label='best fit')
            plt.legend(loc='upper left')
            plt.xlabel("Year/Month", fontdict=config['plot_fonts']['font'])
            plt.text(0.35,
                     0.55,
                     "Fit Based on Non-Linear Lorentzian Model",
                     transform=plt.gca().transAxes,
                     color="grey",
                     fontsize=9)
            if vcolumn == "flatprice": property = "Flat"
            if vcolumn == "terracedprice": property = "Terraced"
            if vcolumn == "semidetachedprice": property = "semi-detached"
            if vcolumn == "detachedprice": property = "detached"
            plt.ylabel(f"""{property} house prices in millions/GBP""",
                       fontdict=config['plot_fonts']['font'])
            plt.title(
                f"""Monthly {property} price fluctuations in {regionname}""",
                fontdict=config['plot_fonts']['font'])
            plt.xlim(200901, 202101)
            print(result.fit_report())
            plt.show()
            plt.close()
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nFinished at")
    uf.println(lst)
def main():
    appName = "DS"
    spark = s.spark_session(appName)
    sc = s.sparkcontext()

    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nStarted at")
    uf.println(lst)

    tmp_bucket = "tmp_storage_bucket/tmp"

    # Set the temporary storage location
    spark.conf.set("temporaryGcsBucket", v.tmp_bucket)
    spark.sparkContext.setLogLevel("ERROR")

    HadoopConf = sc._jsc.hadoopConfiguration()
    HadoopConf.set("fs.gs.impl",
                   "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    HadoopConf.set("fs.AbstractFileSystem.gs.impl",
                   "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

    # needed filters

    start_date = "2010-01-01"
    end_date = "2020-01-01"

    spark.conf.set("GcpJsonKeyFile", v.jsonKeyFile)
    spark.conf.set("BigQueryProjectId", v.projectId)
    spark.conf.set("BigQueryDatasetLocation", v.datasetLocation)
    spark.conf.set("google.cloud.auth.service.account.enable", "true")
    spark.conf.set("fs.gs.project.id", v.projectId)
    spark.conf.set("fs.gs.impl",
                   "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    spark.conf.set("fs.AbstractFileSystem.gs.impl",
                   "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
    spark.conf.set("temporaryGcsBucket", v.tmp_bucket)

    sqltext = ""
    from pyspark.sql.window import Window

    # read data from the Bigquery table in staging area
    print("\nreading data from " + v.projectId + ":" + v.inputTable)

    read_df = spark.read. \
                  format("bigquery"). \
                  option("credentialsFile",v.jsonKeyFile). \
                  option("project", v.projectId). \
                  option("parentProject", v.projectId). \
                  option("dataset", v.targetDataset). \
                  option("table", v.targetTable). \
                  option("temporaryGcsBucket", v.tmp_bucket). \
        load()
    regionname = "Kensington and Chelsea"
    summary_df = read_df.filter(
        (col("Year").between(f'{start_date}', f'{end_date}'))
        & (lower(col("regionname")) == f'{regionname}'.lower()))
    summary_df.printSchema()
    rows = summary_df.count()
    print("Total number of rows for Kensington and Chelsea is ", rows)
    wSpecY = Window().partitionBy(F.date_format('date', "yyyy"))
    df2 = summary_df. \
                    select( \
                          F.date_format(F.col("date"),'yyyy').alias('Year') \
                        , F.round(F.avg(F.col("averageprice")).over(wSpecY)).alias('AVGPricePerYear') \
                        , F.round(F.avg('flatprice').over(wSpecY)).alias('AVGFlatPricePerYear') \
                        , F.round(F.avg('TerracedPrice').over(wSpecY)).alias('AVGTerracedPricePerYear') \
                        , F.round(F.avg('SemiDetachedPrice').over(wSpecY)).alias('AVGSemiDetachedPricePerYear') \
                        , F.round(F.avg('DetachedPrice').over(wSpecY)).alias('AVGDetachedPricePerYear')). \
                    distinct().orderBy('Year', ascending=True)
    df2.show(10, False)
    # Save the result set to a BigQuery table. Table is created if it does not exist
    print(f"""\nsaving data to {v.DSDB}.yearlyhouseprices""")
    df2. \
        write. \
        format("bigquery"). \
        option("temporaryGcsBucket", v.tmp_bucket).\
        mode("overwrite"). \
        option("table", "DS.yearlyhouseprices"). \
        save()
    """
    summary_df. \
    write. \
    format("bigquery"). \
    mode("overwrite"). \
    option("table", v.fullyQualifiedoutputTableId). \
    option("temporaryGcsBucket", v.tmp_bucket). \
    save()
    """

    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nFinished at")
    uf.println(lst)
Example #16
def main():
        appName = "app1"
        spark = s.spark_session(appName)
        spark.sparkContext._conf.setAll(v.settings)
        sc = s.sparkcontext()
        print(sc.getConf().getAll())
        lst = (spark.sql(
            "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') "
        )).collect()
        print("\nStarted at")
        uf.println(lst)

        numRows = 10  ## do in increments of 50K rows, otherwise you blow up the driver memory!
        #
        ## Check if table exist otherwise create it

        rows = 0
        sqltext = ""
        if (spark.sql(f"""SHOW TABLES IN {v.DB} like '{v.tableName}'""").count(
        ) == 1):
            spark.sql(
                f"""ANALYZE TABLE {v.fullyQualifiedTableName} compute statistics"""
            )
            rows = spark.sql(
                f"""SELECT COUNT(1) FROM {v.fullyQualifiedTableName}"""
            ).collect()[0][0]
            print("number of rows is ", rows)
        else:
            print(
                f"\nTable {v.fullyQualifiedTableName} does not exist, creating table "
            )
            sqltext = f"""
    CREATE TABLE {v.DB}.{v.tableName}(
    ID INT
    , CLUSTERED INT
    , SCATTERED INT
    , RANDOMISED INT
    , RANDOM_STRING VARCHAR(50)
    , SMALL_VC VARCHAR(50)
    , PADDING  VARCHAR(4000)
    )
    STORED AS PARQUET
    """
            spark.sql(sqltext)

        start = 0
        if (rows == 0):
            start = 1
            maxID = 0
        else:
            maxID = spark.sql(
                f"SELECT MAX(id) FROM {v.fullyQualifiedTableName}").collect(
                )[0][0]
        start = maxID + 1
        end = start + numRows - 1
        print("starting at ID = ", start, ",ending on = ", end)
        Range = range(start, end + 1)
        ## This traverses the Range, incrementing "x" by one each time; that x value is used to generate random data through Python functions in a class

        rdd = sc.parallelize(Range). \
                 map(lambda x: (x, uf.clustered(x,numRows), \
                                   uf.scattered(x,numRows), \
                                   uf.randomised(x, numRows), \
                                   uf.randomString(50), \
                                   uf.padString(x," ",50), \
                                   uf.padSingleChar("x",4000)))
        df = rdd.toDF(). \
             withColumnRenamed("_1","ID"). \
             withColumnRenamed("_2", "CLUSTERED"). \
             withColumnRenamed("_3", "SCATTERED"). \
             withColumnRenamed("_4", "RANDOMISED"). \
             withColumnRenamed("_5", "RANDOM_STRING"). \
             withColumnRenamed("_6", "SMALL_VC"). \
             withColumnRenamed("_7", "PADDING")
        df.write.mode("overwrite").saveAsTable("pycharm.ABCD")
        df.printSchema()
        df.explain()
        df.createOrReplaceTempView("tmp")
        sqltext = f"""
    INSERT INTO TABLE {v.fullyQualifiedTableName}
    SELECT
            ID
          , CLUSTERED
          , SCATTERED
          , RANDOMISED
          , RANDOM_STRING
          , SMALL_VC
          , PADDING
    FROM tmp
    """
        spark.sql(sqltext)
        spark.sql(
            f"SELECT MIN(id) AS minID, MAX(id) AS maxID FROM {v.fullyQualifiedTableName}"
        ).show(n=20, truncate=False, vertical=False)
        ##sqlContext.sql("""SELECT * FROM pycharm.randomDataPy ORDER BY id""").show(n=20,truncate=False,vertical=False)
        lst = (spark.sql(
            "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') "
        )).collect()
        print("\nFinished at")
        uf.println(lst)

        spark.sql("show databases").show()
def main():
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    sc = s.sparkcontext()
    spark = s.setSparkConfBQ(spark)
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nStarted at")
    uf.println(lst)
    print(
        f"""Reading from parquet file {config['ParquetVariables']['sourceSmall']}"""
    )
    # read from the source file
    currentSnapshot = spark.read.load(
        config['ParquetVariables']['sourceSmall'])
    currentSnapshot.printSchema()
    #currentSnapshot.show()
    print(f"""\nRows in source file is""", currentSnapshot.count())
    print(currentSnapshot.rdd.getStorageLevel())
    currentSnapshot = currentSnapshot.repartition(5)
    print(currentSnapshot.rdd.getStorageLevel())
    # read from delta files
    deltaFile = "gs://etcbucket/randomdata/staging/randomdatapy_208150201_208150210"
    newAddedDeltaFiles = spark.read.load(deltaFile)
    # check missing records with source file
    # find out IDs that do not exist in source
    newAddedDeltaFiles.createOrReplaceTempView("newAddedDeltaFiles")
    currentSnapshot.createOrReplaceTempView("currentSnapshot")
    sqltext = """SELECT
                     newAddedDeltaFiles.ID
                   , newAddedDeltaFiles.CLUSTERED
                   , newAddedDeltaFiles.SCATTERED
                   , newAddedDeltaFiles.RANDOMISED
                   , newAddedDeltaFiles.RANDOM_STRING
                   , newAddedDeltaFiles.SMALL_VC
                   , newAddedDeltaFiles.PADDING 
                 FROM newAddedDeltaFiles 
                 LEFT OUTER JOIN currentSnapshot ON newAddedDeltaFiles.ID = currentSnapshot.ID
                 WHERE currentSnapshot.ID IS NULL ORDER BY newAddedDeltaFiles.ID"""
    print(f"""\nRows in deltafiles that do not exist in source file""",
          currentSnapshot.count())
    missingRows = spark.sql(sqltext)
    newSnapshot = currentSnapshot.union(missingRows)
    newSnapshot.orderBy(col("ID")).show(10000)
    sys.exit()
    #spark.sql(sqltext).write.mode(saveMode)
    print(
        f"""Writing to parquet file {config['ParquetVariables']['targetLocation']}"""
    )
    df2.write.mode(config['ParquetVariables']['overwrite']).parquet(
        config['ParquetVariables']['targetLocation'])
    df3 = spark.read.load(config['ParquetVariables']['targetLocation'])
    print(
        f"""Reading from parquet file {config['ParquetVariables']['targetLocation']}"""
    )
    print(f"""\nRows in target table is""", df3.count())
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nFinished at")
    uf.println(lst)
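
The LEFT OUTER JOIN ... WHERE currentSnapshot.ID IS NULL query above finds delta rows missing from the snapshot; the DataFrame API expresses the same thing directly with a left_anti join. A minimal sketch, assuming the two DataFrames as loaded above:

# hypothetical DataFrame-API equivalent of the LEFT OUTER JOIN / IS NULL pattern
missingRows = newAddedDeltaFiles.join(currentSnapshot, on="ID", how="left_anti")
newSnapshot = currentSnapshot.unionByName(missingRows)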
Example #18
class Sales:

    appName = "sales"
    spark = s.spark_session(appName)

    settings = [("hive.exec.dynamic.partition", "true"),
                ("hive.exec.dynamic.partition.mode", "nonstrict"),
                ("spark.sql.orc.filterPushdown", "true"),
                ("hive.msck.path.validation", "ignore"),
                ("spark.sql.caseSensitive", "true"),
                ("spark.speculation", "false"),
                ("hive.metastore.authorization.storage.checks", "false"),
                ("hive.metastore.client.connect.retry.delay", "5s"),
                ("hive.metastore.client.socket.timeout", "1800s"),
                ("hive.metastore.connect.retries", "12"),
                ("hive.metastore.execute.setugi", "false"),
                ("hive.metastore.failure.retries", "12"),
                ("hive.metastore.schema.verification", "false"),
                ("hive.metastore.schema.verification.record.version", "false"),
                ("hive.metastore.server.max.threads", "100000"),
                ("hive.metastore.authorization.storage.checks",
                 "/apps/hive/warehouse"), ("hive.stats.autogather", "true")]
    spark.sparkContext._conf.setAll(settings)
    sc = s.sparkcontext()
    #print(sc.getConf().getAll())
    hivecontext = s.hivecontext()
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nStarted at")
    uf.println(lst)

    rows = spark.sql(
        f"""SELECT COUNT(1) FROM {v.DB2}.{v.table2}""").collect()[0][0]
    sqltext = f"""
  SELECT
          rs.Customer_ID
        , rs.Number_of_orders
        , rs.Total_customer_amount
        , rs.Average_order
        , rs.Standard_deviation
        , rs.mystddev
  FROM
  (
           SELECT cust_id AS Customer_ID
        ,  COUNT(amount_sold) AS Number_of_orders
        ,  SUM(amount_sold) AS Total_customer_amount
        ,  AVG(amount_sold) AS Average_order
        ,  STDDEV(amount_sold) AS Standard_deviation
        ,  SQRT((SUM(POWER(AMOUNT_SOLD,2))-(COUNT(1)*POWER(AVG(AMOUNT_SOLD),2)))/(COUNT(1)-1)) AS mystddev
           FROM {v.DB2}.{v.table2}
           GROUP BY cust_id
           HAVING SUM(amount_sold) > 94000
           AND AVG(amount_sold) < STDDEV(amount_sold)
  ) rs
  ORDER BY
          3 DESC
  """
    spark.sql(sqltext).show(1000, False)
    df = spark.sql(sqltext)
    df.printSchema()
    lst = (spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') ")
           ).collect()
    print("\nFinished at")
    uf.println(lst)
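
The mystddev expression in the query is the textbook sample standard deviation, sqrt((sum(x^2) - n * avg(x)^2) / (n - 1)), computed by hand alongside STDDEV so the two can be compared. The same comparison in the DataFrame API might look like the sketch below, assuming the same spark session and the v settings module:

from pyspark.sql import functions as F

# hypothetical DataFrame-API version of the manual vs built-in stddev comparison above
sales = spark.table(f"{v.DB2}.{v.table2}")
per_cust = sales.groupBy("cust_id").agg(
    F.count("amount_sold").alias("Number_of_orders"),
    F.sum("amount_sold").alias("Total_customer_amount"),
    F.avg("amount_sold").alias("Average_order"),
    F.stddev("amount_sold").alias("Standard_deviation"),
    F.sqrt((F.sum(F.pow("amount_sold", 2)) - F.count(F.lit(1)) * F.pow(F.avg("amount_sold"), 2))
           / (F.count(F.lit(1)) - 1)).alias("mystddev"))
per_cust.filter((F.col("Total_customer_amount") > 94000) &
                (F.col("Average_order") < F.col("Standard_deviation"))) \
        .orderBy(F.col("Total_customer_amount").desc()).show(1000, False)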