Example #1
 def hasTable(self, sqlc):
     # Check if the table exists
     # There must be a better way than this
     pg = postgres.PostgresConnector()
     qr = "(SELECT 1 FROM pg_tables WHERE tablename='" + self.tableName + "') AS wtf"
     df = pg.read(sqlc, table=qr)
     return df.count() == 1
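The comment above asks for a better way; one small improvement, keeping the same pg.read(sqlc, table=...) pattern used in these examples, is an EXISTS query so exactly one boolean row comes back instead of counting rows client-side. A minimal sketch, assuming the same connector interface (the table name is still interpolated directly, as in the original):

 def hasTable(self, sqlc):
     # Sketch: fetch a single boolean instead of counting rows client-side
     pg = postgres.PostgresConnector()
     qr = ("(SELECT EXISTS (SELECT 1 FROM pg_tables "
           "WHERE tablename = '{}') AS has_table) AS t".format(self.tableName))
     df = pg.read(sqlc, table=qr)
     return df.first().has_table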
Example #2
 def writeResults(self, prefix):
     pgTableName = prefix + self.year + '_' + self.month + '_' + 'st' + str(
         self.station)
     conn = postgres.PostgresConnector()
     conn.write(self.hr,
                pgTableName,
                glb('pgWriteMode'),
                db='taxi_aggregates')
Example #3
 def readTable(self, sqlc):
     # To speed things up, define partition scheme here:
     mStart = self.year + '-' + self.month + '-01 00:00:00'
     # first day of the next month; integer division keeps the year rollover correct
     mEnd = str(int(self.year) + int(self.month) // 12) + '-' + \
            str((int(self.month) + 1) % 12).zfill(2).replace('00', '12') + '-01 00:00:00'
     lb = dtt.strToTimeStamp(mStart) - 3600 * 24  # from 1 day prior
     ub = dtt.strToTimeStamp(mEnd) + 3600 * 24  # upto 1 day after
     pg = postgres.PostgresConnector()
     qr = "(SELECT * FROM " + self.tableName + ") AS wtf"
     self.df = pg.read(sqlc, table=qr, numPartitions=32, column="pUTimeStamp", \
                       lowerBound=lb, upperBound=ub)
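The bound arithmetic above is compact but easy to break across the December rollover. The same month boundaries can be computed with the standard library; this sketch reuses the example's self.year and self.month fields and is independent of the connector:

from datetime import datetime

y, m = int(self.year), int(self.month)
# first day of this month and of the next month (December rolls over to January)
mStart = datetime(y, m, 1).strftime('%Y-%m-%d %H:%M:%S')
mEnd = datetime(y + m // 12, m % 12 + 1, 1).strftime('%Y-%m-%d %H:%M:%S')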
Example #4
 def writeToPostgres(self, prefix):
     # There are two different kind of schemas:
     # Prior to 2017, use coordinates; after, location ID
     if 'pULocId' in self.ylwTaxi.columns:
         keepCols = glb('pgKeepCols1')
     else:
         keepCols = glb('pgKeepCols2')
     dropCols = [clm for clm in self.ylwTaxi.columns if clm not in keepCols]
     for clm in dropCols:
         self.ylwTaxi = self.ylwTaxi.drop(clm)
     self.ylwTaxi = self.ylwTaxi.select(keepCols)
     self.pgTableName = prefix + '_' + self.year + '_' + self.month
     connector = postgres.PostgresConnector()
     connector.write(self.ylwTaxi, self.pgTableName, glb('pgWriteMode'))
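A note on the column pruning above: in PySpark, select(keepCols) already restricts the DataFrame to exactly those columns, so the preceding drop loop is redundant. The first half of the method reduces to this sketch:

keepCols = glb('pgKeepCols1') if 'pULocId' in self.ylwTaxi.columns else glb('pgKeepCols2')
self.ylwTaxi = self.ylwTaxi.select(keepCols)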
Example #5
    def WriteTables(self, processed_df):

        # Get list of libraries from S3 for which you want activity trends
        libinfo_df = self.spark.read.csv(
            "s3a://gauravdatabeamdata/LibraryInfo.csv",
            header=True,
            multiLine=True)
        libraries_list = libinfo_df.select(libinfo_df.Libraries).collect()

        liblist = []
        for row in libraries_list:
            liblist.append(str(row.Libraries))
        collocatedlibs = GetCollocatedLibraries()
        libs_indandcoll = collocatedlibs.GetLibraryPairs(liblist)

        print("Getting postgre connector..............................")
        connector = postgres.PostgresConnector()

        for item in libs_indandcoll:
            print(item +
                  '....................................................')

        for lib_ind_pair in libs_indandcoll:

            # pick out libraries which exist in processed dataframe
            lib_df = processed_df.where(
                processed_df.library == lib_ind_pair).select(
                    "datetime", "lib_counts")

            # save datetime(year-month), lib_counts(users) in a table for each library
            if len(lib_df.head(1)) > 0:
                print("Saving table %s into Postgres........................" %
                      lib_ind_pair)
                self.write_to_postgres(lib_df, lib_ind_pair, connector)
            else:
                continue
Example #6
    def process_stream(self, rdd):

        if rdd.isEmpty():
            print("no incoming data")

        else:
            """convert the data from rdd to dataframes"""
            data_frame = rdd.toDF().cache()

            """calculating the mean of an array"""
            array_mean = udf(lambda x: float(np.mean(x)), FloatType())

            """calculating the square of each element in an array"""
            def square_list(array_list):
                return [float(val) ** 2 for val in array_list]
            square_list_udf = udf(lambda y: square_list(y), ArrayType(FloatType()))

            """adding new columns to the dataframe"""
            df_square = data_frame.select('*', square_list_udf('x').alias("sq_x"),
                                          square_list_udf('y').alias("sq_y"),
                                          square_list_udf('z').alias("sq_z"))

            df_average = df_square.select("*", array_mean("sq_x").alias("avg_x"),
                                          array_mean("sq_y").alias("avg_y"),
                                          array_mean("sq_z").alias("avg_z"))

            """calculating the gal value for predicting earthquakes and writing it to the data frame"""
            final_df = df_average.select("*", pow(col("avg_x") + col("avg_y") +
                                                  col("avg_z"), 0.5).alias("gal"))

            """writing the data to postgres"""
            try:
                connector = postgres.PostgresConnector(
                    "ec2-18-232-24-132.compute-1.amazonaws.com", "earthquake", "postgres", "nidheesh")
                connector.write(final_df, "Ereadings", "append")

            except Exception as error:
                print(error)
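The three square-then-average passes above can also be folded into a single UDF that takes the x, y and z arrays and returns the gal value directly. A minimal sketch, assuming the same column names and imports as the example:

# one UDF computes sqrt(mean(x^2) + mean(y^2) + mean(z^2)) per row
gal_udf = udf(lambda x, y, z: float(np.sqrt(np.mean(np.square(x)) +
                                            np.mean(np.square(y)) +
                                            np.mean(np.square(z)))), FloatType())
final_df = data_frame.select("*", gal_udf("x", "y", "z").alias("gal"))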
Example #7
 def write_events_to_db(self, df):
     table = 'airflow_events'
     mode = 'append'
     
     connector = postgres.PostgresConnector()
     connector.write_to_db(df, table, mode)
Example #8
    def write_events_to_db(self, df):
        table = 'safety_score'
        mode = 'append'

        connector = postgres.PostgresConnector()
        connector.write_to_db(df, table, mode)
Example #9
def write_to_postgres(out_df, table=table_name):
    """ function to write output to postgres"""
    mode = "append"
    connector = postgres.PostgresConnector()
    connector.write(out_df, table, mode)
Example #10
 def __init__(self):
     self.pgres_connector = postgres.PostgresConnector()
     self.spark = SparkSession \
         .builder \
         .appName("plops_streaming") \
         .getOrCreate()
Example #11
    def write_to_postgres(self, out_df):
        table = "spark_out_hist_occupancy"
        mode = "append"

        connector = postgres.PostgresConnector()
        connector.write(out_df, table, mode)
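All of the snippets above depend on a postgres.PostgresConnector class defined elsewhere in each project. For orientation, here is a minimal sketch of what such a connector could look like on top of Spark's JDBC data source; the constructor arguments mirror Example #6 and the method names mirror the calls above, but the defaults and details are illustrative assumptions, not any project's actual code:

class PostgresConnector(object):
    """Illustrative sketch of a Spark-to-Postgres helper (not the projects' real code)."""

    def __init__(self, host="localhost", db="postgres", user="postgres", password=""):
        # illustrative defaults; the real projects pass their own values or read a config
        self.url = "jdbc:postgresql://{}:5432/{}".format(host, db)
        self.properties = {"user": user,
                           "password": password,
                           "driver": "org.postgresql.Driver"}

    def write(self, df, table, mode, db=None):
        # mode is one of Spark's save modes: "append", "overwrite", "ignore", "error"
        url = self.url if db is None else self.url.rsplit("/", 1)[0] + "/" + db
        df.write.jdbc(url=url, table=table, mode=mode, properties=self.properties)

    # Examples #7 and #8 call the method write_to_db instead of write
    write_to_db = write

    def read(self, sqlc, table, **partition_opts):
        # partition_opts may carry numPartitions/column/lowerBound/upperBound
        # (as in Example #3) to parallelize the JDBC read
        return sqlc.read.jdbc(url=self.url, table=table,
                              properties=self.properties, **partition_opts)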