def get_dataframe_loadStrategy(df_dict):
    dataframe_loadStrategy = df_dict['targetDataframeDetails'][
        'DFLoadStrategy'].upper()
    miscProcess.log_info(
        SCRIPT_NAME,
        "Dataframe loadStrategy: {}".format(dataframe_loadStrategy))
    return dataframe_loadStrategy
def sourceOccupancyReadParquet(occupancyFilePath, custom_schema,
                               partition_value):

    miscProcess.log_info(SCRIPT_NAME, "Reading Occupancy CSV file...")
    print("Reading Occupancy CSV file")

    source_data_info = {}
    source_data_info["type"] = "CSV"

    #filepath = source_config['sources']['driverSource']["filePath"]
    print("Occupancy file path : {}".format(occupancyFilePath))

    try:
        occupancy = spark.read.format("csv") \
                    .option("header", True) \
                    .schema(custom_schema) \
                    .load(occupancyFilePath)

    except Exception as e:
        miscProcess.log_info(SCRIPT_NAME, "error in reading csv: {}".format(e))
        # re-raise so the caller does not continue with an undefined dataframe
        raise

    source_data_info["occupancyFilePath"] = occupancyFilePath
    source_data_info["partition"] = str(partition_value)

    occupancy.show(3)

    return (occupancy, source_data_info)
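# Sketch of the kind of `custom_schema` this reader expects. The real
# StructType is built elsewhere in the job, so the column names and types
# below are assumptions inferred from how the occupancy columns are used later.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

occupancy_schema_example = StructType([
    StructField("occupancydatetime", StringType(), True),  # parsed later with "MM/dd/yyyy hh:mm:ss a"
    StructField("station_id", StringType(), True),         # cleaned and cast to IntegerType downstream
    StructField("occupied_spots", IntegerType(), True),
    StructField("available_spots", IntegerType(), True),
    StructField("location", StringType(), True),           # "POINT (lon lat)" string, split downstream
])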
def partition_column(df_dict):

    part_col_lcase = df_dict['targetDataframeDetails']['dataframePartition']

    miscProcess.log_info(SCRIPT_NAME,
                         "Partition Column : {}".format(part_col_lcase))

    return part_col_lcase
def executeHistoricOccupancyOperations(src_df, output, cols_list, partn_col,
                                       max_retry_count, retry_delay,
                                       custom_schema):

    PartitionColumn = partn_col
    station_id_lookup = createStationIDDF(custom_schema)

    occ_df = src_df\
                .join(station_id_lookup, ['station_id'], how='left_outer')\
                .select(src_df.OccupancyDateTime,src_df.Station_Id,\
                        src_df.Occupied_Spots,src_df.Available_Spots,\
                        station_id_lookup.Longitude,station_id_lookup.Latitude)

    spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

    occ_df = occ_df.withColumn(
        'occupancydatetime',
        timestamp_format(F.col('occupancydatetime'), "MM/dd/yyyy hh:mm:ss a"))

    occ_df = occ_df.withColumn(PartitionColumn,
                               date_format(F.col('occupancydatetime'), "MMMM"))

    ReturnCode = 0
    rec_cnt = 0
    RetryCt = 0
    Success = False

    while (RetryCt < max_retry_count) and not Success:
        try:
            Success = True
            occ_df.write.mode("append").partitionBy(PartitionColumn).parquet(
                output)
        except Exception:
            Success = False
            RetryCt += 1
            if RetryCt == max_retry_count:
                miscProcess.log_info(
                    SCRIPT_NAME,
                    "Failed on writing to Output after {} tries: {} ".format(
                        max_retry_count, output))
                ReturnCode = 2
                return ReturnCode, rec_cnt
            else:
                miscProcess.log_info(
                    SCRIPT_NAME,
                    "Failed on writing to Output, re-try in {} seconds ".
                    format(retry_delay))
                time.sleep(retry_delay)

    miscProcess.log_print("Number of Records Processed: {}".format(rec_cnt))
    return ReturnCode, rec_cnt
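# `createStationIDDF` and `timestamp_format` are helpers defined elsewhere in
# this project. The sketches below are assumptions about their behaviour,
# inferred only from the call sites above (a station_id -> longitude/latitude
# lookup, and a string-to-timestamp conversion); they are not the real code.
def createStationIDDF_sketch(custom_schema, blockface_path):
    # Hypothetical: read the station reference CSV and keep one
    # longitude/latitude pair per station_id for the left join above.
    lookup = spark.read.format("csv") \
        .option("header", True) \
        .schema(custom_schema) \
        .load(blockface_path)
    return lookup.select("station_id", "Longitude", "Latitude") \
                 .dropDuplicates(["station_id"])

def timestamp_format_sketch(col_obj, fmt):
    # Hypothetical: wrap to_timestamp so callers pass a Java date pattern.
    return F.to_timestamp(col_obj, fmt)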
def build_dataframe_column_list(df_dict):

    column_list = []

    for column_info in df_dict['targetDataframeDetails']['dataframeColumnInfo']:
        column_list.append(column_info['columnName'].lower())

    miscProcess.log_info(SCRIPT_NAME,
                         "Dataframe Column List: {}".format(column_list))
    return column_list
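# Illustrative shape of the `df_dict` these helpers read from. The real values
# come from the job's JSON config (e.g. the occupancy dataframe definition),
# so everything shown here is a placeholder, not actual configuration.
example_df_dict = {
    "targetDataframeDetails": {
        "DFLoadStrategy": "append",        # read by get_dataframe_loadStrategy
        "dataframePartition": "month",     # read by partition_column
        "dataframeColumnInfo": [           # read by build_dataframe_column_list
            {"columnName": "station_id"},
            {"columnName": "occupancydatetime"},
            {"columnName": "location"},
        ],
    }
}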
def parse_config(caller_function, filename, option_char='='):
    ReturnCode = 0
    OPTION_CHAR = option_char
    options = {}
    param_list = caller_function + "\n"

    with open(filename) as f:
        for line in f:
            # Ignore empty lines
            if not line.strip():
                continue
            # First, remove comments:
            if COMMENT_CHAR in line:
                # if the first char on the line is '#', skip the whole line
                strip_line = line.strip()
                if strip_line[0] == '#':
                    continue
                # split on the comment char, keep only the part before it
                line, comment = line.split(COMMENT_CHAR, 1)
                line += '\n'

            # Second, find lines with an option = value
            if OPTION_CHAR in line:
                param_list += '{}'.format(line)
                # split on the option char
                option, value = line.split(OPTION_CHAR, 1)
                # strip spaces:
                option = option.strip()
                value = value.strip()

                value = remove_whitespace(value)
                options[option] = value

            else:
                miscProcess.log_error(
                    SCRIPT_NAME,
                    "ERROR: WRONG PARAMETER ASSIGNMENT ON LINE: {}".format(
                        line.strip()), 1)
                ReturnCode = 1
                break
    miscProcess.log_info(SCRIPT_NAME, param_list)
    return options, ReturnCode
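# Small usage sketch for parse_config. The file name and parameter names below
# are hypothetical, chosen only to illustrate the expected "option = value"
# layout and the returned dict; they are not part of the real job control file.
def _parse_config_demo(cfg_path="example_job.cfg"):
    with open(cfg_path, "w") as cfg:
        cfg.write("# job control parameters (hypothetical)\n")
        cfg.write("MaxRetryCount = 3   # trailing comments are stripped\n")
        cfg.write("RetryDelay = 30\n")
    options, rc = parse_config("demo_caller", cfg_path)
    # rc == 0 on success; options == {'MaxRetryCount': '3', 'RetryDelay': '30'}
    return options, rc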
def sourceBlockfaceReadParquet(blockfacefilePath, cust_schema):

    miscProcess.log_info(SCRIPT_NAME, "Reading CSV file...")
    print("Reading CSV file")

    source_data_info = {}
    source_data_info["type"] = "CSV"

    try:
        blockface = spark.read.format("csv") \
                    .option("header", True) \
                    .schema(cust_schema) \
                    .load(blockfacefilePath)

    except Exception as e:
        miscProcess.log_info(SCRIPT_NAME, "error in reading csv: {}".format(e))
        # re-raise so the caller does not continue with an undefined dataframe
        raise

    source_data_info["blockfacefilePath"] = blockfacefilePath

    return (blockface, source_data_info)
def executeOccupancyOperations(src_df, output, datedimoutputpath, cols_list,
                               partn_col, max_retry_count, retry_delay):

    PartitionColumn = partn_col

    ReturnCode = 0
    rec_cnt = 0
    RetryCt = 0
    Success = False

    while (RetryCt < max_retry_count) and not Success:

        try:
            Success = True
            # reading from DBFS
            input_df = src_df

        except Exception:
            Success = False
            RetryCt += 1
            if RetryCt == max_retry_count:
                miscProcess.log_info(
                    SCRIPT_NAME,
                    "Failed on reading input file after {} tries".format(
                        max_retry_count))
                ReturnCode = 1
                return ReturnCode, rec_cnt

            else:
                miscProcess.log_info(
                    SCRIPT_NAME,
                    "Failed on reading input file, re-try in {} seconds ".
                    format(retry_delay))
                time.sleep(retry_delay)

    select_df = input_df.select(
        [colname for colname in input_df.columns if colname in (cols_list)])

    print("Reading inside transformation function")
    select_df.show(5)

    for column in cols_list:
        if column == 'station_id':
            print("Reading inside column transformations of {}".format(column))
            select_df = select_df.withColumn(
                column, remove_non_word_characters(F.col("station_id")))
            select_df = select_df.withColumn(
                column, select_df[column].cast(IntegerType()))

        elif column == 'occupancydatetime':
            spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
            select_df = select_df.withColumn(
                column, timestamp_format(F.col(column),
                                         "MM/dd/yyyy hh:mm:ss a"))

            select_df = select_df.withColumn(
                PartitionColumn, date_format(F.col(column), "MMMM"))

            date_dim = select_df.withColumn('day_of_week',date_format(F.col(column), "EEEE")) \
                                .withColumn('month',date_format(F.col(column), "MMMM"))

            date_dim = date_dim.select('occupancydatetime', 'day_of_week',
                                       'month')

        elif column == 'location':
            split_col = ['longitude', 'latitude']

            select_df = select_df.withColumn(split_col[0], F.split(column, ' ').getItem(1)) \
                                 .withColumn(split_col[1], F.split(column, ' ').getItem(2))

            select_df = select_df.withColumn(split_col[0], remove__parenthesis(F.col(split_col[0]))) \
                                 .withColumn(split_col[1], remove__parenthesis(F.col(split_col[1])))

            select_df = select_df.withColumn(split_col[0], select_df[split_col[0]].cast(DoubleType())) \
                                 .withColumn(split_col[1], select_df[split_col[1]].cast(DoubleType()))

            select_df = select_df.drop(column)

        #   select_df = select_df.select(cols_list)
        #select_df = select_df.select([colname for colname in input_df.columns if colname in (cols_list)])

    # write once, after all column transformations, outside the per-column loop
    RetryCt = 0
    Success = False

    while (RetryCt < max_retry_count) and not Success:
        try:
            Success = True
            select_df.show(3)
            miscProcess.log_print(
                "Writing occupancy dataframe to output file: {}".format(
                    output))

            select_df.write.mode("append").partitionBy(
                PartitionColumn).parquet(output)

            miscProcess.log_print(
                "Writing date dimension to output file: {}".format(
                    datedimoutputpath))
            date_dim.show(3)
            date_dim.write.mode("append").partitionBy(
                PartitionColumn).parquet(datedimoutputpath)
        except Exception:
            Success = False
            RetryCt += 1
            if RetryCt == max_retry_count:
                miscProcess.log_info(
                    SCRIPT_NAME,
                    "Failed on writing to Output after {} tries: {} ".
                    format(max_retry_count, output))
                ReturnCode = 2
                return ReturnCode, rec_cnt
            else:
                miscProcess.log_info(
                    SCRIPT_NAME,
                    "Failed on writing to Output, re-try in {} seconds ".
                    format(retry_delay))
                time.sleep(retry_delay)

    miscProcess.log_print(
        "Number of Records Processed: {}".format(rec_cnt))
    return ReturnCode, rec_cnt
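# The column-cleaning helpers used above (remove_non_word_characters and
# remove__parenthesis) are defined elsewhere in this project. These sketches
# are assumptions about what they do, inferred from the call sites, not the
# project's actual implementations.
def remove_non_word_characters_sketch(col_obj):
    # strip anything that is not a word character from a string column
    return F.regexp_replace(col_obj, r"[^\w]", "")

def remove__parenthesis_sketch(col_obj):
    # drop the '(' and ')' left over from splitting the "POINT (lon lat)" string
    return F.regexp_replace(col_obj, r"[()]", "")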
def executeBlockfaceOperations(src_df, output, cols_list, max_retry_count,
                               retry_delay):

    miscProcess.log_print("Starting the Blockface Execute Operations")

    src_df.printSchema()

    ReturnCode = 0
    rec_cnt = 0
    RetryCt = 0
    Success = False

    while (RetryCt < max_retry_count) and not Success:

        try:
            Success = True
            input_df = src_df

        except Exception:
            Success = False
            RetryCt += 1
            if RetryCt == max_retry_count:
                miscProcess.log_info(
                    SCRIPT_NAME,
                    "Failed on reading input file after {} tries".format(
                        max_retry_count))
                ReturnCode = 1
                return ReturnCode, rec_cnt

            else:
                miscProcess.log_info(
                    SCRIPT_NAME,
                    "Failed on reading input file, re-try in {} seconds ".
                    format(retry_delay))
                time.sleep(retry_delay)

    select_df = input_df.select(
        [colname for colname in input_df.columns if colname in (cols_list)])



    select_df=select_df.withColumn('wkd_start1',format_minstoHHMMSS('wkd_start1')) \
                    .withColumn('wkd_end1',format_minstoHHMMSS('wkd_end1')) \
                    .withColumn('wkd_start2',format_minstoHHMMSS('wkd_start2')) \
                    .withColumn('wkd_end2',format_minstoHHMMSS('wkd_end2')) \
                    .withColumn('wkd_end3',format_minstoHHMMSS('wkd_end3')) \
                    .withColumn('sat_start1',format_minstoHHMMSS('sat_start1')) \
                    .withColumn('sat_end1',format_minstoHHMMSS('sat_end1')) \
                    .withColumn('sat_start2',format_minstoHHMMSS('sat_start2')) \
                    .withColumn('sat_end2',format_minstoHHMMSS('sat_end2')) \
                    .withColumn('sat_start3',format_minstoHHMMSS('sat_start3')) \
                    .withColumn('sat_end3',format_minstoHHMMSS('sat_end3'))

    #miscProcess.log_print("Writing to output file: {}".format(output))


    RetryCt = 0
    Success = False

    while (RetryCt < max_retry_count) and not Success:
        try:
            Success = True
            miscProcess.log_info(SCRIPT_NAME, "Writing to Parquet file")
            select_df.show(3)
            print("Output file {}".format(output))
            select_df.coalesce(1).write.mode("overwrite").parquet(
                output + "/Blockface.parquet")
        except Exception:
            Success = False
            RetryCt += 1
            if RetryCt == max_retry_count:
                miscProcess.log_info(
                    SCRIPT_NAME,
                    "Failed on writing File after {} tries: {} ".format(
                        max_retry_count, output))
                ReturnCode = 2
                return ReturnCode, rec_cnt
            else:
                miscProcess.log_info(
                    SCRIPT_NAME,
                    "Failed on writing File, re-try in {} seconds ".format(
                        retry_delay))
                time.sleep(retry_delay)

    miscProcess.log_print("Number of Records Processed: {}".format(rec_cnt))

    return ReturnCode, rec_cnt
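# format_minstoHHMMSS is a project helper used above to turn the blockface
# minutes-after-midnight columns into "HH:MM:SS" strings. This sketch is an
# assumption based on the helper's name and call sites only.
def format_minstoHHMMSS_sketch(col_name):
    mins = F.col(col_name).cast(IntegerType())
    return F.format_string("%02d:%02d:00",
                           (mins / 60).cast(IntegerType()),
                           mins % 60)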
miscProcess.log_step(SCRIPT_NAME, "PERFORMING STEP {}:{} ".format(STEP, STEP_DESC))

if SparkSubmitClientMode == 'Y':
    # Spark Submitted in Client Mode
    job_control_file = ControlPath + JOBNAME + ".cfg"
    blockface_config_filename = ConfigPath + BlockfaceDataframeName.lower() + '.json'
    occupancy_config_filename = ConfigPath + OccupancyDataframeName.lower() + '.json'
else:
    # Spark Submitted in Cluster Mode
    job_control_file = './' + JOBNAME + ".cfg"
    blockface_config_filename = './common/' + BlockfaceDataframeName.lower() + '.json'
    occupancy_config_filename = './common/' + OccupancyDataframeName.lower() + '.json'

if os.path.isfile(job_control_file):
    miscProcess.log_info(SCRIPT_NAME, "Job control filename: {} exists".format(job_control_file))
    paramFile, ReturnCode = readEnvironmentParameters.read_job_control(job_control_file)

    if ReturnCode != 0:
        miscProcess.log_error(SCRIPT_NAME, "Error: Reading Job Control file {}".format(job_control_file), ReturnCode)
        exit(STEP)
    globals().update(paramFile)
else:
    miscProcess.log_error(SCRIPT_NAME, "Job control filename: {} doesn't exist".format(job_control_file), STEP)
    exit(STEP)


#==============================================================================================================#
(STEP, STEP_DESC)=(20, "Validate All Needed Parameters defined from the control files")
#===============================================================================================================#
# ALWAYS PERFORM THIS STEP
def get_source_dateDimOutputPath(df_dict):
    datedimOutputPath = df_dict['sources']['driverSource']["DimOutputPath"]
    miscProcess.log_info(
        SCRIPT_NAME,
        "Date Dim OutputPathFilePath: {}".format(datedimOutputPath))
    return datedimOutputPath
def get_source_OutputPath(df_dict):
    outputFilePath = df_dict['sources']['driverSource']["OutputPath"]
    miscProcess.log_info(SCRIPT_NAME,
                         "OutputPathFilePath: {}".format(outputFilePath))
    return outputFilePath
def get_source_driverFilerPath(df_dict):
    driverFilePath = df_dict['sources']['driverSource']["filePath"]
    miscProcess.log_info(SCRIPT_NAME,
                         "driverFilePath: {}".format(driverFilePath))
    return driverFilePath
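# Illustrative shape of the `sources` section these three getters read. The
# actual paths come from the job's JSON config, so every value below is a
# placeholder rather than a real location.
example_source_dict = {
    "sources": {
        "driverSource": {
            "filePath": "<input path>",                  # get_source_driverFilerPath
            "OutputPath": "<output path>",               # get_source_OutputPath
            "DimOutputPath": "<date dim output path>",   # get_source_dateDimOutputPath
        }
    }
}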