Example no. 1
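            # (snippet begins mid-method: the dimension frames built earlier are
            #  gathered into the star-schema dict and returned)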
            "dim_date": dim_date,
            "dim_weather": dim_weather,
            "dim_location": dim_location,
            "dim_crime": dim_crime,
        }
        return star_tables


if __name__ == "__main__":

    folder_path = "C:\\Users\\SSrih\\OneDrive\\UChicago\\DEP\\Project\\data" \
                  "\\Crime and Weather\\"
    # data_file_name = "CrimeWeather2010.csv"
    # data_file_name = "Crime2010Raw.csv"
    data_file_name = "Crime20161718.csv"

    data_file_path = os.path.join(folder_path, data_file_name)
    data_extractor = DataExtractor()
    data_frame = data_extractor.read_csv(fpath=data_file_path,
                                         nrows_to_read=5000)

    # print(data_frame.head())

    data_worker = DataWorker()

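    # Total number of missing (NaN) values before cleaning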
    print(data_frame.isnull().sum().sum())

    data_worker.process_pipeline(data_frame)

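    # Total missing values after process_pipeline has cleaned the frame (apparently in place)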
    print(data_frame.isnull().sum().sum())
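
For reference, a minimal sketch of the interface Example no. 1 assumes for DataExtractor and DataWorker. The class and method names come from the calls above; the pandas-based bodies are illustrative guesses, not the project's actual implementation.

import pandas as pd


class DataExtractor:
    """Sketch: thin wrapper around pandas CSV reading."""

    def read_csv(self, fpath, nrows_to_read=-1):
        # A negative or missing row count is taken to mean "read the whole file".
        nrows = None if nrows_to_read is None or nrows_to_read < 0 else nrows_to_read
        return pd.read_csv(fpath, nrows=nrows)


class DataWorker:
    """Sketch: cleans the crime/weather frame in place."""

    def process_pipeline(self, data_frame):
        # Placeholder cleaning steps, chosen so that duplicates and missing
        # values are gone before the second null-count print.
        data_frame.drop_duplicates(inplace=True)
        data_frame.fillna(0, inplace=True)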
Example no. 2
def main():

    total_start_time = time.time()

    # ------------------------------------------------------------------------ #
    # 0. INPUT ARGUMENTS (currently hardcoded rather than parsed)
    # ------------------------------------------------------------------------ #

    data_file_name = "Crime_Weather_Cleaned_2017.csv"
    # data_file_name = "Crime20161718.csv"

    data_file_path = os.path.join(FOLDER_PATH, data_file_name)

    # ------------------------------------------------------------------------ #
    # 1. ESTABLISH DATABASE CONNECTION
    # ------------------------------------------------------------------------ #

    print("\n\n\t\t **** 1. DATABASE CONNECTION **** ")

    # host = 'localhost'
    # database = 'crime_star'
    # user = '******'
    # password = '******'
    # port = '3306'

    port = '3306'

    data_loader = DataLoader()

    ret = data_loader.connect(host=DB_IP,
                              database=DB,
                              user=DB_UNAME,
                              password=DB_PWD,
                              port=port)

    if ret != 1:
        print(" Connection not established. Try again")
        print(" Check internet connectivity")
        return ret

    # ------------------------------------------------------------------------ #
    # 2. DATA EXTRACTION PHASE
    # ------------------------------------------------------------------------ #

    print("\n\n\t\t **** 2. DATA EXTRACTION **** ")

    data_extractor = DataExtractor()
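    # nrows_to_read=-1 presumably means "load every row" (Example no. 1 passes 5000 to sample)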
    data_frame = data_extractor.read_csv(fpath=data_file_path,
                                         nrows_to_read=-1)

    # ------------------------------------------------------------------------ #
    # 3. DATA LOADING PHASE
    # ------------------------------------------------------------------------ #

    print("\n\n\t\t **** 3. DATA LOADING **** ")

    ret = data_loader.load_full_table(data_frame, table_name=RAW_TABLE_NAME)

    if ret == -1:
        print(" Could not upload to database ")
        data_loader.disconnect()
        return ret

    print("Successfully populated database")

    # ------------------------------------------------------------------------ #
    # 4. DISCONNECT THE DATABASE AND CLEAN UP MEMORY
    # ------------------------------------------------------------------------ #

    data_loader.disconnect()

    # ------------------------------------------------------------------------ #
    # 5. SEND A MESSAGE TO THE DATA HUB AS AN UPDATE
    # ------------------------------------------------------------------------ #

    print(" Sending message to data hub for update....", end="")

    messenger = Messenger()
    # Connect to the data hub
    messenger.connect(host=DATA_HUB_IP, uname=DATA_HUB_UNAME, pwd=DATA_HUB_PWD)

    # Connect to the exchange
    messenger.connect_to_exchange(ex_name=EX_NAME)

    # Send update
    message = "Database updated with latest rows"
    messenger.send_message_to_exchange(ex_name=EX_NAME,
                                       message=message,
                                       topic=TOPIC)

    print("sent")

    total_end_time = time.time()

    print(" Total time taken (seconds):", total_end_time - total_start_time)
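
For context, a rough sketch of what the DataLoader and Messenger calls in main() could correspond to, assuming a MySQL target (suggested by port 3306) and a RabbitMQ-style topic exchange. The class names, method names, and return-value conventions come from the example; the SQLAlchemy and pika bodies are illustrative assumptions rather than the project's real code.

import pika
from sqlalchemy import create_engine


class DataLoader:
    """Sketch: loads a DataFrame into a MySQL table via SQLAlchemy."""

    def __init__(self):
        self.engine = None

    def connect(self, host, database, user, password, port):
        try:
            # Assumes the PyMySQL driver is installed.
            url = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
            self.engine = create_engine(url)
            self.engine.connect().close()  # fail fast if the server is unreachable
            return 1
        except Exception:
            return -1

    def load_full_table(self, data_frame, table_name):
        try:
            data_frame.to_sql(table_name, self.engine,
                              if_exists="append", index=False)
            return 1
        except Exception:
            return -1

    def disconnect(self):
        if self.engine is not None:
            self.engine.dispose()


class Messenger:
    """Sketch: publishes update messages to a RabbitMQ topic exchange."""

    def connect(self, host, uname, pwd):
        credentials = pika.PlainCredentials(uname, pwd)
        params = pika.ConnectionParameters(host=host, credentials=credentials)
        self.connection = pika.BlockingConnection(params)
        self.channel = self.connection.channel()

    def connect_to_exchange(self, ex_name):
        self.channel.exchange_declare(exchange=ex_name, exchange_type="topic")

    def send_message_to_exchange(self, ex_name, message, topic):
        self.channel.basic_publish(exchange=ex_name,
                                   routing_key=topic,
                                   body=message)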