dataProfile = dataFlow.get_profile() # Now generate column and data flow inventories columnInventory = getColumnStats(dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag) dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag) # Finally save the data flow so it can be passed onto the next stage of the process... targetPackagePath = saveDataFlowPackage(dataFlow, dataName, thisStageNumber, qualityFlag) print('{0}: saved package to {1}'.format(dataName, targetPackagePath)) return dataFlow, columnInventory, dataFlowInventory else: print('{0}: no package file found at location {1}'.format( dataName, fullPackagePath)) return None, None, None #%% dataFlowInventoryAll = dataFlowProcessingLoop(previousStageNumber, thisStageNumber, 'A', 'RemoveDuplicates', removeDuplicates) #%% dataFlowInventoryAll
# Now generate column and data flow inventories columnInventory = getColumnStats(dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag) dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag) # Finally save the data flow so it can be passed onto the next stage of the process... targetPackagePath = saveDataFlowPackage(dataFlow, dataName, thisStageNumber, 'A') print('{0}: saved package to {1}'.format(dataName, targetPackagePath)) return dataFlow, columnInventory, dataFlowInventory else: print('{0}: no package file found at location {1}'.format( dataName, fullPackagePath)) return None, None, None #%% dataFlowInventoryAll = dataFlowProcessingLoop(previousStageNumber, thisStageNumber, 'A', 'AutoDetectTypes', automaticallyDetectColumnTypes) #%% dataFlowInventoryAll
dataProfile = dataFlow.get_profile() # Now generate column and data flow inventories columnInventory = getColumnStats(dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag) dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag) # Finally save the data flow so it can be passed onto the next stage of the process... targetPackagePath = saveDataFlowPackage(dataFlow, dataName, thisStageNumber, qualityFlag) print('{0}: saved package to {1}'.format(dataName, targetPackagePath)) return dataFlow, columnInventory, dataFlowInventory else: print('{0}: no package file found at location {1}'.format( dataName, fullPackagePath)) return None, None, None #%% dataFlowInventoryAll = dataFlowProcessingLoop(previousStageNumber, thisStageNumber, 'A', 'UPMDataFlowMapping', createUPMDataflow) #%% dataFlowInventoryAll
dataProfile = dataFlow.get_profile() # Now generate column and data flow inventories columnInventory = getColumnStats(dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag) dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag) # Finally save the data flow so it can be passed onto the next stage of the process... targetPackagePath = saveDataFlowPackage(dataFlow, dataName, thisStageNumber, qualityFlag) print('{0}: saved package to {1}'.format(dataName, targetPackagePath)) return dataFlow, columnInventory, dataFlowInventory else: print('{0}: no package file found at location {1}'.format( dataName, fullPackagePath)) return None, None, None #%% dataFlowInventoryAll = dataFlowProcessingLoop(previousStageNumber, thisStageNumber, 'A', 'MapLookups', mapLookups) #%% dataFlowInventoryAll
dataProfile = dataFlow.get_profile() # Now generate column and data flow inventories columnInventory = getColumnStats(dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag) dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag) # Finally save the data flow so it can be passed onto the next stage of the process... targetPackagePath = saveDataFlowPackage(dataFlow, dataName, thisStageNumber, 'A') print('{0}: saved package to {1}'.format(dataName, targetPackagePath)) return dataFlow, columnInventory, dataFlowInventory else: print('{0}: no package file found at location {1}'.format( dataName, fullPackagePath)) return None, None, None #%% dataFlowInventoryAll = dataFlowProcessingLoop(previousStageNumber, thisStageNumber, 'A', 'CleanUpAddress', cleanUpAddress) #%% dataFlowInventoryAll
dataProfile = dataFlow.get_profile() # Now generate column and data flow inventories columnInventory = getColumnStats(dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag) dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag) # Finally save the data flow so it can be passed onto the next stage of the process... targetPackagePath = saveDataFlowPackage(dataFlow, dataName, thisStageNumber, 'A') print('{0}: saved package to {1}'.format(dataName, targetPackagePath)) return dataFlow, columnInventory, dataFlowInventory else: print('{0}: no package file found at location {1}'.format( dataName, fullPackagePath)) return None, None, None #%% dataFlowInventoryAll = dataFlowProcessingLoop(previousStageNumber, thisStageNumber, 'A', 'RenameColumns', renameColumns) #%% dataFlowInventoryAll
dataProfile = dataFlow.get_profile() # Now generate column and data flow inventories columnInventory = getColumnStats(dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag) dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag) # Finally save the data flow so it can be passed onto the next stage of the process... targetPackagePath = saveDataFlowPackage(dataFlow, dataName, thisStageNumber, 'A') print('{0}: saved package to {1}'.format(dataName, targetPackagePath)) return dataFlow, columnInventory, dataFlowInventory else: print('{0}: no package file found at location {1}'.format( dataName, fullPackagePath)) return None, None, None #%% dataFlowInventoryAll = dataFlowProcessingLoop(previousStageNumber, thisStageNumber, 'A', 'ParseNullString', parseNulls) #%% dataFlowInventoryAll
#%% [markdown]
# # Join Tables - Pass 2
# Second joining pass over the dataflow packages. For each join we need:
# - the left and right dataflows
# - the left and right join keys
# NOTE - future enhancement would be to quarantine orphaned rows from left and right dataflows

#%%
# Third-party libraries plus the shared pipeline helper modules...
import pandas as pd
import azureml.dataprep as dprep
import os
import re
import collections
from azureml.dataprep import col
from azureml.dataprep import Dataflow
from commonDataFlowProcessingLoop import dataFlowProcessingLoop
from commonInventoryCreation import getColumnStats, getDataFlowStats
from commonPackageHandling import openDataFlowPackage, saveDataFlowPackage
from commonJoinHandling import joinTables

# Stage wiring: packages produced by the previous stage are read in, and this
# stage's results are saved under the new stage number.
previousStageNumber = '40'
thisStageNumber = '41'

#%%
# Drive the join across every dataflow package from stage 40 and collect the
# resulting dataflow inventory.
dataFlowInventoryAll = dataFlowProcessingLoop(
    previousStageNumber, thisStageNumber, 'A', 'JoinTablesPass2', joinTables)

#%%
# Echo the inventory so it renders in the notebook cell output.
dataFlowInventoryAll
columnInventoryIntermediate = columnInventoryIntermediate.append( columnInventory) dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag) dataFlowInventoryIntermediate = dataFlowInventoryIntermediate.append( dataFlowInventory) # Finally save the data flow so it can be passed onto the next stage of the process... targetPackagePath = saveDataFlowPackage(dataFlow, dataName, thisStageNumber, qualityFlag) print('{0}: saved package to {1}'.format(dataName, targetPackagePath)) # Now return all of the components badk to the main loop... return dataFlow, columnInventoryIntermediate, dataFlowInventoryIntermediate else: print('{0}: no package file found at location {1}'.format( dataName, fullPackagePath)) return None, None, None #%% dataFlowInventoryAll = dataFlowProcessingLoop(previousStageNumber, thisStageNumber, 'A', 'SplitTable', splitTableBasedOnSingleColumn) #%% dataFlowInventoryAll
# Now generate column and data flow inventories columnInventory = getColumnStats(dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag) dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag) # Finally save the data flow so it can be passed onto the next stage of the process... targetPackagePath = saveDataFlowPackage(dataFlow, dataName, thisStageNumber, qualityFlag) print('{0}: saved package to {1}'.format(dataName, targetPackagePath)) # Now return all of the components badk to the main loop... return dataFlow, columnInventory, dataFlowInventory else: print('{0}: no package file found at location {1}'.format( dataName, fullPackagePath)) return None, None, None #%% dataFlowInventoryAll = dataFlowProcessingLoop(previousStageNumber, thisStageNumber, 'A', 'QuarantineExtraColumns', quarantineRows) #%% dataFlowInventoryAll
# Now generate column and data flow inventories columnInventory = getColumnStats(dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag) dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName, thisStageNumber, operatorToUse, operationFlag) # Finally save the data flow so it can be passed onto the next stage of the process... targetPackagePath = saveDataFlowPackage(dataFlow, dataName, thisStageNumber, qualityFlag) print('{0}: saved package to {1}'.format(dataName, targetPackagePath)) # Now return all of the components badk to the main loop... return dataFlow, columnInventory, dataFlowInventory else: print('{0}: no package file found at location {1}'.format( dataName, fullPackagePath)) return None, None, None #%% dataFlowInventoryAll = dataFlowProcessingLoop(previousStageNumber, thisStageNumber, 'A', 'RemoveRowsFromTop', removeRowsFromTop) #%% dataFlowInventoryAll