def cleanUpAddress(dataName, previousStageNumber, thisStageNumber, qualityFlag,
                   operatorToUse, operationFlag):

    dataFlow, fullPackagePath = openDataFlowPackage(dataName,
                                                    previousStageNumber,
                                                    qualityFlag)

    if dataFlow:

        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))

        if operationFlag != '':

            # First replace the escape character with comma + space
            dataFlow = dataFlow.str_replace(operationFlag, r'\0d0a', ', ')

            # Now capture any empty addresses
            dataFlow = dataFlow.replace(operationFlag, r', , , ', None)
            print('{0}: cleaned up addresses in column {1}'.format(
                dataName, operationFlag))

            # Will now try to split out the address into component columns...
            # builder = dataFlow.builders.split_column_by_example(source_column=operationFlag)
            # builder.add_example(example=('552 Malvern Avenue, St. George, Bristol', \
            #    ['552 Malvern Avenue', 'St. George', 'Bristol']))
            # builder.add_example(example=('224A Tomswood St, Hainault, Ilford, Cornwall', \
            #     ['224A Tomswood St', 'Hainault', 'Ilford', 'Cornwall']))
            # dataFlow = builder.to_dataflow()

            # dataFlow = dataFlow.split_column_by_delimiters(operationFlag, ', ', True)

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, 'A')
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        return dataFlow, columnInventory, dataFlowInventory

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None
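# A minimal usage sketch (the data name, stage numbers, quality flag and column name below
# are hypothetical) showing how these stage functions are driven from the main pipeline
# loop - each returns the transformed data flow plus its column and data flow inventories:
#
#   dataFlow, columnInventory, dataFlowInventory = cleanUpAddress(
#       'PEOPLE', '20', '30', 'A', 'cleanUpAddress', 'ADDRESS')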
Example #2
def removeDuplicates(dataName, previousStageNumber, thisStageNumber,
                     qualityFlag, operatorToUse, operationFlag):

    dataFlow, fullPackagePath = openDataFlowPackage(dataName,
                                                    previousStageNumber,
                                                    qualityFlag)

    if dataFlow:

        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))

        if operationFlag != '':

            columnsToKeep = operationFlag

            numberOfRowsBefore = dataFlow.row_count

            dataFlow = dataFlow.distinct(
                dprep.ColumnSelector(columnsToKeep, True, True, invert=False))
            print(
                '{0}: removed duplicates from column {1}: rows before {2}, rows after {3}'
                .format(dataName, operationFlag, numberOfRowsBefore,
                        dataFlow.row_count))

        else:
            print('{0}: no duplicate processing required'.format(dataName))

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        return dataFlow, columnInventory, dataFlowInventory

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None
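# Note: operationFlag is wrapped in dprep.ColumnSelector with positional flags that appear
# to enable regex and case-insensitive matching, so it can name several columns at once.
# A hypothetical call that de-duplicates on any column matching a customer id or email:
#
#   removeDuplicates('PEOPLE', '30', '40', 'A', 'removeDuplicates', 'CUSTOMER_ID|EMAIL')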
def automaticallyDetectColumnTypes(dataName, previousStageNumber,
                                   thisStageNumber, qualityFlag, operatorToUse,
                                   operationFlag):
    dataFlow, fullPackagePath = openDataFlowPackage(dataName,
                                                    previousStageNumber,
                                                    qualityFlag)

    if dataFlow:

        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))

        # Now perform the operation on the dataFlow : ie automatically detect and set column types
        if operationFlag == 'Yes':
            # Detect and apply column types
            builder = dataFlow.builders.set_column_types()
            builder.learn()
            print('{0}: candidates detected {1}'.format(
                dataName, builder.conversion_candidates))
            builder.ambiguous_date_conversions_keep_month_day()
            dataFlow = builder.to_dataflow()

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, 'A')
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        return dataFlow, columnInventory, dataFlowInventory

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None
def parseNulls(dataName, previousStageNumber, thisStageNumber, qualityFlag,
               operatorToUse, operationFlag):

    dataFlow, fullPackagePath = openDataFlowPackage(dataName,
                                                    previousStageNumber,
                                                    qualityFlag)

    if dataFlow:

        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))

        if operationFlag != '':
            # Get a list of the columns and count them...
            dataFlowColumns = list(dataFlow.get_profile().columns.keys())
            # Replace any occurrences of null (including the custom string) across all columns...
            dataFlow = dataFlow.replace_na(dataFlowColumns,
                                           custom_na_list=operationFlag)
            print(
                '{0}: parsed nulls including custom string {1} from {2} columns'
                .format(dataName, operationFlag, len(dataFlowColumns)))

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, 'A')
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        return dataFlow, columnInventory, dataFlowInventory

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None
def removeRowsFromTop(dataName, previousStageNumber, thisStageNumber,
                      qualityFlag, operatorToUse, operationFlag):

    dataFlow, fullPackagePath = openDataFlowPackage(dataName,
                                                    previousStageNumber,
                                                    qualityFlag)

    if dataFlow:

        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))

        # Now perform the operation on the dataFlow : ie remove the number of rows specified from the top
        numberOfRowsToRemove = int(operationFlag)
        dataFlow = dataFlow.skip(numberOfRowsToRemove)
        print('{0}: removed first {1} row(s)'.format(dataName,
                                                     numberOfRowsToRemove))

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        # Now return all of the components back to the main loop...
        return dataFlow, columnInventory, dataFlowInventory

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None
Example #6
def renameColumns(dataName, previousStageNumber, thisStageNumber, qualityFlag,
                  operatorToUse, operationFlag):

    dataFlow, fullPackagePath = openDataFlowPackage(dataName,
                                                    previousStageNumber,
                                                    qualityFlag)

    if dataFlow:

        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))

        if operationFlag != '':
            # TODO: the actual rename of the columns listed in operationFlag is not implemented here yet
            print('{0}: renamed {1} columns'.format(dataName, operationFlag))
        else:
            print('{0}: no operation to perform'.format(dataName))

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, 'A')
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        return dataFlow, columnInventory, dataFlowInventory

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None
def mapLookups(dataName, previousStageNumber, thisStageNumber, qualityFlag,
               operatorToUse, operationFlag):

    dataFlow, fullPackagePath = openDataFlowPackage(dataName,
                                                    previousStageNumber,
                                                    qualityFlag)

    if dataFlow:

        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))

        if operationFlag != '':

            transforms = load_transformation_configuration('./Config/' +
                                                           operationFlag)

            if len(transforms) > 1:
                lookups = get_lookups_from_transforms(transforms)

                for key in lookups:
                    lookupDictionary = lookups[key]
                    replacements = []

                    dataFlow = dataFlow.set_column_types(
                        {key: dprep.FieldType.STRING})

                    for lookup in lookupDictionary:

                        replacements.append(
                            ReplacementsValue(lookup,
                                              lookupDictionary[lookup]))

                    destination_column = get_destination_column_name(
                        key, transforms)
                    dataFlow = dataFlow.map_column(key, destination_column,
                                                   replacements)
                    print(dataName + ': Transformed lookups for column - ' +
                          key + '. Added new column ' + destination_column)

        else:
            print('{0}: no look-up processing required'.format(dataName))

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        return dataFlow, columnInventory, dataFlowInventory

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None
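# For illustration only (column name and values are made up): the lookups returned by
# get_lookups_from_transforms are treated above as {source_column: {raw_value: mapped_value}},
# e.g. {'GENDER': {'M': 'Male', 'F': 'Female'}}, which map_column uses to add a new
# destination column alongside GENDER containing the mapped values.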
def createUPMDataflow(dataName, previousStageNumber, thisStageNumber,
                      qualityFlag, operatorToUse, operationFlag):

    dataFlow, fullPackagePath = openDataFlowPackage(dataName,
                                                    previousStageNumber,
                                                    qualityFlag)

    if dataFlow:

        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))

        if operationFlag != '':

            mappingConfig = dprep.read_csv(
                './Config/' + operationFlag).to_pandas_dataframe()

            targetDataFlow = dataFlow
            columnsToKeep = ''

            for sourceTable in mappingConfig[mappingConfig.SourceTable ==
                                             dataName]['SourceTable'].unique():
                for sourceColumn, targetColumn in mappingConfig[
                        mappingConfig.SourceTable == sourceTable][[
                            'SourceColumn', 'TargetColumn'
                        ]].values:
                    if columnsToKeep == '':
                        columnsToKeep = targetColumn
                    else:
                        columnsToKeep = columnsToKeep + '|' + targetColumn

                    targetDataFlow = targetDataFlow.rename_columns(
                        {sourceColumn: targetColumn})

            targetDataFlow = targetDataFlow.drop_columns(
                dprep.ColumnSelector(columnsToKeep, True, True, invert=True))
            newPackageName = next(
                iter(mappingConfig[mappingConfig.SourceTable == dataName]
                     ['TargetTable'].unique()))

            createNewPackageDirectory(newPackageName)
            saveDataFlowPackage(targetDataFlow, newPackageName,
                                thisStageNumber, 'A')

        else:
            print('{0}: no UPM mapping required'.format(dataName))

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        return dataFlow, columnInventory, dataFlowInventory

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None
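# A hypothetical illustration of the mapping config CSV read by createUPMDataflow - the
# logic above relies on at least these four columns (table and column names are made up):
#
#   SourceTable,SourceColumn,TargetColumn,TargetTable
#   PEOPLE,FORENAME,FirstName,Person
#   PEOPLE,SURNAME,LastName,Person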
Example #9
    if operationFlag == 'Yes':
        print('{0}: loading data from file path {1}'.format(
            dataName, fullFilePath))
        newDataFlow = dprep.read_csv(fullFilePath)

        dataProfile = newDataFlow.get_profile()

        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        columnInventoryAll = columnInventoryAll.append(columnInventory)
        print('{0}: generated column inventory'.format(dataName))

        dataFlowInventory = getDataFlowStats(newDataFlow, dataProfile,
                                             dataName, thisStageNumber,
                                             operatorToUse, operationFlag)
        dataFlowInventoryAll = dataFlowInventoryAll.append(dataFlowInventory)
        print('{0}: generated data flow inventory'.format(dataName))

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(newDataFlow, dataName,
                                                thisStageNumber, 'A')
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))
    else:
        print('{0}: no package file created.'.format(dataName))

    # Once we have processed all dataflows, we save the inventories away
    saveColumnInventory(columnInventoryAll, thisStageNumber)
    saveDataFlowInventory(dataFlowInventoryAll, thisStageNumber)
Example #10
def splitTableBasedOnSingleColumn(dataName, previousStageNumber,
                                  thisStageNumber, qualityFlag, operatorToUse,
                                  operationFlag):

    dataFlow, fullPackagePath = openDataFlowPackage(dataName,
                                                    previousStageNumber,
                                                    qualityFlag)

    if dataFlow:

        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))

        dataProfile = dataFlow.get_profile()

        # Set up empty intermediate dataframes that we will use to build up inventories at both dataFlow and column level
        dataFlowInventoryIntermediate = pd.DataFrame()
        columnInventoryIntermediate = pd.DataFrame()

        if operationFlag != '':

            # First, grab the unique set of values in the column
            valuesInColumn = dataProfile.columns[operationFlag].value_counts

            # Now filter the original data flow based on each unique value in turn and fork a new data flow!
            for valueToSplitOn in valuesInColumn:

                newDataFlow = dataFlow.filter(
                    dataFlow[operationFlag] == valueToSplitOn.value)

                # Create a new name for this data flow based on concatenation of source dataflow, column name and value used for filter
                newDataName = dataName + '_' + operationFlag + '_' + valueToSplitOn.value

                newDataProfile = newDataFlow.get_profile()

                # Now generate column and data flow inventories
                columnInventory = getColumnStats(newDataProfile, newDataName,
                                                 thisStageNumber,
                                                 operatorToUse, operationFlag)
                dataFlowInventory = getDataFlowStats(
                    newDataFlow, newDataProfile, newDataName, thisStageNumber,
                    operatorToUse, operationFlag)

                # Capture the column inventory for the new dataflow
                columnInventoryIntermediate = columnInventoryIntermediate.append(
                    columnInventory)

                # Capture the data flow inventory for the new data flow
                dataFlowInventoryIntermediate = dataFlowInventoryIntermediate.append(
                    dataFlowInventory)

                # Finally save the data flow so it can be passed onto the next stage of the process...
                targetPackagePath = saveDataFlowPackage(
                    newDataFlow, newDataName, thisStageNumber, qualityFlag)
                print('{0}: saved package to {1}'.format(
                    newDataName, targetPackagePath))

        else:
            print('{0}: no operation required'.format(dataName))

        # Generate column and data flow inventories for the source dataflow
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        columnInventoryIntermediate = columnInventoryIntermediate.append(
            columnInventory)

        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)
        dataFlowInventoryIntermediate = dataFlowInventoryIntermediate.append(
            dataFlowInventory)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        # Now return all of the components back to the main loop...
        return dataFlow, columnInventoryIntermediate, dataFlowInventoryIntermediate

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None
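# For example (hypothetical names): splitting a 'PEOPLE' data flow on a 'REGION' column with
# values 'North' and 'South' forks and saves two new packages, 'PEOPLE_REGION_North' and
# 'PEOPLE_REGION_South', in addition to re-saving the original 'PEOPLE' flow.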
def quarantineRows(dataName, previousStageNumber, thisStageNumber, qualityFlag,
                   operatorToUse, operationFlag):

    dataFlow, fullPackagePath = openDataFlowPackage(dataName,
                                                    previousStageNumber,
                                                    qualityFlag)

    if dataFlow:

        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))

        # Now perform the operation on the dataFlow : ie quarantine any rows that have data in unexpected extra columns

        # First count the number of columns found
        dataFlowColumns = list(dataFlow.get_profile().columns.keys())
        numberOfColumnsFound = len(dataFlowColumns)

        # Now convert the operationFlag to an integer
        headerCount = int(operationFlag)

        # If we have more columns than expected, we quarantine rows which have values in the extra columns
        if numberOfColumnsFound > headerCount:
            # NOTE - this logic assumes that all unwanted columns are on the far right, this could be improved!
            # Fork a new data flow with rows that have data in the unexpected columns
            print(
                '{0}: we have found {1} columns, expected {2} so will now quarantine any rows with data in them '
                .format(dataName, numberOfColumnsFound, headerCount))
            quarantinedDataFlow = dataFlow.drop_nulls(
                dataFlowColumns[headerCount:])

            # There is a chance we have an extra column but no rows to quarantine, so check this first
            if quarantinedDataFlow.row_count is None:
                quarantinedRowCount = 0
                print('{0}: no rows to quarantine'.format(dataName))
            else:
                quarantinedRowCount = quarantinedDataFlow.row_count
                # Finally save the data flow so it can be used later
                fullPackagePath = saveDataFlowPackage(quarantinedDataFlow,
                                                      dataName,
                                                      thisStageNumber, 'B')
                print('{0}: quarantined {1} rows of data to {2}'.format(
                    dataName, quarantinedRowCount, fullPackagePath))

            # Now filter out the quarantined rows from the main data set
            # NOTE : can't figure out a better way of doing this for now - see note below...
            for columnToCheck in dataFlowColumns[headerCount:]:
                # NOTE - don't know why commented line of code below doesn't work!
                # dataFlow = dataFlow.filter(dataFlow[columnToCheck] != '')
                # Workaround: the assertion below fails (and so produces an error value) for
                # rows whose extra column is empty; keeping only those error rows filters out
                # the quarantined rows that had spill-over data
                dataFlow = dataFlow.assert_value(columnToCheck,
                                                 value != '',
                                                 error_code='ShouldBeNone')
                dataFlow = dataFlow.filter(col(columnToCheck).is_error())
                print('{0}: filtered column {1}, row count now {2}'.format(
                    dataName, columnToCheck, dataFlow.row_count))

            # Finally drop the extra columns
            dataFlow = dataFlow.drop_columns(dataFlowColumns[headerCount:])
            print('{0}: dropped {1} unwanted columns'.format(
                dataName, len(dataFlowColumns[headerCount:])))
        else:
            print(
                '{0}: we have found {1} columns, expected {2} so not going to do anything'
                .format(dataName, numberOfColumnsFound, headerCount))

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved package to {1}'.format(dataName, targetPackagePath))

        # Now return all of the components back to the main loop...
        return dataFlow, columnInventory, dataFlowInventory

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None
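# Note the quality-flag convention used above: the cleaned data flow is saved under the
# pipeline's normal quality flag, while the quarantined rows are saved separately under
# flag 'B' so they can be picked up and inspected later.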
Example #12
def joinTables(dataName, previousStageNumber, thisStageNumber, qualityFlag,
               operatorToUse, operationFlag):

    dataFlow, fullPackagePath = openDataFlowPackage(dataName,
                                                    previousStageNumber,
                                                    qualityFlag)

    if dataFlow:

        print('{0}: loaded package from path {1}'.format(
            dataName, fullPackagePath))

        # Set up empty intermediate dataframes that we will use to build up inventories at both dataFlow and column level
        dataFlowInventoryIntermediate = pd.DataFrame()
        columnInventoryIntermediate = pd.DataFrame()

        if operationFlag != '':

            # Load config file
            joinConfig = dprep.read_csv('./Config/' +
                                        operationFlag).to_pandas_dataframe()

            # For each config in the file...
            for index, row in joinConfig.iterrows():

                leftDataName = row['LeftDataName']
                leftDataFlowJoinColumn = row['LeftDataFlowJoinColumn']
                rightDataName = row['RightDataName']
                rightDataFlowJoinColumn = row['RightDataFlowJoinColumn']
                joinType = row['JoinType']
                print(
                    '{0}: ready to join {1} {2} -> {3} {4} using jointype {5}'.
                    format(dataName, leftDataName, leftDataFlowJoinColumn,
                           rightDataName, rightDataFlowJoinColumn, joinType))

                # Load right hand data flow
                rightDataFlow, fullPackagePath = openDataFlowPackage(
                    rightDataName, previousStageNumber, qualityFlag)
                print('{0}: loaded package from path {1}'.format(
                    rightDataName, fullPackagePath))

                # We always perform the inner "MATCH" stype join
                join_builder = dataFlow.builders.join(
                    right_dataflow=rightDataFlow,
                    left_column_prefix=dataName + '_',
                    right_column_prefix=rightDataName + '_')
                join_builder.detect_column_info()
                join_builder.join_key_pairs = [(leftDataFlowJoinColumn,
                                                rightDataFlowJoinColumn)]
                # Setting up join type:
                # NONE = 0
                # MATCH = 2
                # UNMATCHLEFT = 4
                # UNMATCHRIGHT = 8
                join_builder.join_type = 2
                innerDataFlow = join_builder.to_dataflow()
                print('{0} created inner dataflow : Columns : {1}, Rows : {2}'.
                      format(dataName,
                             len(innerDataFlow.get_profile().columns),
                             innerDataFlow.row_count))

                if joinType == "LEFT":
                    # Use the "UNMATCHLEFT" setting to grab the rows that haven't been joined from the left data flow
                    join_builder.join_type = 4
                    leftUnmatchedDataFlow = join_builder.to_dataflow()
                    print(
                        '{0} created left unmatched dataflow : Columns : {1}, Rows : {2}'
                        .format(
                            dataName,
                            len(leftUnmatchedDataFlow.get_profile().columns),
                            leftUnmatchedDataFlow.row_count))

                    # Now append this dataflow to the original inner join dataflow, to create a "left outer join"
                    newDataFlow = innerDataFlow.append_rows(
                        [leftUnmatchedDataFlow])
                else:
                    newDataFlow = innerDataFlow

                # Create a new name for this data flow based on concatenation of left dataflow and right
                newDataName = dataName + '_' + rightDataName

                # Output key stats
                print('{0} left table : Columns : {1}, Rows : {2}'.format(
                    leftDataName, len(dataFlow.get_profile().columns),
                    dataFlow.row_count))
                print(
                    '{0} right table : Columns : {1}, Rows : {2}'.format(
                        rightDataName,
                        len(rightDataFlow.get_profile().columns),
                        rightDataFlow.row_count))

                newDataProfile = newDataFlow.get_profile()

                print(
                    '{0} joined table : Columns : {1}, Rows : {2}'.format(
                        newDataName, len(newDataProfile.columns),
                        newDataFlow.row_count))

                # Now generate column and data flow inventories
                columnInventory = getColumnStats(newDataProfile, newDataName,
                                                 thisStageNumber,
                                                 operatorToUse, operationFlag)
                dataFlowInventory = getDataFlowStats(
                    newDataFlow, newDataProfile, newDataName, thisStageNumber,
                    operatorToUse, operationFlag)

                # Capture the column inventory for the new dataflow
                columnInventoryIntermediate = columnInventoryIntermediate.append(
                    columnInventory)

                # Capture the data flow inventory for the new data flow
                dataFlowInventoryIntermediate = dataFlowInventoryIntermediate.append(
                    dataFlowInventory)

                # Finally save the data flow so it can be passed onto the next stage of the process...
                targetPackagePath = saveDataFlowPackage(
                    newDataFlow, newDataName, thisStageNumber, 'A')
                print('{0}: saved package to {1}'.format(
                    newDataName, targetPackagePath))

        else:
            print('{0}: no joining of tables required'.format(dataName))

        dataProfile = dataFlow.get_profile()

        # Now generate column and data flow inventories
        columnInventory = getColumnStats(dataProfile, dataName,
                                         thisStageNumber, operatorToUse,
                                         operationFlag)
        columnInventoryIntermediate = columnInventoryIntermediate.append(
            columnInventory)

        dataFlowInventory = getDataFlowStats(dataFlow, dataProfile, dataName,
                                             thisStageNumber, operatorToUse,
                                             operationFlag)
        dataFlowInventoryIntermediate = dataFlowInventoryIntermediate.append(
            dataFlowInventory)

        # Finally save the data flow so it can be passed onto the next stage of the process...
        targetPackagePath = saveDataFlowPackage(dataFlow, dataName,
                                                thisStageNumber, qualityFlag)
        print('{0}: saved source package to {1}'.format(
            dataName, targetPackagePath))

        return dataFlow, columnInventoryIntermediate, dataFlowInventoryIntermediate

    else:
        print('{0}: no package file found at location {1}'.format(
            dataName, fullPackagePath))
        return None, None, None
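# A hypothetical illustration of the join config CSV read by joinTables - one row per join,
# using these columns (data names, join columns and values are made up; any JoinType other
# than LEFT results in a plain inner join):
#
#   LeftDataName,LeftDataFlowJoinColumn,RightDataName,RightDataFlowJoinColumn,JoinType
#   PEOPLE,ADDRESS_ID,ADDRESSES,ID,LEFT
#   PEOPLE,ACCOUNT_ID,ACCOUNTS,ID,INNER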