Example #1
import json
import sys

# project-specific helpers used below (checkForDupes, printParent, groupByID,
# listToDict, concat) are assumed to be imported from elsewhere in this package
def sum(dataDescription, X, headerRow, idColumn, trainingLength, outputColumn):
    if checkForDupes(idColumn, trainingLength):
        printParent('We have found multiple rows with the same ID in them.')
        printParent('We are going to assume that all rows with the same ID should be summed together intelligently.')
        printParent('This will transform your dataset from being "long" to being "wide".')
        printParent('If this is not what you intended, please submit an issue and/or a Pull Request explaining your situation!')
        # TODO: we now have two different return formats: dictionaries, and lists
        # probably easiest to convert X to dictionaries here regardless of whether it has dupes or not
        # hmmm, imputing missing values would likely be less useful for these cases
            # it's more likely that we would have a separate file entirely for the metadata associated with each row (name, age, gender if our repeated ID is a customerID)
            # let's ignore this for now (MVP!), and then think later about imputing missing values on only the non-joined data, then joining in that data later. that would likely be much more space efficient than joining in that data up front
        return groupByID(dataDescription, X, headerRow, idColumn, trainingLength, outputColumn)
    else:
        return [listToDict.all(X, headerRow), idColumn, trainingLength, outputColumn]
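
# A minimal sketch of the "long to wide" idea described in the messages above:
# rows sharing an ID have their numeric columns summed into a single row. This
# is only an illustration of the concept, not the project's groupByID
# implementation; the helper name and list-of-lists row format are assumptions.
def _exampleSumRowsByID(rows, idIndex=0):
    summedRows = {}
    for row in rows:
        rowID = row[idIndex]
        values = [val for colIdx, val in enumerate(row) if colIdx != idIndex]
        if rowID in summedRows:
            summedRows[rowID] = [a + b for a, b in zip(summedRows[rowID], values)]
        else:
            summedRows[rowID] = values
    return summedRows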

# grab arguments
args = json.loads(sys.argv[1])
trainingFile = args["trainingData"]
testingFile = args["testingData"]
test = args["test"]
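
# For illustration only (hypothetical script name and file names): this script
# expects a single JSON string as its command-line argument, e.g.
#   python format.py '{"trainingData": "train.csv", "testingData": "test.csv", "test": false, "verbose": 1}'
# showing just the keys this section reads.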

# 1. concatenate the training and testing data sets together
# this ensures that whatever transformations we perform in data-formatter are applied equally to both the training and testing data sets
# dataDescription identifies whether each column is "output", "id", "categorical", or "continuous"
dataDescription, headerRow, trainingLength, X, idColumn, outputColumn, idHeader, problemType, dataDescriptionRaw, hasCustomValidationSplit, validationSplitColumn = concat.inputFiles(
    trainingFile, testingFile
)
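# For illustration only (hypothetical columns): dataDescription lines up with
# headerRow, e.g. a headerRow of ['userID', 'age', 'city', 'price'] might have a
# dataDescription of ['id', 'continuous', 'categorical', 'output']
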
if args["verbose"] != 0:
    printParent("finished concatenating the training and testing files together")


labelEncoded = False
labelMapping = None
try:
    # if every output value parses as a float, the output column is already numeric
    for val in outputColumn:
        float(val)
except (ValueError, TypeError):
    # otherwise the outputs are categorical labels, so label encode them below
    labelEncoded = True
    # build a list of all the unique values in outputColumn
    uniqueOutputVals = list(set(outputColumn))
    labelMapping = {}
    for idx, val in enumerate(uniqueOutputVals):
        labelMapping[val] = idx
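    # e.g. an outputColumn of ['spam', 'ham', 'spam'] yields a labelMapping such
    # as {'spam': 0, 'ham': 1}; the exact numbering depends on set() ordering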
    for idx, val in enumerate(outputColumn):