def joinTrainValid(trainData, validData):
    """
    Concatenate the train and validation DataFrames into one frame,
    tagging each row with an 'isValid' indicator column
    (0 = training row, 1 = validation row).

    :param trainData: pandas DataFrame of training rows
    :param validData: pandas DataFrame of validation rows
    :return: combined DataFrame with the extra 'isValid' column
    """
    info("Joining train and validation data", ind=6)
    # Scalar assign broadcasts to every row; no need to build a Series
    trainData = trainData.assign(isValid=0)
    info("Train data has size " + getDim(trainData), ind=6)
    validData = validData.assign(isValid=1)
    info("Validation data has size " + getDim(validData), ind=6)
    # DataFrame.append is deprecated (removed in pandas 2.0) -> use concat
    pddf = pd.concat([trainData, validData])
    info("Combined data has size " + getDim(pddf), ind=6)
    return pddf
def joinTrainTest(trainData, testData):
    """
    Concatenate the train and test DataFrames into one frame,
    tagging each row with an 'isTrain' indicator column
    (1 = training row, 0 = test row).

    :param trainData: pandas DataFrame of training rows
    :param testData: pandas DataFrame of test rows
    :return: combined DataFrame with the extra 'isTrain' column
    """
    info("Joining train and test data", ind=6)
    # Scalar assign broadcasts to every row; no need to build a Series
    trainData = trainData.assign(isTrain=1)
    info("Train data has size " + getDim(trainData), ind=6)
    testData = testData.assign(isTrain=0)
    info("Test data has size " + getDim(testData), ind=6)
    # DataFrame.append is deprecated (removed in pandas 2.0) -> use concat
    pddf = pd.concat([trainData, testData])
    info("Combined data has size " + getDim(pddf), ind=6)
    return pddf
def getTrainData(config):
    """
    Load the saved training features and target from disk.

    :param config: run configuration dict (used to derive file names)
    :return: (X_train, y_train) tuple, or None when the files are missing
    """
    names = getTrainTestNames(config)
    X_trainName = names[0]
    y_trainName = names[3]
    # Guard clause: bail out early when either training file is absent
    if not (isFile(X_trainName) and isFile(y_trainName)):
        error("Train data is not ready")
        return None
    info("Loading {0}".format(X_trainName), ind=4)
    X_train = getJoblib(X_trainName)
    info("Found data that is {0}".format(getDim(X_train)), ind=4)
    info("Loading {0}".format(y_trainName), ind=4)
    y_train = getJoblib(y_trainName)
    info("Found data that is {0}".format(getDim(y_train)), ind=4)
    return X_train, y_train
def testModel(modelname, estimator, X_test, config):
    """
    Score a fitted estimator on test features.

    For classification problems, fills 'label' (predicted classes) and
    'prob' (probability of the positive class, column 1 of predict_proba).
    For regression problems, fills 'pred'. Any prediction failure sets
    'good' to False instead of raising.

    :param modelname: name of the model (for logging)
    :param estimator: fitted estimator, or a dict wrapping one under 'estimator'
    :param X_test: test feature matrix
    :param config: run configuration dict; 'problem' selects the problem type
    :return: dict with keys 'good', 'label', 'prob', 'pred'
    """
    info("Testing a {0} estimator".format(modelname), ind=0)
    info("X data is {0}".format(getDim(X_test)), ind=2)
    problemType = config['problem']
    results = {"good": True, "label": None, "prob": None, "pred": None}
    # Unwrap the {'estimator': ...} dict form if that is what we were given
    if isinstance(estimator, dict):
        estimator = estimator['estimator']
    if estimator is None:
        error("The {0} estimator is NULL".format(modelname))
        results['good'] = False
        return results
    if isClassification(problemType):
        info("Predicting classification labels/classes for {0}".format(modelname), ind=4)
        try:
            results['label'] = estimator.predict(X_test)
        except Exception:
            # narrowed from a bare except: don't mask KeyboardInterrupt/SystemExit
            results['good'] = False
            error("There is a problem getting labels for {0}".format(modelname), ind=4)
        info("Predicting classification probabilities for {0}".format(modelname), ind=4)
        try:
            proba = estimator.predict_proba(X_test)
            results['prob'] = proba[:, 1]
        except Exception:
            results['good'] = False
            error("There is a problem getting probabilities for {0}".format(modelname), ind=4)
    if isRegression(problemType):
        info("Predicting regression score/output for {0}".format(modelname), ind=4)
        try:
            results['pred'] = estimator.predict(X_test)
        except Exception:
            results['good'] = False
            error("There is a problem getting prediction for {0}".format(modelname), ind=4)
    if results['good']:
        info("Everything looks good for the {0} estimator".format(modelname), ind=4)
    else:
        info("There is a problem with the {0} estimator".format(modelname), ind=4)
    return results
def readData(config):
    """
    Load the raw dataset selected by config['name'].

    Known names: 'uptake', 'kdd99', the sklearn-style sets
    ('boston', 'diabetes', 'wine', 'digits', 'cancer'), and the
    synthetic generators 'regression' and 'classification'.

    :param config: run configuration dict; 'name' selects the dataset
    :return: pandas DataFrame with the loaded data
    :raises ValueError: when the name is not recognized
    """
    info("Getting data for analysis")
    ## Get name
    name = config['name']
    builtinSets = ("boston", "diabetes", "wine", "digits", "cancer")
    ## Load the data we need
    if name == "uptake":
        frame = readUptake(config)
    elif name == "kdd99":
        frame = readKDD99(config)
    elif name in builtinSets:
        frame = readDataset(config, name)
    elif name == "regression":
        frame = makeRegression(config)
    elif name == "classification":
        frame = makeClassification(config)
    else:
        raise ValueError("Name", name, "not recognized in readData()")
    info("Using data that is " + getDim(frame), ind=0)
    return frame
def getEncodedData(pddata):
    """
    Label-encode the categorical columns of a DataFrame.

    The original categorical columns are dropped from *pddata* (mutated
    in place via dropEncodedColumns) and their integer-encoded versions
    are returned as a separate frame together with the fitted encoders.

    :param pddata: input DataFrame (categorical columns removed in place)
    :return: (pddata, encodedCatData, labelEncoders) tuple
    """
    info('Convert Categorical Data To Integer', ind=4)
    ## label encode data
    labelEncoders, results = getLabelEncoders(pddata)
    ## collect encoded values into a frame, one column per categorical feature
    encodedColumns = {}
    for cat_colname, label_encoder, encoded in results:
        encodedColumns[cat_colname] = encoded
    encodedCatData = pd.DataFrame(encodedColumns)
    ## drop the now-redundant source columns
    info('Dropping original '+getNcols(encodedCatData, asStr=True)+' columns', ind=6)
    dropEncodedColumns(pddata, encodedCatData.columns)
    info('Original data is now '+getDim(pddata), ind=6)
    return pddata, encodedCatData, labelEncoders
def loadTrainTestData(config):
    """
    Load all six saved train/test/validation datasets from disk.

    :param config: run configuration dict (used to derive file names)
    :return: (X_train, X_test, X_valid, y_train, y_test, y_valid) tuple,
             or None when any of the six files is missing
    """
    names = getTrainTestNames(config)
    # Guard clause: every one of the six files must exist
    if not all(isFile(n) for n in names):
        error("Train/test datasets are not ready!")
        return None
    info("Loading saved final train/test datasets.", ind=2)

    def _load(fname):
        # One dataset: announce, load, report its size
        info("Loading {0}".format(fname), ind=4)
        data = getJoblib(fname)
        info("Found data that is {0}".format(getDim(data)), ind=4)
        return data

    # Order matches getTrainTestNames:
    # X_train, X_test, X_valid, y_train, y_test, y_valid
    return tuple(_load(n) for n in names)
def getTrainTestData(pddf, config):
    """
    Split the combined DataFrame into train/test (and optional validation)
    datasets, save all six pieces to disk, and return them.

    Three cases, detected from indicator columns on *pddf*:
      - 'isTrain' present: the data arrived pre-split into train/test.
      - 'isValid' present: validation rows are marked; train/test are
        created from the remaining rows with an 80/20 split.
      - neither: a plain 80/20 train/test split of all rows.

    :param pddf: combined DataFrame containing the target column
                 (mutated: target/indicator columns are dropped at the end)
    :param config: run configuration dict ('target', 'output' sections)
    :return: (X_train, X_test, X_valid, y_train, y_test, y_valid);
             the valid pieces are None unless 'isValid' was present
    :raises ValueError: when the target column is missing
    """
    info("Creating final train/test datasets.", ind=0)
    ## Config info
    targetConfig = config['target']
    targetcol = targetConfig['colname']
    outputConfig = config['output']
    compress = outputConfig['compress']
    if not isColumn(pddf, targetcol):
        raise ValueError("Target column", targetcol, "is not included in data!")
    ## Determine if the data showed up split (separate train/test files)
    isSplit = False
    isValid = False
    if isColumn(pddf, "isTrain"):
        info("Data is already split", ind=2)
        isSplit = True
    elif isColumn(pddf, "isValid"):
        info("Validation data is ready, but train/test data must be created", ind=2)
        isValid = True
    else:
        info("Train/test data must be created", ind=2)
    ## Create data if it's split
    if isSplit:
        info("Splitting train data", ind=2)
        # .copy() so the in-place drops below act on an independent frame,
        # not a view of pddf (avoids SettingWithCopy problems)
        X_train = pddf[pddf['isTrain'] == 1].copy()
        y_train = X_train[targetcol]
        X_train.drop(labels=[targetcol, 'isTrain'], axis=1, inplace=True)
        info("Splitting test data", ind=2)
        X_test = pddf[pddf['isTrain'] == 0].copy()
        y_test = X_test[targetcol]
        X_test.drop(labels=[targetcol, 'isTrain'], axis=1, inplace=True)
        X_valid = None
        y_valid = None
    elif isValid:
        info("Splitting validation data", ind=2)
        X_valid = pddf[pddf['isValid'] == 1].copy()
        y_valid = X_valid[targetcol]
        # BUG FIX: the target and indicator columns were previously left in
        # the validation features
        X_valid.drop(labels=[targetcol, 'isValid'], axis=1, inplace=True)
        info("Creating train/test data that contains validated data", ind=2)
        X_data = pddf[pddf['isValid'] == 0].copy()
        y = X_data[targetcol]
        # BUG FIX: drop the target before splitting; previously the target
        # column leaked into X_train/X_test as a feature
        X_data.drop(labels=[targetcol, 'isValid'], axis=1, inplace=True)
        X_train, X_test, y_train, y_test = train_test_split(X_data, y, test_size=0.2)
    else:
        info("Creating train/test data that is not already split or validated", ind=2)
        y = pddf[targetcol]
        pddf.drop(labels=[targetcol], axis=1, inplace=True)
        X_train, X_test, y_train, y_test = train_test_split(pddf, y, test_size=0.2)
        X_valid = None
        y_valid = None
    ## Clean the indicator/target columns off the source frame as well
    if isSplit:
        info("Dropping {0} from DataFrame".format(", ".join([targetcol, 'isTrain'])))
        pddf.drop(labels=[targetcol, 'isTrain'], axis=1, inplace=True)
    elif isValid:
        info("Dropping {0} from DataFrame".format(", ".join([targetcol, 'isValid'])))
        pddf.drop(labels=[targetcol, 'isValid'], axis=1, inplace=True)
    ## Save everything to disk
    X_trainName, X_testName, X_validName, y_trainName, y_testName, y_validName = getTrainTestNames(
        config)
    info("Saving {0} data to {1}".format(getDim(X_train), X_trainName), ind=4)
    saveJoblib(X_trainName, X_train, compress)
    info("Saving {0} data to {1}".format(getDim(X_test), X_testName), ind=4)
    saveJoblib(X_testName, X_test, compress)
    info("Saving {0} data to {1}".format(getDim(X_valid), X_validName), ind=4)
    saveJoblib(X_validName, X_valid, compress)
    info("Saving {0} data to {1}".format(getDim(y_train), y_trainName), ind=4)
    saveJoblib(y_trainName, y_train, compress)
    info("Saving {0} data to {1}".format(getDim(y_test), y_testName), ind=4)
    saveJoblib(y_testName, y_test, compress)
    info("Saving {0} data to {1}".format(getDim(y_valid), y_validName), ind=4)
    saveJoblib(y_validName, y_valid, compress)
    return X_train, X_test, X_valid, y_train, y_test, y_valid
def formatData(trainData, testData, config):
    """
    Format separate train/test DataFrames: binarize the target for
    classification problems, replace NA values, and drop unwanted columns.
    Both frames are mutated in place and returned.

    NOTE(review): a later definition of formatData in this module shadows
    this one, so this two-frame variant is effectively dead at import time
    — confirm before relying on it.

    :param trainData: training DataFrame (must contain the target column)
    :param testData: test DataFrame (target column optional)
    :param config: run configuration dict ('target', 'feature' sections)
    :return: (trainData, testData) tuple
    :raises ValueError: when the target column is missing from trainData
    """
    info('Formatting training data of size ' + getDim(trainData), ind=0)
    info('Formatting testing data of size ' + getDim(testData), ind=0)
    ## Config info
    targetConfig = config['target']
    targetcol = targetConfig['colname']
    positiveTarget = targetConfig['positive']
    targetNAstrategy = targetConfig['NAstrategy']
    featureConfig = config['feature']
    featureNAstrategy = featureConfig['NAstrategy']
    if not isColumn(trainData, targetcol):
        raise ValueError("Target column", targetcol, "is not a valid column.")
    # 1) Get problem type (honor an explicit setting, else infer from target)
    targetData = trainData[targetcol]
    if config.get('problem'):
        problemType = config['problem']
    else:
        problemType = getProblemType(targetData)
        config['problem'] = problemType
    # 2) format target based on what we want
    info('Formatting target', ind=1)
    if isClassification(problemType):
        convertToBinaryInt(trainData, targetcol, positiveTarget)
        # test data may be unlabeled, so only convert when the target exists
        if isColumn(testData, targetcol):
            convertToBinaryInt(testData, targetcol, positiveTarget)
    if isRegression(problemType):
        info('Not formatting target since it is regression', ind=1)
    # 3) replace NA
    # (removed a Python-2-only `print featureNAstrategy` debug statement)
    info('Replace NA in data', ind=1)
    replaceTargetNA(trainData, targetcol, targetNAstrategy)
    replaceFeatureNA(trainData, targetcol, featureNAstrategy)
    if isColumn(testData, targetcol):
        replaceTargetNA(testData, targetcol, targetNAstrategy)
        replaceFeatureNA(testData, targetcol, featureNAstrategy)
    # 4) drop columns we don't need
    dropData(trainData, config)
    dropData(testData, config)
    # (removed an unreachable tail after this return: it called the
    # categorical-to-numeric conversion and then returned an undefined
    # name `pddata`; it could never execute)
    return trainData, testData
def formatData(pddf, config):
    """
    Format a single combined DataFrame for modeling: binarize the target
    (classification only), replace NA values, drop unwanted columns, and
    label/one-hot encode the categorical features.

    NOTE(review): this definition shadows an earlier formatData in this
    module that took separate train/test frames.

    :param pddf: input DataFrame containing the target column (mutated by
                 the in-place helpers; a new joined frame is returned)
    :param config: run configuration dict ('target', 'feature' sections)
    :return: formatted DataFrame
    :raises ValueError: when the target column is missing
    """
    info('Formatting data of size ' + getDim(pddf), ind=0)
    ## Config info
    targetConfig = config['target']
    targetcol = targetConfig['colname']
    positiveTarget = targetConfig['positive']
    targetNAstrategy = targetConfig['NAstrategy']
    featureConfig = config['feature']
    featureNAstrategy = featureConfig['NAstrategy']
    if not isColumn(pddf, targetcol):
        raise ValueError("Target column", targetcol, "is not a valid column.")
    # 1) Get problem type (honor an explicit config setting, else infer)
    targetData = pddf[targetcol]
    if config.get('problem'):
        problemType = config['problem']
    else:
        problemType = getProblemType(targetData)
        config['problem'] = problemType
    # 2) format target based on what we want
    info('Formatting target', ind=2)
    if isClassification(problemType):
        convertToBinaryInt(pddf, targetcol, positiveTarget)
    if isRegression(problemType):
        info('Not formatting target since it is regression', ind=1)
    # 3) replace NA
    info('Replace NA in data', ind=2)
    replaceTargetNA(pddf, targetcol, targetNAstrategy)
    replaceFeatureNA(pddf, targetcol, featureNAstrategy)
    # 4) remove low variance data (log only; not implemented here)
    info('Remove low variance in data', ind=2)
    # 5) drop columns we don't need
    info('Analyze data for possible drops', ind=2)
    analyzeColumns(pddf, config)
    dropData(pddf, config)
    info('Post column data the data is now ' + getDim(pddf), ind=2)
    # 6) label and one-hot encode data
    info('Label encode training data to numeric', ind=2)
    pddf, encodedCatData, labelEncoders = getEncodedData(pddf)
    info('Hot encode training data to sparse data frame', ind=1)
    encodedData = getHotEncodedData(encodedCatData, labelEncoders)
    info('Join training data together', ind=2)
    # join the one-hot columns back onto the (now category-free) frame
    pddf = pddf.join(encodedData)
    info('Post formatting the data is now ' + getDim(pddf), ind=2)
    # 7) replace low variance (log only; classification handling is TODO)
    info('Remove low variance features in data', ind=2)
    if isClassification(problemType):
        info('Classification is To do!', ind=4)
    if isRegression(problemType):
        info('Not removing any features since it is regression', ind=1)
    # 8) replace NA (if any remain after encoding/joining)
    info('Replace NA (if any) in data', ind=2)
    replaceTargetNA(pddf, targetcol, targetNAstrategy)
    replaceFeatureNA(pddf, targetcol, featureNAstrategy)
    # final sanity check: any column still containing nulls is an error
    if sum(pddf.isnull().any()) > 0:
        error("There are still NA entries in the dataset!", ind=4)
    info('Finished formatting data. Data is now ' + getDim(pddf), ind=2)
    return pddf
def trainModel(modelname, X_train, y_train, config):
    """
    Build, optionally tune, and fit the named estimator.

    Behavior flags come from the model's config entry:
      - 'tune' False, 'fit' True, or 'cv' True disable parameter tuning
      - 'refit' True re-fits with probability=True enabled (when supported)
      - 'error' True marks the model as unusable (returns None)

    :param modelname: name of the model to train
    :param X_train: training feature matrix
    :param y_train: training target
    :param config: run configuration dict ('problem' plus model settings)
    :return: fitted estimator, or None when the model is marked bad
    """
    info("Training a {0} estimator".format(modelname), ind=0)
    info("X data is {0}".format(getDim(X_train)), ind=2)
    info("y data is {0}".format(getDim(y_train)), ind=2)
    problemType = config['problem']
    info("This is a {0} problem".format(problemType), ind=2)
    modelData = getModelData(config, modelname)
    tuneForParams = True
    refitModel = False
    goodModel = True
    if modelData is not None:
        # 'fit' and 'cv' both imply a plain fit with no parameter search
        if modelData.get('tune') is False:
            tuneForParams = False
        if modelData.get('fit') is True:
            tuneForParams = False
        if modelData.get('cv') is True:
            tuneForParams = False
        if modelData.get('refit') is True:
            refitModel = True
        if modelData.get('error') is True:
            goodModel = False
    else:
        info("No model parameters were given. Using default {0} estimator".format(modelname), ind=4)
        tuneForParams = False
    if goodModel is False:
        error("Model {0} is no good and will not run it.".format(modelname))
        return None

    #################################################################
    # Get Model
    #################################################################
    retval = getModel(config, modelname)

    #################################################################
    # Tune Parameters
    #################################################################
    estimator = retval['estimator']
    params = retval['params']
    if tuneForParams:
        tuneResults = tuneModel(modelname, estimator, params, X_train, y_train, config)
        estimator = tuneResults['estimator']
        params = tuneResults['params']
    if refitModel:
        try:
            # probability=True is only supported by some estimators (e.g. SVC)
            estimator.set_params(probability=True)
            info("Set probability to True for model refit", ind=4)
        except Exception:
            # narrowed from a bare except; a failure here is non-fatal
            info("Could not set probability to True for model refit")
        info("Re-fitting for {0} model parameters with probability".format(modelname), ind=4)
        estimator = estimator.fit(X_train, y_train)
        info("Finished re-fitting {0} model parameters with probability".format(modelname), ind=4)
    else:
        if estimator is not None:
            info("Fitting for {0} model parameters".format(modelname), ind=2)
            estimator = estimator.fit(X_train, y_train)
            info("Finished fitting {0} model parameters".format(modelname), ind=4)
        else:
            error("No model with name {0} was trained".format(modelname))
    return estimator
def readCSV(filename):
    """
    Read a CSV file into a pandas DataFrame, logging the file name and
    resulting size.

    :param filename: path of the CSV file to read
    :return: pandas DataFrame with the file contents
    """
    info("Reading data [" + filename + "]")
    # low_memory=False: read the whole file in one pass for consistent dtypes
    frame = read_csv(filename, low_memory=False)
    info("Read data with size " + getDim(frame))
    return frame