def readUptake(config):
    """Return the combined uptake train/validation DataFrame.

    A previously cached frame (plus its drop-list file) is reused when
    present; otherwise the train/valid CSVs are read, joined, and the
    result is cached to disk along with a fresh feature drop list.
    """
    info("Getting uptake data.")
    datadir = getDataDir(config)
    outCfg = config['output']
    dataName = setFile(datadir, outCfg['name'])
    dlFile = setFile(datadir, config['feature']['dropList'])

    # Fast path: both the cached frame and the drop list already exist.
    if isFile(dataName) and isFile(dlFile):
        info("Loading previously create data frames")
        return getJoblib(dataName)

    inCfg = config['input']
    trainData = readCSV(setFile(datadir, inCfg['train']))
    validData = readCSV(setFile(datadir, inCfg['valid']))
    ## Remove 'market' from validation data
    validData.drop(labels='market', axis=1, inplace=True)
    pddf = joinTrainValid(trainData, validData)

    info("Saving training and validation data")
    saveJoblib(dataName, pddf, outCfg['compress'])
    info("Wrote training and validation data to " + dataName)
    info("Saving feature data")
    writeDropList(dlFile, pddf, dlData=None)
    return pddf
def getTrainData(config):
    """Load the cached training features and labels from disk.

    Returns:
        (X_train, y_train) on success, or (None, None) when the cached
        files are not yet available.
    """
    X_trainName, X_testName, X_validName, y_trainName, y_testName, y_validName = getTrainTestNames(
        config)
    if isFile(X_trainName) and isFile(y_trainName):
        info("Loading {0}".format(X_trainName), ind=4)
        X_train = getJoblib(X_trainName)
        info("Found data that is {0}".format(getDim(X_train)), ind=4)
        info("Loading {0}".format(y_trainName), ind=4)
        y_train = getJoblib(y_trainName)
        info("Found data that is {0}".format(getDim(y_train)), ind=4)
        return X_train, y_train
    error("Train data is not ready")
    # Bug fix: return a 2-tuple so callers that unpack
    # "X_train, y_train = getTrainData(config)" get (None, None)
    # instead of raising TypeError on a bare None.
    return None, None
def runModels(config, models=None, force=False):
    """Train, test, and persist each requested model.

    Args:
        config: project configuration dict.
        models: model name or list of model names (None means no models).
        force: when True, retrain models that already have a saved estimator.

    Returns:
        dict mapping model name -> performance metrics for every model
        that was (re)trained in this call.
    """
    X_train, X_test, X_valid, y_train, y_test, y_valid = createData(config)
    # Robustness: the old code turned models=None into [None], which would
    # crash inside getModelFileName; treat None as "nothing to run".
    if models is None:
        models = []
    if not isinstance(models, list):
        models = [models]
    perfs = {}
    for modelname in models:
        modelFileName = getModelFileName(config, modelname)
        if isFile(modelFileName):
            info("Already have {0} estimator.".format(modelname))
            if force is False:
                continue
            else:
                info("Will rerun {0} estimator.".format(modelname))
        clf = trainModel(modelname, X_train, y_train, config)
        tval = testModel(modelname, clf, X_test, config)
        perf = getModelPerformance(y_test, tval, config)
        modelResults = {
            "name": modelname,
            "estimator": clf,
            "test": tval,
            "perf": perf
        }
        info("Saving {0} estimator".format(modelname), ind=2)
        saveTrainedModel(config, modelResults)
        perfs[modelname] = perf
    # Bug fix: perfs was accumulated but never returned, so callers
    # could not see the performance results.
    return perfs
def dropData(pddf, config):
    """Drop, in place, every column of pddf flagged Drop?==1 in the drop-list file.

    Silently does nothing when the drop-list file does not exist.
    """
    info("Dropping columns", ind=4)
    basepath = config['basepath']
    name = config['name']
    dropListFile = config['feature']['dropList']
    dname = setSubDir(basepath, ['data', name])
    dlFile = setFile(dname, dropListFile)
    if not isFile(dlFile):
        info("There is no drop file. Not doing anything.", ind=4)
        return
    # The first line of the drop file holds the fixed-width column widths.
    # Bug fix: use a with-block so the file handle is closed (the original
    # open(...) was never closed).
    with open(dlFile) as fp:
        widths = [int(x) for x in fp.readline().replace("\n", "").split(',')]
    dlData = read_fwf(dlFile, widths=widths, skiprows=1)
    drops = dlData['Feature'][dlData['Drop?'] == 1]
    info("Dropping " + getNrows(drops, asStr=True) + " columns", ind=6)
    info("Data has " + getNrows(pddf, asStr=True) + " rows and " +
         getNcols(pddf, asStr=True) + " cols", ind=6)
    pddf.drop(labels=drops.values, axis=1, inplace=True)
    info("Data now has " + getNrows(pddf, asStr=True) + " rows and " +
         getNcols(pddf, asStr=True) + " cols", ind=6)
def analyzeColumns(pddf, config):
    """Analyze columns and rewrite the drop-list file with drop flags.

    Flags columns with high cardinality (>= 200 distinct values) or with
    too many missing values (>= 25% of rows); the target column is never
    flagged. Silently does nothing when the drop-list file is missing.
    """
    info("Analyzing " + getNcols(pddf, asStr=True) +
         " columns to possible drops.", ind=2)
    targetConfig = config['target']
    targetcol = targetConfig['colname']
    basepath = config['basepath']
    name = config['name']
    dropListFile = config['feature']['dropList']
    dname = setSubDir(basepath, ['data', name])
    dlFile = setFile(dname, dropListFile)
    if not isFile(dlFile):
        info("There is no drop file. Not doing anything.", ind=4)
        return
    # The first line of the drop file holds the fixed-width column widths.
    # Bug fix: with-block closes the handle (the original never closed it).
    with open(dlFile) as fp:
        widths = [int(x) for x in fp.readline().replace("\n", "").split(',')]
    dlData = read_fwf(dlFile, widths=widths, skiprows=1)
    ## Keep record of manual overrides (rows where Drop? was pre-filled).
    ## TODO: overrides are recorded but not yet re-applied after the
    ## automatic flagging below.
    overrides = dlData['Feature'][dlData['Drop?'].isnull() == False]
    ## Set drop to 0 initially
    dlData['Drop?'].fillna(0, inplace=True)
    ## Drop anything with high cardinality (>= 200 distinct values)
    dlData['Card'] = dlData['Card'].apply(to_numeric, errors='coerce')
    dlData['Card'].fillna(0, inplace=True)
    dlData.loc[dlData['Card'] >= 200, 'Drop?'] = 1
    ## Drop columns with >= 25% missing data
    maxNA = getNrows(pddf) * 0.25
    dlData.loc[dlData['nNA'] >= maxNA, 'Drop?'] = 1
    ## Lastly, make sure we don't drop the target
    dlData.loc[dlData['Feature'] == targetcol, "Drop?"] = 0
    ## Show features to drop
    drops = dlData['Feature'][dlData['Drop?'] == 1]
    # Bug fix: print statement -> function call (works on Python 2 and 3).
    print(drops)
    ## Rewrite drop list
    writeDropList(dlFile, pddf=None, dlData=dlData)
def readKDD99(config):
    """Load (or download and cache) the KDD '99 dataset as a DataFrame.

    Uses a previously cached frame when both it and the drop-list file
    exist; otherwise fetches via scikit-learn, coerces numeric columns,
    applies column names from names.dat, and caches everything.
    """
    info("Getting KDD '99 data.")
    datadir = getDataDir(config)
    outputConfig = config['output']
    compress = outputConfig['compress']
    dataName = setFile(datadir, outputConfig['name'])
    featureConfig = config['feature']
    dlFile = setFile(datadir, featureConfig['dropList'])
    if isFile(dataName) and isFile(dlFile):
        info("Loading previously create data frames")
        pddf = getJoblib(dataName)
    else:
        info("Downloading KDD '99 data", ind=2)
        tmp = datasets.fetch_kddcup99()
        X = tmp['data']
        y = tmp['target']
        y = y.reshape((y.shape[0], 1))
        pddf = DataFrame(append(arr=X, values=y, axis=1))
        # Probe a small head frame to find numeric columns, then coerce
        # those columns on the full frame.
        tmp = pddf.head(n=1000)
        for column in tmp.columns:
            try:
                tmp[column].mean()
                pddf[column] = to_numeric(pddf[column], errors="coerce")
            except (TypeError, ValueError):
                # Bug fix: was a bare "except:"; only non-numeric columns
                # (mean() failing) should be skipped, not every error.
                continue
        colFile = setFile(datadir, "names.dat")
        # Bug fix: with-block closes the handle (original never closed it).
        with open(colFile) as fp:
            colnames = fp.readlines()
        # colnames[0] lists the target labels and is intentionally skipped;
        # the remaining lines are "feature: type" entries.
        columns = [x.split(":")[0] for x in colnames[1:]]
        columns.append("TARGET")
        pddf.columns = columns
        info("Saving data to {0}".format(dataName))
        saveJoblib(jlfile=dataName, jldata=pddf, compress=compress)
        info("Saving feature data to {0}".format(dlFile))
        writeDropList(dlFile, pddf, dlData=None)
    return pddf
def getMNIST(datadir="/Users/tgadfort/Documents/pymva/data"):
    """Download the MNIST archives (when absent) and cache extracted images.

    Args:
        datadir: base data directory. Generalized from a hard-coded,
            user-specific path; the default preserves the old behavior.
    """
    outdir = setDir(datadir, "mnist")
    names = [
        "train-images-idx3-ubyte.gz", "train-labels-idx1-ubyte.gz",
        "t10k-images-idx3-ubyte.gz", "t10k-labels-idx1-ubyte.gz"
    ]
    for name in names:
        url = "http://yann.lecun.com/exdb/mnist/" + name
        savename = setFile(outdir, name)
        if not isFile(savename):
            urlretrieve(url, savename)
            statinfo = stat(savename)
            print('Succesfully downloaded', savename, statinfo.st_size,
                  'bytes.')
        # Cache the extracted image arrays next to the raw .gz files.
        npfile = setFile(outdir, name.replace(".gz", ".p"))
        if not isFile(npfile):
            data = extract_images(savename)
            saveJoblib(npfile, data)
def isSplitDataReady(config):
    """Return True when all six cached train/test/valid files exist on disk."""
    filenames = getTrainTestNames(config)
    return all(isFile(fname) for fname in filenames)
def loadTrainTestData(config):
    """Load the six cached final train/test/valid datasets.

    Returns (X_train, X_test, X_valid, y_train, y_test, y_valid) when all
    cached files exist; otherwise logs an error and returns None.
    """
    filenames = getTrainTestNames(config)
    # Guard clause: bail out (logging an error) unless every file exists.
    if not all(isFile(fname) for fname in filenames):
        error("Train/test datasets are not ready!")
        return
    info("Loading saved final train/test datasets.", ind=2)
    loaded = []
    for fname in filenames:
        info("Loading {0}".format(fname), ind=4)
        data = getJoblib(fname)
        info("Found data that is {0}".format(getDim(data)), ind=4)
        loaded.append(data)
    X_train, X_test, X_valid, y_train, y_test, y_valid = loaded
    return X_train, X_test, X_valid, y_train, y_test, y_valid