def main(date, modelType):
    """
    Runs the training script.

    Trains the requested model type (or every registered model when
    modelType is ALL), prints the training accuracy, testing accuracy and
    confusion matrix, and hands each fitted model to FileIO.saveModel.

    :param date: Date the training and testing data was collected (YYYY_MMDD)
    :param modelType: (string) key into MODELS of the model to train, or ALL
    :return: (None)
    """

    # Make sure that the model is a valid choice
    if modelType not in MODELS and modelType != ALL:
        # Single pre-formatted argument prints the same on Python 2 and 3
        print("Invalid model type: %s" % (modelType,))
        return

    # Allow for training more than one model at a time
    if modelType == ALL:
        modelsToTrain = list(MODELS.keys())
    else:
        modelsToTrain = [modelType]

    # Load the training and testing data into memory
    trainX, trainY = FileIO.loadTrainingData(date)
    testX, testY = FileIO.loadTestingData(date)

    # The feature matrices are passed through np.nan_to_num before fitting
    trainX = np.nan_to_num(trainX)
    testX = np.nan_to_num(testX)

    # A separate loop name keeps the modelType argument intact
    for modelKey in modelsToTrain:

        # Train the desired ML model
        name, clfType = MODELS[modelKey]
        hyperparameters = HYPERPARAMETERS[modelKey]

        print("Training the %s" % (name,))
        clf = clfType(**hyperparameters)
        clf.fit(trainX, trainY)

        # Perform some very basic accuracy testing
        trainResult = clf.predict(trainX)
        testResult = clf.predict(testX)

        trainingAccuracy = accuracy_score(trainY, trainResult)
        testingAccuracy = accuracy_score(testY, testResult)
        confusionMatrix = confusion_matrix(testY, testResult)

        print("Training Accuracy: %s" % (trainingAccuracy,))
        print("Testing Accuracy: %s" % (testingAccuracy,))
        print("Confusion Matrix:")
        print(confusionMatrix)
        print(" ")

        # Save the model to disk
        FileIO.saveModel(clf, modelKey, date)
def main(date):
    """
    Trains a random forest and plots its feature importances.

    The importances are drawn as a bar chart sorted from most to least
    important; the x tick labels are the original feature (column) indices.

    :param date: Date the training and testing data was collected (YYYY_MMDD)
    :return: (None)
    """

    # Load the training data into memory
    trainX, trainY = FileIO.loadTrainingData(date)
    trainX = np.nan_to_num(trainX)

    # Train the random forest on the training data, one job per CPU core
    numCores = multiprocessing.cpu_count()
    forest = RandomForestClassifier(n_estimators=500, random_state=0, n_jobs=numCores)
    forest.fit(trainX, trainY)

    # Feature indices ordered by decreasing importance
    importances = forest.feature_importances_
    indices = np.argsort(importances)[::-1]

    # Plot the feature importances of the forest
    numFeatures = trainX.shape[1]
    barPositions = range(numFeatures)

    plt.figure()
    plt.title("Feature importances")
    plt.bar(barPositions, importances[indices], color="r", align="center")
    plt.xticks(barPositions, indices)
    plt.xlim([-1, numFeatures])
    plt.show()
def main(date):
    """
    Fits a linear classifier (SGDClassifier with default settings) to the
    training data using all wavelengths, then plots the absolute value of
    the learned weights against wavelength. One figure is shown for each of
    the first three rows of the coefficient matrix, so the user can see how
    much each wavelength contributes to that class's decision.

    :param date: (string) Data collection date YYYY_MMDD
    :return: (None)
    """

    # Pull the training set off disk
    samples, labels = FileIO.loadTrainingData(date)
    samples = np.nan_to_num(samples)

    # Fit the classifier
    classifier = SGDClassifier()
    classifier.fit(samples, labels)

    # Only the magnitude of a weight matters for the visualization
    absoluteWeights = np.fabs(classifier.coef_)

    # NOTE(review): assumes the classifier was fitted on three classes - confirm
    for classIndex in range(3):
        plt.plot(WAVELENGTHS, absoluteWeights[classIndex])

        className = RESISTANCE_STRINGS[INDEX_TO_LABEL[classIndex]]
        plt.title("Linear Classifier Weights for " + className + " vs Others")
        plt.xlabel("Wavelength (nm)")
        plt.ylabel("Absolute Weight")
        plt.show()
def main(date, delete, keywords=None, byLeaf=True, saveProportion=0.5):
    """
    Generates ML training and testing data from extracted CSV files

    :param date: (string) Data collection date YYYY_MMDD
    :param delete: (boolean) Determines whether or not to delete the existing
                             training/testing data files
    :param keywords: (list of strings) Data filename keywords. Defaults to an
                                       empty list when omitted.
    :param byLeaf: (boolean) Should we separate the train/test data by leaf, or
                             should we randomly separate the data according to
                             a set proportion?
    :param saveProportion: (float) Amount of each CSV file to save as training
                                   and testing data.
    :return: (None)
    """

    # A fresh list per call: a literal [] default would be one shared object
    if keywords is None:
        keywords = []

    # Get the data files we will be looking at
    dataPath = DATA_DIRECTORIES[date]
    dataFilenames = FileIO.getDatafileNames(dataPath, keywords)

    # If desired, remove the old training data and start fresh
    if delete:
        mlDataPath = DATA_DIRECTORIES[date + "_ML"]
        for relativePath in (TRAINING_DATA_PATH, TESTING_DATA_PATH, SAMPLE_COUNTS_PATH):
            stalePath = os.path.join(mlDataPath, relativePath)
            if os.path.exists(stalePath):
                os.remove(stalePath)

    # Consolidate the CSV files into training and testing data
    (train_X, train_y, test_X, test_y) = DataManipulation.separateTrainTest(
        dataPath,
        dataFilenames,
        byLeaf=byLeaf,
        saveProportion=saveProportion)

    # Save the training and testing data in the proper spot
    FileIO.saveTrainingData(date, train_X, train_y)
    FileIO.saveTestingData(date, test_X, test_y)
def main(date, takeSubset=False):
    """
    Reduces the dimensionality of the training data with PCA and plots the
    result. The idea is to bring out separability between the resistance
    classes which may be hidden in the dimensionality of the data.

    One 6-component IncrementalPCA model is fitted to all of the selected
    samples; every resistance class is then projected with that same model,
    so the classes share a coordinate system. The first three components are
    drawn as a 3D scatter plot, and all six component vectors are plotted
    against wavelength in two figures of three subplots each.

    :param date: (string) Data collection date YYYY_MMDD
    :param takeSubset: (boolean) Transform and plot a random subset of the
                                 training data? The subset holds at most
                                 NUM_SAMPLES samples.
    :return: (None)
    """

    mkl.set_num_threads(8)

    # Load the training data into memory
    trainX, trainY = FileIO.loadTrainingData(date)

    if takeSubset:
        # Sampling without replacement raises if more samples are requested
        # than exist, so cap the request at the size of the data set
        subsetSize = min(NUM_SAMPLES, len(trainY))
        indices = np.random.choice(len(trainY), size=subsetSize, replace=False)
        X = trainX[indices, :]
        y = trainY[indices]
    else:
        X = trainX
        y = trainY

    X = np.nan_to_num(X)

    # Fit the PCA once on all samples. Refitting per class would give every
    # class its own basis and leave components_ describing only the class
    # fitted last.
    pca = IncrementalPCA(n_components=6)
    pca.fit(X)

    # Break the data into resistance classes, project each one with the
    # shared model and build its 3D trace (order: SUS, DR, GR)
    classStyles = (
        (Constants.SUSCEPTIBLE, 'rgba(255, 0, 0, 0)'),
        (Constants.DR_RESISTANT, 'rgba(0, 255, 0, 0)'),
        (Constants.GR_RESISTANT, 'rgba(0, 0, 255, 0)'),
    )

    data = []
    for label, lineColor in classStyles:
        classIndex = Constants.LABEL_TO_INDEX[label]
        points = pca.transform(X[y == classIndex, :])

        # NOTE(review): opacity=0 and a zero alpha in the line colour are kept
        # from the original - confirm the markers render as intended
        data.append(go.Scatter3d(
            x=points[:, 0],
            y=points[:, 1],
            z=points[:, 2],
            mode='markers',
            marker=dict(
                size=5,
                line=dict(color=lineColor, width=0.1),
                opacity=0
            )
        ))

    fig = go.Figure(data=data)
    py.iplot(fig, filename='3D PCA Wavelength Plot')

    # Plot the principal components, three per figure
    eigenSpectra = pca.components_
    for firstComponent in (0, 3):
        if firstComponent:
            plt.clf()

        for offset in range(3):
            plt.subplot(3, 1, offset + 1)
            plt.plot(Constants.WAVELENGTHS, eigenSpectra[firstComponent + offset, :])
            if offset == 0:
                # Title belongs to the top subplot only
                plt.title("Principle Components %d - %d"
                          % (firstComponent + 1, firstComponent + 3))

        plt.xlabel("Wavelength (nm)")
        plt.show()
def main(date, modelType, iterations):
    """
    Determines the optimal hyperparameters for a given machine learning model
    for a set of training data.

    For each selected model a RandomizedSearchCV is run over
    PARAMETERS[model]; the accuracies, confusion matrix and the chosen value
    of every searched hyperparameter are printed, and the best estimator is
    handed to FileIO.saveModel.

    :param date: Date the training and testing data was collected (YYYY_MMDD)
    :param modelType: (string) key into MODELS of the model to tune, or ALL
    :param iterations: (int) number of iterations for hyperparameter searching
    :return: (None)
    """

    # Make sure that the model is a valid choice
    if modelType not in MODELS and modelType != ALL:
        # Single pre-formatted argument prints the same on Python 2 and 3
        print("Invalid model type: %s" % (modelType,))
        return

    # Allow for training more than one model at a time
    if modelType == ALL:
        modelsToTrain = list(MODELS.keys())
    else:
        modelsToTrain = [modelType]

    # Load the training and testing data into memory
    trainX, trainY = FileIO.loadTrainingData(date)
    testX, testY = FileIO.loadTestingData(date)

    trainX = np.nan_to_num(trainX)
    testX = np.nan_to_num(testX)

    # A separate loop name keeps the modelType argument intact
    for modelKey in modelsToTrain:

        # Search the hyperparameter space of the desired ML model
        name, clfType = MODELS[modelKey]
        searchSpace = PARAMETERS[modelKey]

        print("Training the %s" % (name,))
        baseClassifier = clfType()
        clf = RandomizedSearchCV(baseClassifier,
                                 param_distributions=searchSpace,
                                 n_iter=iterations,
                                 n_jobs=4)
        clf.fit(trainX, trainY)

        # Perform some very basic accuracy testing
        trainResult = clf.predict(trainX)
        testResult = clf.predict(testX)

        trainingAccuracy = accuracy_score(trainY, trainResult)
        testingAccuracy = accuracy_score(testY, testResult)
        confusionMatrix = confusion_matrix(testY, testResult)

        print("Training Accuracy: %s" % (trainingAccuracy,))
        print("Testing Accuracy: %s" % (testingAccuracy,))
        print("Confusion Matrix:")
        print(confusionMatrix)
        print(" ")

        # Report the value chosen for every hyperparameter that was searched
        bestEstimator = clf.best_estimator_
        bestParams = bestEstimator.get_params()

        print("Hyperparameters:")
        for param in searchSpace.keys():
            print("%s: %s" % (param, bestParams[param]))
        print(" ")

        # Save the model to disk
        FileIO.saveModel(bestEstimator, modelKey, date)
def main(date, wavelengths, keywords=None, allSpectra=False):
    """
    Plot three wavelengths against each other from a specified set of data.

    The resistance class of a file is read from the second '_'-separated
    token of its name (extension stripped) and compared, case-sensitively,
    against SUSCEPTIBLE, DR_RESISTANT and GR_RESISTANT.

    :param date: (string) Data collection date YYYY_MMDD
    :param wavelengths: (3-tuple) Wavelengths to plot against another
    :param keywords: (list of strings) Strings which should be included in the
                                       filenames of files being plotted.
                                       Defaults to an empty list when omitted.
    :param allSpectra: (boolean) Determines whether there is one point for
                                 every spectra collected, or one point for
                                 every leaf file
    :return: (None)
    :raises Exception: if a file name carries an unknown resistance token
    """

    # A fresh list per call: a literal [] default would be one shared object
    if keywords is None:
        keywords = []

    # Convert the wavelengths to indices for accessing the data. A real list
    # is needed so it can be indexed on Python 3 as well.
    wavelengthIndices = [wavelengthToIndex(w) for w in wavelengths]
    plotColumns = [wavelengthIndices[0], wavelengthIndices[1], wavelengthIndices[2]]

    # Get the data files we will be looking at
    dataPath = DATA_DIRECTORIES[date]
    filesToPlot = FileIO.getDatafileNames(dataPath, keywords)

    # Per-class lists of (n, 3) arrays, joined once after the loop. This
    # avoids comparing an ndarray with [] and re-copying on every np.append.
    chunks = {SUSCEPTIBLE: [], DR_RESISTANT: [], GR_RESISTANT: []}

    for name in filesToPlot:

        tokens = name[0:-4].split('_')
        resistance = tokens[1]

        filePath = os.path.join(dataPath, name)
        data = FileIO.loadCSV(filePath)

        # Skip anything that is not a 2-D table with at least two columns
        try:
            rows, columns = data.shape
        except (AttributeError, TypeError, ValueError):
            continue
        if columns < 2:
            continue

        if allSpectra:
            # One point per spectrum (row) in the file
            points = np.zeros((rows, 3))
            for axis, column in enumerate(plotColumns):
                points[:, axis] = data[:, column]
        else:
            # One point per file: the mean spectrum at the three wavelengths
            mean = np.mean(data, axis=0)
            points = np.array([[mean[column] for column in plotColumns]])

        if resistance not in chunks:
            raise Exception("Unknown resistance type: " + resistance)
        chunks[resistance].append(points)

    # A class with no files becomes an empty array, as before
    pointsSUS, pointsDR, pointsGR = [
        np.concatenate(chunks[label], axis=0) if chunks[label] else np.array([])
        for label in (SUSCEPTIBLE, DR_RESISTANT, GR_RESISTANT)
    ]

    # Plot the wavelengths
    traceSUS = plotPoints(pointsSUS, RESISTANCE_STRINGS[SUSCEPTIBLE], 'rgba(255, 0, 0, 0)')
    traceDR = plotPoints(pointsDR, RESISTANCE_STRINGS[DR_RESISTANT], 'rgba(0, 255, 0, 0)')
    traceGR = plotPoints(pointsGR, RESISTANCE_STRINGS[GR_RESISTANT], 'rgba(0, 0, 255, 0)')

    layout = go.Layout(
        title='3D Wavelength Plot',
        scene=go.Scene(
            xaxis=go.XAxis(title='Reflectance @ ' + str(wavelengths[0]) + ' nm'),
            yaxis=go.YAxis(title='Reflectance @ ' + str(wavelengths[1]) + ' nm'),
            zaxis=go.ZAxis(title='Reflectance @ ' + str(wavelengths[2]) + ' nm')
        )
    )

    data = [traceSUS, traceDR, traceGR]
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig, filename='3D Wavelength Plot')
def main(date, wavelengths, plotLeaves, binning, keywords=None):
    """
    Collects, per resistance class, the reflectance values used for the
    histograms of a specified list of wavelengths.

    NOTE(review): the visible part of this function only gathers the data;
    no plotting code follows the accumulation here.

    The resistance class of a file is read from the second '_'-separated
    token of its name (extension stripped) and compared, case-sensitively,
    against SUSCEPTIBLE, DR_RESISTANT and GR_RESISTANT. Files whose data
    cannot be extracted are reported on stdout and skipped.

    :param date: (string) Data collection date YYYY_MMDD
    :param wavelengths: (list) Wavelengths to plot histograms
    :param plotLeaves: (boolean) Plot only a single point per leaf vs. all
                                 spectra in a leaf
    :param binning: (float) Wavelength binning width (in nm). A falsy value
                            disables binning.
    :param keywords: (list of strings) Strings which should be included in the
                                       filenames of files being plotted.
                                       Defaults to an empty list when omitted.
    :return: (None)
    :raises Exception: if a file name carries an unknown resistance token
    """

    # A fresh list per call: a literal [] default would be one shared object
    if keywords is None:
        keywords = []

    numHistograms = len(wavelengths)

    # Get the data files we will be looking at
    dataPath = DATA_DIRECTORIES[date]
    filesToPlot = FileIO.getDatafileNames(dataPath, keywords)

    # Per-class lists of row blocks, joined once after the loop.
    # NOTE(review): each class keeps the single all-zero seed row the original
    # accumulators started with - confirm whether consumers expect/strip it.
    chunks = {
        SUSCEPTIBLE: [np.zeros((1, numHistograms))],
        DR_RESISTANT: [np.zeros((1, numHistograms))],
        GR_RESISTANT: [np.zeros((1, numHistograms))],
    }

    for name in filesToPlot:

        tokens = name[0:-4].split('_')
        resistance = tokens[1]

        filePath = os.path.join(dataPath, name)
        data = FileIO.loadCSV(filePath)

        # Extract the relevant data from the spectra in the data file
        try:
            if not binning:
                # One column per requested wavelength
                wavelengthIndices = [wavelengthToIndex(w) for w in wavelengths]
                histogramData = data[:, wavelengthIndices]
            else:
                # One column per wavelength, averaged over its binning region
                indexRegions = [wavelengthRegionToIndices(w, binning) for w in wavelengths]
                rows, _ = data.shape
                histogramData = np.zeros((rows, numHistograms))
                for i, region in enumerate(indexRegions):
                    histogramData[:, i] = np.mean(data[:, region], axis=1)
        except Exception:
            # Best effort: report the offending file and carry on with the rest
            print("Error with file: %s" % (name,))
            continue

        if plotLeaves:
            # Collapse the file to one row: the mean of every column
            newRows = np.mean(histogramData, axis=0).reshape(1, -1)
        else:
            newRows = histogramData

        if resistance not in chunks:
            raise Exception("Unknown resistance type: " + resistance)
        chunks[resistance].append(newRows)

    pointsSUS = np.concatenate(chunks[SUSCEPTIBLE], axis=0)
    pointsDR = np.concatenate(chunks[DR_RESISTANT], axis=0)
    pointsGR = np.concatenate(chunks[GR_RESISTANT], axis=0)
def main(date, takeSubset=False):
    """
    Reduces the dimensionality of the training data with PCA and plots the
    result. The idea is to bring out separability between the resistance
    classes which may be hidden in the dimensionality of the data.

    One 6-component IncrementalPCA model is fitted to all of the selected
    samples; every resistance class is then projected with that same model,
    so the classes share a coordinate system. The first three components are
    drawn as a 3D scatter plot, and all six component vectors are plotted
    against wavelength in two figures of three subplots each.

    :param date: (string) Data collection date YYYY_MMDD
    :param takeSubset: (boolean) Transform and plot a random subset of the
                                 training data? The subset holds at most
                                 NUM_SAMPLES samples.
    :return: (None)
    """

    mkl.set_num_threads(8)

    # Load the training data into memory
    trainX, trainY = FileIO.loadTrainingData(date)

    if takeSubset:
        # Cap the request at the data set size: drawing more samples than
        # exist without replacement would raise
        sampleCount = min(NUM_SAMPLES, len(trainY))
        indices = np.random.choice(len(trainY), size=sampleCount, replace=False)
        X = trainX[indices, :]
        y = trainY[indices]
    else:
        X = trainX
        y = trainY

    X = np.nan_to_num(X)

    # Fit the PCA once on all samples. Calling fit_transform per class would
    # refit the model each time, giving every class a different basis and
    # leaving components_ describing only the class fitted last.
    pca = IncrementalPCA(n_components=6)
    pca.fit(X)

    # Project each resistance class with the shared model and build its
    # 3D trace (order: SUS, DR, GR)
    labelColors = (
        (Constants.SUSCEPTIBLE, 'rgba(255, 0, 0, 0)'),
        (Constants.DR_RESISTANT, 'rgba(0, 255, 0, 0)'),
        (Constants.GR_RESISTANT, 'rgba(0, 0, 255, 0)'),
    )

    data = []
    for label, lineColor in labelColors:
        classMask = (y == Constants.LABEL_TO_INDEX[label])
        projected = pca.transform(X[classMask, :])

        # NOTE(review): opacity=0 and a zero alpha in the line colour are kept
        # from the original - confirm the markers render as intended
        trace = go.Scatter3d(x=projected[:, 0], y=projected[:, 1], z=projected[:, 2],
                             mode='markers',
                             marker=dict(size=5,
                                         line=dict(color=lineColor, width=0.1),
                                         opacity=0))
        data.append(trace)

    fig = go.Figure(data=data)
    py.iplot(fig, filename='3D PCA Wavelength Plot')

    # Plot the principal components, three per figure
    eigenSpectra = pca.components_
    for firstComponent in (0, 3):
        if firstComponent:
            plt.clf()

        for offset in range(3):
            plt.subplot(3, 1, offset + 1)
            plt.plot(Constants.WAVELENGTHS, eigenSpectra[firstComponent + offset, :])
            if offset == 0:
                # Title belongs to the top subplot only
                plt.title("Principle Components %d - %d"
                          % (firstComponent + 1, firstComponent + 3))

        plt.xlabel("Wavelength (nm)")
        plt.show()