def clusters(dataSet, features, exemplars):
    """Predict a population label for the objects of a data set.

    A random forest is trained on a random sub-sample of the well type
    annotations, overridden by the user-chosen exemplars, and is then applied
    to the scaled feature values of the whole data set.

    Args:
        dataSet: Data set identifier, passed through to the data helpers.
        features: Iterable of feature names to train on. When empty, all
            image features of the data set are used.
        exemplars: Anything accepted by ``dict()`` that maps a population id
            to an iterable of object indices. When empty, no training is done.

    Returns:
        Integer numpy array of predicted labels. Label 2 ("unsure") is used
        where the classifier's top probability does not exceed the configured
        confidence threshold, and for every object when there is nothing to
        train on.
    """
    unsureLabel = 2

    ftrs = list(features)
    wI = wellIndex(dataSet)
    objectCount = len(wI)

    # Fallback answer: unsure about every object.
    # Builtin int replaces the np.int alias, which newer NumPy no longer has.
    allUnsure = np.empty(objectCount, dtype=int)
    allUnsure.fill(unsureLabel)

    # Default to use all features if none have been selected.
    if not ftrs:
        ftrs = data.imageFeatures(dataSet)

    # No training input provided: unsure about all input.
    if not (ftrs and exemplars):
        return allUnsure

    # Training feature data: one row per object, one column per feature.
    # A plain ndarray is used instead of the deprecated np.matrix.
    # NOTE(review): assumes scaledArray returns a 1-D sequence per feature.
    valueMatrix = np.array(
        [scaledArray(dataSet, ftr) for ftr in ftrs]).transpose()

    # Construct labels from the well type annotation.
    trainingLabels = np.copy(wI['type'].values)

    # Knock out a large part of the training values (to speed up training).
    # NOTE(review): this reads configuration(dataSet) while the threshold
    # below reads data.config(dataSet) -- confirm both are intended.
    trainingSample = (np.random.rand(trainingLabels.size)
                      < configuration(dataSet).wellTypeSample)
    trainingLabels = np.where(trainingSample, trainingLabels, np.nan)

    # Override well type annotations where exemplars have been chosen by user.
    for popId, members in dict(exemplars).items():
        for exemplar in members:
            trainingLabels[exemplar] = popId

    # Prune training features and labels, based on presence of labels.
    labelled = ~np.isnan(trainingLabels)
    if not labelled.any():
        # Nothing survived sampling and no exemplar was given; fitting on an
        # empty training set would fail, so report everything as unsure.
        return allUnsure
    trainingValues = valueMatrix[labelled]
    trainingLabels = trainingLabels[labelled]

    print("Begin training")
    forest = RandomForestClassifier(
        n_estimators=10,
        n_jobs=-1,
        class_weight="balanced"
    )
    forest = forest.fit(trainingValues, trainingLabels)
    print("End training")

    print("Begin classification")
    confidenceThreshold = data.config(dataSet).classifierConfidenceThreshold
    probabilities = forest.predict_proba(valueMatrix)
    maxProb = np.max(probabilities, axis=1)
    maxArgProb = np.argmax(probabilities, axis=1)
    # Index the class labels directly; np.choose caps the number of choices
    # and would fail for data sets with many populations.
    mostLikely = forest.classes_[maxArgProb]
    predicted = np.where(
        maxProb > confidenceThreshold, mostLikely, unsureLabel).astype(int)
    print("End classification")

    return predicted
def featureOrdering(dataSet):
    """Return the image feature names of *dataSet*, reordered by correlation.

    The pairwise correlation of the image features is computed on a small
    sample of the data set, turned into a distance (1 - |correlation|), and
    handed to ``rearrange``. The indices it returns are used to pick the
    feature names in their new order.
    """
    # Function-scope import keeps the ordering dependency local to this call.
    from ordering.rearrange import rearrange

    print("Order features by correlation")

    sample = smallSample(dataSet)
    featureTable = selectImageFeatures(dataSet, sample)

    # Strongly (anti-)correlated features end up close to each other.
    dissimilarity = 1 - featureTable.corr().abs()
    order = rearrange(dissimilarity.values)

    names = data.imageFeatures(dataSet)
    return [names[position] for position in order]
def featureHistograms(dataSet, featureSet, exemplars, bins):
    """Compute a histogram for every (cluster, image feature) combination.

    The clusters come from ``clustersAsMap``; each combination is handed to
    ``featureHistogram`` through a process pool.

    Returns:
        A dict mapping each cluster to a dict that maps feature to histogram.
    """
    partition = clustersAsMap(dataSet, featureSet, exemplars)

    # All computation combinations, feature-major.
    tasks = []
    for feature in data.imageFeatures(dataSet):
        for cluster, _ in partition.iteritems():
            tasks.append(
                (dataSet, featureSet, exemplars, feature, cluster, bins))

    # Farm the tasks out and wait until every worker has finished.
    workers = Pool()
    pending = workers.imap(featureHistogram, tasks)
    workers.close()
    workers.join()

    # Start with an empty feature table per cluster, then fill in the
    # (feature, cluster, histogram) triples returned by the workers.
    histograms = {}
    for cluster, _ in partition.iteritems():
        histograms[cluster] = {}
    for feature, cluster, histogram in pending:
        histograms[cluster][feature] = histogram

    return histograms
def featureColumns(dataSet):
    """Return every image feature of *dataSet* as a standardized row.

    Each feature column is loaded as float64, centered on its mean and
    divided by its standard deviation.

    Returns:
        A numpy array built from one standardized column per image feature,
        in the order given by ``data.imageFeatures(dataSet)``.
    """
    def column(colName):
        col = data.numpyDump(dataSet, colName).astype(np.float64)
        centered = col - np.mean(col)
        spread = np.std(col)
        # A constant column has zero spread; dividing would turn it into NaN
        # (0 / 0). Its centered values are already all zero, so return those.
        if spread == 0:
            return centered
        return centered / spread

    return np.array([column(col) for col in data.imageFeatures(dataSet)])
def selectImageFeatures(dataSet, subset):
    """Index *subset* with the image feature names of *dataSet*.

    NOTE(review): presumably *subset* is a table whose columns include the
    image features -- confirm against callers.
    """
    featureNames = data.imageFeatures(dataSet)
    return subset[featureNames]