def test_classify_elements_all_preexisting_overwrite(self):
    """
    Classify descriptors whose factory-made classification elements all
    claim to already hold classifications, with ``overwrite=True``.

    With overwrite enabled, every element is expected to receive a new
    classification and ``has_classifications`` is expected to never be
    consulted.
    """
    vectors = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    d_elems = []
    for uid, vec in enumerate(vectors):
        d_elems.append(DescriptorMemoryElement('', uid).set_vector(vec))

    # Mocked element type handed to the factory.  The instance obtained from
    # ``from_config()`` is configured to report that it ALREADY has
    # classifications (``has_classifications`` returns True).
    # NOTE(review): presumably the factory creates its elements through
    # ``from_config`` so they are all this same mock -- confirm.
    m_ce_type = mock.MagicMock(name="MockedClassificationElementType")
    c_factory = ClassificationElementFactory(m_ce_type, {})
    m_ce_inst = m_ce_type.from_config()
    m_ce_inst.has_classifications.return_value = True

    # Exhaust the returned iterator so that all elements are processed.
    for _ in self.inst.classify_elements(d_elems, factory=c_factory,
                                         overwrite=True):
        pass

    m_has = m_ce_inst.has_classifications
    m_set = m_ce_inst.set_classification
    # Existence check is never made because overwrite short-circuits it.
    assert m_has.call_count == 0
    assert m_set.call_count == 3
    # The classification maps returned by the dummy classifier must have
    # been set on the factory-created elements.
    for expected_value in (1, 4, 7):
        m_set.assert_any_call({'test': expected_value})

    # Dummy classifier iterator completed to the end.
    self.inst._post_iterator_check.assert_called_once()
def test_classify_elements_all_preexisting(self):
    """
    Classify descriptors whose factory-made classification elements all
    claim to already hold classifications, with ``overwrite=False``.

    Each element's ``has_classifications`` is expected to be checked once
    and no classification is expected to be set.
    """
    vectors = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    d_elems = []
    for uid, vec in enumerate(vectors):
        d_elems.append(DescriptorMemoryElement('', uid).set_vector(vec))

    # Mocked element type handed to the factory.  The instance obtained from
    # ``from_config()`` is configured to report that it ALREADY has
    # classifications (``has_classifications`` returns True).
    # NOTE(review): presumably the factory creates its elements through
    # ``from_config`` so they are all this same mock -- confirm.
    m_ce_type = mock.MagicMock(name="MockedClassificationElementType")
    c_factory = ClassificationElementFactory(m_ce_type, {})
    m_ce_inst = m_ce_type.from_config()
    m_ce_inst.has_classifications.return_value = True

    # Exhaust the returned iterator so that all elements are processed.
    for _ in self.inst.classify_elements(d_elems, factory=c_factory,
                                         overwrite=False):
        pass

    # One existence check per descriptor, and nothing (re)classified.
    assert m_ce_inst.has_classifications.call_count == 3
    m_ce_inst.set_classification.assert_not_called()

    # Dummy classifier iterator completed to the end.
    self.inst._post_iterator_check.assert_called_once()
def test_classify_elements_none_preexisting(self):
    """
    Classify descriptors whose factory-made classification elements all
    report having no classifications, with ``overwrite=False``.

    Each element's ``has_classifications`` is expected to be checked once
    and every element is expected to receive a classification.
    """
    vectors = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    d_elems = []
    for uid, vec in enumerate(vectors):
        d_elems.append(DescriptorMemoryElement('', uid).set_vector(vec))

    # Mocked element type handed to the factory.  The instance obtained from
    # ``from_config()`` is configured to report that it has NO
    # classifications yet (``has_classifications`` returns False).
    # NOTE(review): presumably the factory creates its elements through
    # ``from_config`` so they are all this same mock -- confirm.
    m_ce_type = mock.MagicMock(name="MockedClassificationElementType")
    c_factory = ClassificationElementFactory(m_ce_type, {})
    m_ce_inst = m_ce_type.from_config()
    m_ce_inst.has_classifications.return_value = False

    # Exhaust the returned iterator so that all elements are processed.
    for _ in self.inst.classify_elements(d_elems, factory=c_factory,
                                         overwrite=False):
        pass

    m_has = m_ce_inst.has_classifications
    m_set = m_ce_inst.set_classification
    assert m_has.call_count == 3
    assert m_set.call_count == 3
    # The classification maps returned by the dummy classifier must have
    # been set on the factory-created elements.
    for expected_value in (1, 4, 7):
        m_set.assert_any_call({'test': expected_value})

    # Dummy classifier iterator completed to the end.
    self.inst._post_iterator_check.assert_called_once()
plt.ylabel("Precision") plt.legend(loc='best', fancybox=True, framealpha=0.5) plt.savefig(PLOT_PR_OUTPUT) else: # Using the final trained classifier with open(CLASSIFIER_TRAINING_CONFIG_JSON) as f: classifier_config = json.load(f) log.info("Loading plugins") descriptor_index = MemoryDescriptorIndex( file_cache=DESCRIPTOR_INDEX_FILE_CACHE) #: :type: smqtk.algorithms.Classifier classifier = from_plugin_config(classifier_config['plugins']['classifier'], get_classifier_impls()) c_factory = ClassificationElementFactory(MemoryClassificationElement, {}) #: :type: dict[str, list[str]] phone2shas = json.load(open(PHONE_SHA1_JSON)) #: :type: dict[str, float] phone2score = {} log.info("Classifying phone imagery descriptors") i = 0 descriptor_index_shas = set(descriptor_index.iterkeys()) for p in phone2shas: log.info('%s (%d / %d)', p, i + 1, len(phone2shas)) # Not all source "images" have descriptors since some URLs returned # non-image files. Intersect phone sha's with what was actually # computed. Warn if this reduces descriptors for classification to zero. indexed_shas = set(phone2shas[p]) & descriptor_index_shas
def test_simple_classification(self):
    """
    simple LibSvmClassifier test - 2-class

    Test libSVM classification functionality using random constructed
    data, training the y=0.5 split.

    Random 2D points whose second component is <= 0.45 are used as the
    positive class and those with a second component >= 0.55 as the
    negative class; points in between are used for neither training nor
    testing.  Synchronous ``classify`` results are then compared against
    ``classify_async`` results (default, threaded and multiprocess).
    """
    DIM = 2
    N = 1000
    POS_LABEL = 'positive'
    NEG_LABEL = 'negative'
    d_factory = DescriptorElementFactory(DescriptorMemoryElement, {})
    c_factory = ClassificationElementFactory(
        MemoryClassificationElement, {})

    def make_element(argtup):
        # ``argtup`` is an ``(index, vector)`` pair as produced by
        # ``enumerate``; the index is used as the descriptor UUID.
        (i, v) = argtup
        d = d_factory.new_descriptor('test', i)
        d.set_vector(v)
        return d

    p = multiprocessing.pool.ThreadPool()
    # try/finally so the pool is released even when an assertion fails.
    try:
        # Constructing artificial descriptors
        x = numpy.random.rand(N, DIM)
        x_pos = x[x[:, 1] <= 0.45]
        x_neg = x[x[:, 1] >= 0.55]
        d_pos = p.map(make_element, enumerate(x_pos))
        d_neg = p.map(make_element, enumerate(x_neg, start=N // 2))

        # Create/Train test classifier
        classifier = LibSvmClassifier(
            train_params={
                '-t': 0,  # linear kernel
                '-b': 1,  # enable probability estimates
                '-c': 2,  # SVM-C parameter C
                '-q': '',  # quiet mode
            },
            normalize=None,  # DO NOT normalize descriptors
        )
        classifier.train({POS_LABEL: d_pos, NEG_LABEL: d_neg})

        # Test classifier on freshly drawn points.  UUIDs start at N so
        # they do not collide with the training descriptors.
        x = numpy.random.rand(N, DIM)
        x_pos = x[x[:, 1] <= 0.45]
        x_neg = x[x[:, 1] >= 0.55]
        d_pos = p.map(make_element, enumerate(x_pos, N))
        d_neg = p.map(make_element, enumerate(x_neg, N + N // 2))

        d_pos_sync = {}  # for comparing to async
        for d in d_pos:
            c = classifier.classify(d, c_factory)
            # A positive sample not labeled positive is a false negative.
            ntools.assert_equal(
                c.max_label(), POS_LABEL,
                "Found False negative: %s :: %s"
                % (d.vector(), c.get_classification()))
            d_pos_sync[d] = c

        d_neg_sync = {}
        for d in d_neg:
            c = classifier.classify(d, c_factory)
            # A negative sample not labeled negative is a false positive.
            ntools.assert_equal(
                c.max_label(), NEG_LABEL,
                "Found False positive: %s :: %s"
                % (d.vector(), c.get_classification()))
            d_neg_sync[d] = c

        # test that async classify produces the same results
        # -- d_pos
        m_pos = classifier.classify_async(d_pos, c_factory)
        ntools.assert_equal(
            m_pos, d_pos_sync,
            "Async computation of pos set did not yield "
            "the same results as synchronous "
            "classification.")
        # -- d_neg
        m_neg = classifier.classify_async(d_neg, c_factory)
        ntools.assert_equal(
            m_neg, d_neg_sync,
            "Async computation of neg set did not yield "
            "the same results as synchronous "
            "classification.")
        # -- combined -- threaded
        combined_truth = dict(d_pos_sync)
        combined_truth.update(d_neg_sync)
        m_combined = classifier.classify_async(
            d_pos + d_neg, c_factory,
            use_multiprocessing=False,
        )
        ntools.assert_equal(
            m_combined, combined_truth,
            "Async computation of all test descriptors "
            "did not yield the same results as "
            "synchronous classification.")
        # -- combined -- multiprocess
        m_combined = classifier.classify_async(
            d_pos + d_neg, c_factory,
            use_multiprocessing=True,
        )
        ntools.assert_equal(
            m_combined, combined_truth,
            "Async computation of all test descriptors "
            "(mixed order) did not yield the same results "
            "as synchronous classification.")
    finally:
        # Closing resources
        p.close()
        p.join()
def test_no_save_model_pickle(self):
    """
    Test model preservation across pickling even without model cache file
    paths set.
    """
    svm_params = {
        '-t': 0,  # linear kernel
        '-b': 1,  # enable probability estimates
        '-c': 2,  # SVM-C parameter C
        '-q': '',  # quiet mode
    }
    classifier = LibSvmClassifier(
        train_params=svm_params,
        normalize=None,  # DO NOT normalize descriptors
    )
    ntools.assert_true(classifier.svm_model is None)
    # An untrained classifier must not put __LOCAL__ content in its pickle
    # state, and must survive a pickle round trip.
    ntools.assert_not_in('__LOCAL__', classifier.__getstate__())
    cPickle.loads(cPickle.dumps(classifier))

    # Train an arbitrary model (same data layout as
    # ``test_simple_classification``).
    n_dims = 2
    n_samples = 1000
    pos_label = 'positive'
    neg_label = 'negative'
    d_factory = DescriptorElementFactory(DescriptorMemoryElement, {})
    c_factory = ClassificationElementFactory(
        MemoryClassificationElement, {})

    def make_element(index_and_vector):
        # Build a descriptor from an ``(index, vector)`` pair; the index
        # becomes the descriptor UUID.
        idx, vec = index_and_vector
        elem = d_factory.new_descriptor('test', idx)
        elem.set_vector(vec)
        return elem

    # Artificial descriptors, split on the second component.
    samples = numpy.random.rand(n_samples, n_dims)
    second_component = samples[:, 1]
    pos_samples = samples[second_component <= 0.45]
    neg_samples = samples[second_component >= 0.55]
    pool = multiprocessing.pool.ThreadPool()
    d_pos = pool.map(make_element, enumerate(pos_samples))
    d_neg = pool.map(make_element,
                     enumerate(neg_samples, start=n_samples // 2))
    pool.close()
    pool.join()

    # Training
    classifier.train({pos_label: d_pos, neg_label: d_neg})

    # Classify one random query descriptor with the original classifier.
    query = d_factory.new_descriptor('query', 0)
    query.set_vector(numpy.random.rand(n_dims))
    c_expected = classifier.classify(query, c_factory)

    # A trained classifier should now expose __LOCAL__ content in its
    # pickle state.
    p_state = classifier.__getstate__()
    for state_key in ('__LOCAL__', '__LOCAL_LABELS__', '__LOCAL_MODEL__'):
        ntools.assert_in(state_key, p_state)
    for state_key in ('__LOCAL_LABELS__', '__LOCAL_MODEL__'):
        ntools.assert_true(len(p_state[state_key]) > 0)

    # The restored classifier should classify the same query descriptor
    # the same way.
    #: :type: LibSvmClassifier
    classifier2 = cPickle.loads(cPickle.dumps(classifier))
    c_post_pickle = classifier2.classify(query, c_factory)

    # There may be floating point error, so extract the actual confidence
    # values and compare them rounded to 5 places.
    labels = (pos_label, neg_label)
    post_values = [c_post_pickle[label] for label in labels]
    expected_values = [c_expected[label] for label in labels]
    for expected, actual in zip(expected_values, post_values):
        ntools.assert_almost_equal(expected, actual, 5)
def test_simple_multiclass_classification(self):
    """
    simple LibSvmClassifier test - 3-class

    Test libSVM classification functionality using random constructed
    data, training the y=0.33 and y=.66 split.

    Random 2D points are assigned to one of three classes by their second
    component: <= 0.30 is ``p1``, 0.36 to 0.63 is ``p2`` and >= 0.69 is
    ``p3``; points in the gaps are used for neither training nor testing.
    Synchronous ``classify`` results are then compared against
    ``classify_async`` results (default, threaded and multiprocess).
    """
    DIM = 2
    N = 1000
    P1_LABEL = 'p1'
    P2_LABEL = 'p2'
    P3_LABEL = 'p3'
    d_factory = DescriptorElementFactory(DescriptorMemoryElement, {})
    c_factory = ClassificationElementFactory(
        MemoryClassificationElement, {})
    # Running UUID offset so every generated descriptor gets a unique ID.
    di = 0

    def make_element(argtup):
        # ``argtup`` is an ``(index, vector)`` pair as produced by
        # ``enumerate``; the index is used as the descriptor UUID.
        (i, v) = argtup
        d = d_factory.new_descriptor('test', i)
        d.set_vector(v)
        return d

    p = multiprocessing.pool.ThreadPool()
    # try/finally so the pool is released even when an assertion fails.
    try:
        # Constructing artificial descriptors
        x = numpy.random.rand(N, DIM)
        x_p1 = x[x[:, 1] <= 0.30]
        x_p2 = x[(x[:, 1] >= 0.36) & (x[:, 1] <= 0.63)]
        x_p3 = x[x[:, 1] >= 0.69]
        d_p1 = p.map(make_element, enumerate(x_p1, di))
        di += len(d_p1)
        d_p2 = p.map(make_element, enumerate(x_p2, di))
        di += len(d_p2)
        d_p3 = p.map(make_element, enumerate(x_p3, di))
        di += len(d_p3)

        # Create/Train test classifier
        classifier = LibSvmClassifier(
            train_params={
                '-t': 0,  # linear kernel
                '-b': 1,  # enable probability estimates
                '-c': 2,  # SVM-C parameter C
                '-q': ''  # quiet mode
            },
            normalize=None,  # DO NOT normalize descriptors
        )
        classifier.train({P1_LABEL: d_p1, P2_LABEL: d_p2, P3_LABEL: d_p3})

        # Test classifier on freshly drawn points
        x = numpy.random.rand(N, DIM)
        x_p1 = x[x[:, 1] <= 0.30]
        x_p2 = x[(x[:, 1] >= 0.36) & (x[:, 1] <= 0.63)]
        x_p3 = x[x[:, 1] >= 0.69]
        d_p1 = p.map(make_element, enumerate(x_p1, di))
        di += len(d_p1)
        d_p2 = p.map(make_element, enumerate(x_p2, di))
        di += len(d_p2)
        d_p3 = p.map(make_element, enumerate(x_p3, di))
        di += len(d_p3)

        d_p1_sync = {}
        for d in d_p1:
            c = classifier.classify(d, c_factory)
            ntools.assert_equal(
                c.max_label(), P1_LABEL,
                "Incorrect %s label: %s :: %s"
                % (P1_LABEL, d.vector(), c.get_classification()))
            d_p1_sync[d] = c

        d_p2_sync = {}
        for d in d_p2:
            c = classifier.classify(d, c_factory)
            ntools.assert_equal(
                c.max_label(), P2_LABEL,
                "Incorrect %s label: %s :: %s"
                % (P2_LABEL, d.vector(), c.get_classification()))
            d_p2_sync[d] = c

        d_p3_sync = {}
        for d in d_p3:
            c = classifier.classify(d, c_factory)
            ntools.assert_equal(
                c.max_label(), P3_LABEL,
                "Incorrect %s label: %s :: %s"
                % (P3_LABEL, d.vector(), c.get_classification()))
            d_p3_sync[d] = c

        # test that async classify produces the same results
        # -- p1
        async_p1 = classifier.classify_async(d_p1, c_factory)
        ntools.assert_equal(
            async_p1, d_p1_sync,
            "Async computation of p1 set did not yield "
            "the same results as synchronous computation.")
        # -- p2
        async_p2 = classifier.classify_async(d_p2, c_factory)
        ntools.assert_equal(
            async_p2, d_p2_sync,
            "Async computation of p2 set did not yield "
            "the same results as synchronous computation.")
        # -- p3 (this test has no negative class)
        async_p3 = classifier.classify_async(d_p3, c_factory)
        ntools.assert_equal(
            async_p3, d_p3_sync,
            "Async computation of p3 set did not yield "
            "the same results as synchronous computation.")
        # -- combined -- threaded
        sync_combined = dict(d_p1_sync)
        sync_combined.update(d_p2_sync)
        sync_combined.update(d_p3_sync)
        async_combined = classifier.classify_async(
            d_p1 + d_p2 + d_p3, c_factory,
            use_multiprocessing=False)
        ntools.assert_equal(
            async_combined, sync_combined,
            "Async computation of all test descriptors "
            "did not yield the same results as "
            "synchronous classification.")
        # -- combined -- multiprocess
        async_combined = classifier.classify_async(
            d_p1 + d_p2 + d_p3, c_factory,
            use_multiprocessing=True)
        ntools.assert_equal(
            async_combined, sync_combined,
            "Async computation of all test descriptors "
            "(mixed order) did not yield the same results "
            "as synchronous classification.")
    finally:
        # Closing resources
        p.close()
        p.join()
from smqtk.representation import ClassificationElementFactory
from smqtk.representation.classification_element.memory import \
    MemoryClassificationElement


# Default classification element factory for interfaces: wraps the
# ``MemoryClassificationElement`` type with an empty configuration dict.
DFLT_CLASSIFIER_FACTORY = ClassificationElementFactory(
    MemoryClassificationElement, {}
)
def test_simple_multiclass_classification(self):
    """
    Test libSVM classification functionality using random constructed
    data, training the y=0.33 and y=.66 split.

    Random 2D points are split by their second component: <= 0.30 is
    ``p1``, 0.36 to 0.63 is ``p2`` and >= 0.69 is the negative set, which
    is passed to ``train`` as its second positional argument.  Points in
    the gaps are used for neither training nor testing.
    """
    DIM = 2
    N = 1000
    P1_LABEL = 'p1'
    P2_LABEL = 'p2'
    d_factory = DescriptorElementFactory(DescriptorMemoryElement, {})
    c_factory = ClassificationElementFactory(MemoryClassificationElement, {})
    # Running UUID offset so every generated descriptor gets a unique ID.
    di = 0

    def make_element(argtup):
        # Tuple parameters in the signature (``def f((i, v))``) were
        # removed by PEP 3113, so the ``(index, vector)`` pair is unpacked
        # explicitly instead.
        (i, v) = argtup
        d = d_factory.new_descriptor('test', i)
        d.set_vector(v)
        return d

    p = multiprocessing.pool.ThreadPool()
    # try/finally so the pool is released even when an assertion fails.
    try:
        # Constructing artificial descriptors
        x = numpy.random.rand(N, DIM)
        x_p1 = x[x[:, 1] <= 0.30]
        x_p2 = x[(x[:, 1] >= 0.36) & (x[:, 1] <= 0.63)]
        x_neg = x[x[:, 1] >= 0.69]
        d_p1 = p.map(make_element, enumerate(x_p1, di))
        di += len(d_p1)
        d_p2 = p.map(make_element, enumerate(x_p2, di))
        di += len(d_p2)
        d_neg = p.map(make_element, enumerate(x_neg, di))
        di += len(d_neg)

        # Create/Train test classifier
        classifier = LibSvmClassifier(
            train_params={
                '-t': 0,  # linear kernel
                '-b': 1,  # enable probability estimates
                '-c': 2,  # SVM-C parameter C
                '-q': ''  # quiet mode
            },
            normalize=None,  # DO NOT normalize descriptors
        )
        classifier.train({P1_LABEL: d_p1, P2_LABEL: d_p2}, d_neg)

        # Test classifier on freshly drawn points
        x = numpy.random.rand(N, DIM)
        x_p1 = x[x[:, 1] <= 0.30]
        x_p2 = x[(x[:, 1] >= 0.36) & (x[:, 1] <= 0.63)]
        x_neg = x[x[:, 1] >= 0.69]
        d_p1 = p.map(make_element, enumerate(x_p1, di))
        di += len(d_p1)
        d_p2 = p.map(make_element, enumerate(x_p2, di))
        di += len(d_p2)
        d_neg = p.map(make_element, enumerate(x_neg, di))
        di += len(d_neg)

        for d in d_p1:
            c = classifier.classify(d, c_factory)
            ntools.assert_equal(
                c.max_label(), P1_LABEL,
                "Incorrect %s label: %s :: %s"
                % (P1_LABEL, d.vector(), c.get_classification()))
        for d in d_p2:
            c = classifier.classify(d, c_factory)
            ntools.assert_equal(
                c.max_label(), P2_LABEL,
                "Incorrect %s label: %s :: %s"
                % (P2_LABEL, d.vector(), c.get_classification()))
        for d in d_neg:
            c = classifier.classify(d, c_factory)
            ntools.assert_equal(
                c.max_label(), LibSvmClassifier.NEGATIVE_LABEL,
                "Incorrect %s label: %s :: %s"
                % (LibSvmClassifier.NEGATIVE_LABEL, d.vector(),
                   c.get_classification()))
    finally:
        # Closing resources
        p.close()
        p.join()
def test_simple_classification(self):
    """
    Test libSVM classification functionality using random constructed
    data, training the y=0.5 split.

    Random 2D points whose second component is <= 0.45 are used as the
    positive class and those with a second component >= 0.55 as the
    negative set, which is passed to ``train`` as its second positional
    argument.  Points in between are used for neither training nor
    testing.
    """
    DIM = 2
    N = 1000
    POS_LABEL = 'positive'
    d_factory = DescriptorElementFactory(DescriptorMemoryElement, {})
    c_factory = ClassificationElementFactory(MemoryClassificationElement, {})

    def make_element(argtup):
        # Tuple parameters in the signature (``def f((i, v))``) were
        # removed by PEP 3113, so the ``(index, vector)`` pair is unpacked
        # explicitly instead.
        (i, v) = argtup
        d = d_factory.new_descriptor('test', i)
        d.set_vector(v)
        return d

    p = multiprocessing.pool.ThreadPool()
    # try/finally so the pool is released even when an assertion fails.
    try:
        # Constructing artificial descriptors
        x = numpy.random.rand(N, DIM)
        x_pos = x[x[:, 1] <= 0.45]
        x_neg = x[x[:, 1] >= 0.55]
        d_pos = p.map(make_element, enumerate(x_pos))
        d_neg = p.map(make_element, enumerate(x_neg, start=N // 2))

        # Create/Train test classifier
        classifier = LibSvmClassifier(
            train_params={
                '-t': 0,  # linear kernel
                '-b': 1,  # enable probability estimates
                '-c': 2,  # SVM-C parameter C
                '-q': '',  # quiet mode
            },
            normalize=None,  # DO NOT normalize descriptors
        )
        classifier.train({POS_LABEL: d_pos}, d_neg)

        # Test classifier on freshly drawn points.  UUIDs start at N so
        # they do not collide with the training descriptors.
        x = numpy.random.rand(N, DIM)
        x_pos = x[x[:, 1] <= 0.45]
        x_neg = x[x[:, 1] >= 0.55]
        d_pos = p.map(make_element, enumerate(x_pos, N))
        d_neg = p.map(make_element, enumerate(x_neg, N + N // 2))

        for d in d_pos:
            c = classifier.classify(d, c_factory)
            # A positive sample not labeled positive is a false negative.
            ntools.assert_equal(
                c.max_label(), POS_LABEL,
                "Found False negative: %s :: %s"
                % (d.vector(), c.get_classification()))
        for d in d_neg:
            c = classifier.classify(d, c_factory)
            # A negative sample not labeled negative is a false positive.
            ntools.assert_equal(
                c.max_label(), LibSvmClassifier.NEGATIVE_LABEL,
                "Found False positive: %s :: %s"
                % (d.vector(), c.get_classification()))
    finally:
        # Closing resources
        p.close()
        p.join()
def classifier_kfold_validation():
    """
    Run stratified k-fold cross validation of a configured supervised
    classifier and optionally generate PR / ROC curves from the results.

    Configuration is obtained from the command line through ``cli_parser``
    and ``cli.utility_main_helper``.  The file named by
    ``config['cross_validation']['truth_labels']`` is read as CSV with the
    descriptor UUID in the first column and its truth label in the second.
    For every fold a new classifier is created from configuration, trained
    on the fold's training descriptors grouped by truth label, and applied
    to the fold's test descriptors.  Per-fold truth / probability pairs are
    handed to ``make_pr_curves`` and ``make_roc_curves`` when enabled.
    """
    args = cli_parser().parse_args()
    config = cli.utility_main_helper(default_config, args)
    log = logging.getLogger(__name__)

    #
    # Load configurations / Setup data
    #
    pr_enabled = config['pr_curves']['enabled']
    pr_output_dir = config['pr_curves']['output_directory']
    pr_file_prefix = config['pr_curves']['file_prefix'] or ''
    pr_show = config['pr_curves']['show']

    roc_enabled = config['roc_curves']['enabled']
    roc_output_dir = config['roc_curves']['output_directory']
    roc_file_prefix = config['roc_curves']['file_prefix'] or ''
    roc_show = config['roc_curves']['show']

    log.info("Initializing DescriptorSet (%s)",
             config['plugins']['descriptor_set']['type'])
    #: :type: smqtk.representation.DescriptorSet
    descriptor_set = from_config_dict(config['plugins']['descriptor_set'],
                                      DescriptorSet.get_impls())

    log.info("Loading classifier configuration")
    #: :type: dict
    classifier_config = config['plugins']['supervised_classifier']

    # Always use in-memory ClassificationElement since we are retraining the
    # classifier and don't want possible element caching
    #: :type: ClassificationElementFactory
    classification_factory = ClassificationElementFactory(
        MemoryClassificationElement, {})

    log.info("Loading truth data")
    uuid_list: List[str] = []
    label_list: List[str] = []
    # ``newline=''`` is what the csv module documents for files handed to
    # ``csv.reader`` so that newlines inside quoted fields parse correctly.
    with open(config['cross_validation']['truth_labels'], newline='') as f:
        for row in csv.reader(f):
            # csv.reader yields an empty list for blank lines; skip them
            # instead of failing with an IndexError.
            if not row:
                continue
            uuid_list.append(row[0])
            label_list.append(row[1])
    # Arrays so that the fold index arrays can be used for fancy indexing.
    #: :type: numpy.ndarray[str]
    uuids = numpy.array(uuid_list)
    #: :type: numpy.ndarray[str]
    truth_labels = numpy.array(label_list)

    #
    # Cross validation
    #
    kfolds = sklearn.model_selection.StratifiedKFold(
        n_splits=config['cross_validation']['num_folds'],
        shuffle=True,
        random_state=config['cross_validation']['random_seed'],
    ).split(numpy.zeros(len(truth_labels)), truth_labels)

    # Truth and classification probability results for test data per fold.
    # Format:
    #   {
    #       0: {
    #           '<label>': {
    #               "truth": [...],   # Parallel truth and classification
    #               "proba": [...],   # probability values
    #           },
    #           ...
    #       },
    #       ...
    #   }
    fold_data: Dict[int, Any] = {}

    for i, (train, test) in enumerate(kfolds):
        log.info("Fold %d", i)
        log.info("-- %d training examples", len(train))
        log.info("-- %d test examples", len(test))
        fold_data[i] = {}

        log.info("-- creating classifier")
        classifier = cast(
            SupervisedClassifier,
            from_config_dict(classifier_config,
                             SupervisedClassifier.get_impls()))

        log.info("-- gathering descriptors")
        # Training descriptors grouped by their truth label.
        pos_map: Dict[str, List[DescriptorElement]] = {}
        for idx in train:
            pos_map.setdefault(truth_labels[idx], []).append(
                descriptor_set.get_descriptor(uuids[idx]))

        log.info("-- Training classifier")
        classifier.train(pos_map)

        log.info("-- Classifying test set")
        c_iter = classifier.classify_elements(
            (descriptor_set.get_descriptor(uuids[idx]) for idx in test),
            classification_factory,
        )
        uuid2c = {c.uuid: c.get_classification() for c in c_iter}

        log.info("-- Pairing truth and computed probabilities")
        # Only considering positive labels
        for t_label in pos_map:
            fold_data[i][t_label] = {
                "truth": [L == t_label for L in truth_labels[test]],
                "proba": [uuid2c[uuid][t_label] for uuid in uuids[test]]
            }

    #
    # Curve generation
    #
    if pr_enabled:
        make_pr_curves(fold_data, pr_output_dir, pr_file_prefix, pr_show)
    if roc_enabled:
        make_roc_curves(fold_data, roc_output_dir, roc_file_prefix, roc_show)