def jubatus_config(params):
    """
    Convert hyperopt config to Jubatus config.
    """
    if params['classifier_type'] == 'LinearClassifier':
        config = Config(method=params['linear_method'], parameter={
            'regularization_weight': params['regularization_weight']
        })
    elif params['classifier_type'] == 'NearestNeighbor':
        config = Config(method='NN', parameter={
            'method': params['nn_method'],
            'nearest_neighbor_num': int(params['nearest_neighbor_num']),
            'local_sensitivity': params['local_sensitivity'],
            'parameter': {
                'hash_num': int(params['hash_num'])
            }
        })
    else:
        raise NotImplementedError()
    return config
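# A minimal usage sketch (assumed, not from the original source): a hyperopt
# search space whose sampled dicts carry the keys that jubatus_config()
# expects.  The method names below are common Jubatus choices, not an
# exhaustive list.
from hyperopt import hp

space = hp.choice('classifier', [
    {
        'classifier_type': 'LinearClassifier',
        'linear_method': hp.choice('linear_method', ['AROW', 'CW', 'NHERD']),
        'regularization_weight': hp.loguniform('regularization_weight', -5, 2),
    },
    {
        'classifier_type': 'NearestNeighbor',
        'nn_method': hp.choice('nn_method', ['lsh', 'euclid_lsh', 'minhash']),
        'nearest_neighbor_num': hp.quniform('nearest_neighbor_num', 2, 100, 1),
        'local_sensitivity': hp.uniform('local_sensitivity', 0.01, 10.0),
        'hash_num': hp.quniform('hash_num', 8, 512, 8),
    },
])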
def test_method_param(self):
    self.assertTrue('parameter' not in Config(method='PA'))
    self.assertTrue('regularization_weight' in Config(
        method='PA1')['parameter'])
    self.assertTrue('nearest_neighbor_num' in Config(
        method='NN')['parameter'])
    self.assertTrue('nearest_neighbor_num' in Config(
        method='cosine')['parameter'])
    self.assertTrue('nearest_neighbor_num' in Config(
        method='euclidean')['parameter'])
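# An illustrative sketch (not from the original source), relying only on the
# dict-like Config behavior the assertions above exercise: inspect which
# parameters each method template exposes.
for method in ['PA', 'PA1', 'NN', 'cosine', 'euclidean']:
    cfg = Config(method=method)
    print(method, sorted(cfg.get('parameter', {}).keys()))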
    'Species': Schema.LABEL,
    'Sepal.Length': Schema.NUMBER,
    'Sepal.Width': Schema.NUMBER,
    'Petal.Length': Schema.NUMBER,
    'Petal.Width': Schema.NUMBER,
})

# Create a Dataset, an abstract representation of a set of data that can be
# fed to Services like Classifier.  `shuffle()` returns a new Dataset whose
# order of data is shuffled.  Note that Datasets are immutable objects.
dataset = Dataset(loader, schema).shuffle()

# Create a Classifier Service.
# The Classifier process starts using a default configuration.
cfg = Config.default()
classifier = Classifier.run(cfg)

# You can also connect to an existing service instead:
#classifier = Classifier('127.0.0.1', 9199)

# Train the classifier with every record in the dataset.
for (idx, label) in classifier.train(dataset):
    # You can peek at the datum being trained.
    print("Train: {0}".format(dataset[idx]))

# Save the trained model file.
print("Saving model file...")
classifier.save('example_snapshot')

# Classify using the same dataset.
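# (The classification loop itself is cut off in this excerpt; the loop below
# is a sketch of how it typically continues.  classify() yields
# (idx, label, result) tuples, where result is a list of (label, score)
# pairs with the best candidate first.)
for (idx, label, result) in classifier.classify(dataset):
    print("Classify: {0} (true: {1}, predicted: {2})".format(
        dataset[idx], label, result[0][0]))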
# Define a Schema that defines a type for each column of the CSV file.
schema = Schema({
    'family_name': Schema.LABEL,
    'first_name': Schema.STRING,
})

# Create the Datasets.
train_dataset = Dataset(train_loader, schema).shuffle()
test_dataset = Dataset(test_loader, schema)

# Create a Classifier Service.
cfg = Config(method='PA', converter={
    'string_rules': [{
        'key': 'first_name',
        'type': 'unigram',
        'sample_weight': 'bin',
        'global_weight': 'bin'
    }]
})
classifier = Classifier.run(cfg)

# Train the classifier.
for _ in classifier.train(train_dataset):
    pass

# Classify using the classifier.
for (idx, label, result) in classifier.classify(test_dataset):
    true_family_name = label
    pred_family_name = result[0][0]
    first_name = test_dataset.get(idx)['first_name']
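    # (Sketch, not from the original excerpt): conclude the loop body by
    # reporting whether the prediction matched the true label.
    print('{0} {1} ({2})'.format(
        first_name, pred_family_name,
        'correct' if true_family_name == pred_family_name else 'incorrect'))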
def setUp(self):
    self._service = Classifier.run(Config())
import sklearn.datasets
import sklearn.metrics

import jubakit
from jubakit.classifier import Classifier, Dataset, Config

# Load the digits dataset.
digits = sklearn.datasets.load_digits()

# Create a Dataset.
dataset = Dataset.from_array(digits.data, digits.target)
n_samples = len(dataset)
n_train_samples = int(n_samples / 2)

# Create a Classifier Service.
cfg = Config(method='AROW', parameter={'regularization_weight': 0.1})
classifier = Classifier.run(cfg)
print("Started Service: {0}".format(classifier))

# Train the classifier using the first half of the dataset.
train_ds = dataset[:n_train_samples]
print("Training...: {0}".format(train_ds))
for _ in classifier.train(train_ds):
    pass

# Test the classifier using the second half of the dataset.
test_ds = dataset[n_train_samples:]
print("Testing...: {0}".format(test_ds))
y_true = []
y_pred = []
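# (Assumed continuation, not in the original excerpt): fill y_true/y_pred
# from the classification results, stop the launched service, and print a
# scikit-learn report.
for (idx, label, result) in classifier.classify(test_ds):
    y_true.append(label)          # true label from the dataset
    y_pred.append(result[0][0])   # highest-scored predicted label

classifier.stop()

print(sklearn.metrics.classification_report(y_true, y_pred))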
def test_default(self):
    config = Config.default()
    self.assertEqual('AROW', config['method'])
X = scaler.fit_transform(X)

# Calculate the domain.
X_min = X.min(axis=0)
#X_min = np.ones(X.shape[1])
X_max = X.max(axis=0)
X0, X1 = np.meshgrid(np.linspace(X_min[0], X_max[0], meshsize),
                     np.linspace(X_min[1], X_max[1], meshsize))

# Make the training dataset.
dataset = Dataset.from_array(X, y)

# Make a mesh dataset to plot the decision surface.
contourf_dataset = Dataset.from_array(np.c_[X0.ravel(), X1.ravel()])

# Set up and run Jubatus.
config = Config(method=method,
                parameter={'regularization_weight': regularization_weight})
classifier = Classifier.run(config, port=port)

# Construct classifier prediction models and dump model weights.
for i, _ in enumerate(classifier.train(dataset)):
    model_name = 'decision_surface_{}'.format(i)
    classifier.save(name=model_name)

# Prepare the figure.
fig, ax = plt.subplots()

def draw_decision_surface(i):
    midx = int(i / 2)
    sidx = int(i / 2) + (i % 2)
    # Load the Jubatus prediction model.
def test_simple(self):
    config = Config()
    self.assertEqual('AROW', config['method'])
def test_methods(self):
    config = Config()
    self.assertTrue(isinstance(config.methods(), list))
    shuffle=True,
    random_state=0,  # fixed seed
)

# Convert the arrays into a jubakit Dataset.
dataset = Dataset.from_array(X, y)

# Try finding the best classifier parameter.
param2metrics = {}
for method in ['AROW', 'NHERD', 'CW']:
    for rw in [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]:
        print('Running ({0} / regularization_weight = {1})...'.format(
            method, rw))

        # Create a config data structure.
        jubatus_config = Config(method=method,
                                parameter={'regularization_weight': rw})

        # It is equivalent to:
        #jubatus_config = Config.default()
        #jubatus_config['method'] = method
        #jubatus_config['parameter']['regularization_weight'] = rw

        # Launch a Jubatus server using the specified configuration.
        classifier = Classifier.run(jubatus_config)

        # Train with the dataset.
        for _ in classifier.train(dataset):
            pass

        # Classify with the same dataset.
        y_true = []
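        y_pred = []

        # (Assumed continuation, not in the original excerpt, and assuming
        # sklearn.metrics is imported): collect predictions, stop the
        # launched server, and record a metric for this parameter set.
        for (idx, label, result) in classifier.classify(dataset):
            y_true.append(label)
            y_pred.append(result[0][0])
        classifier.stop()
        param2metrics['{0}/{1}'.format(method, rw)] = \
            sklearn.metrics.accuracy_score(y_true, y_pred)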
def test_embedded(self):
    classifier = Classifier.run(Config(), embedded=True)
import sklearn.metrics

from jubakit.classifier import Classifier, Schema, Dataset, Config
from jubakit.loader.csv import CSVLoader

# Load a CSV file.
loader = CSVLoader('iris.csv')

# Define a Schema that defines a type for each column of the CSV file.
schema = Schema({
    'Species': Schema.LABEL,
}, Schema.NUMBER)

# Display the Schema.
print('Schema: {0}'.format(schema))

# Create a Dataset.
dataset = Dataset(loader, schema).shuffle()
n_samples = len(dataset)
n_train_samples = int(n_samples / 2)

# Create a Classifier configuration.
cfg = Config()

# Bulk train-test the classifier.
result = Classifier.train_and_classify(
    cfg,
    dataset[:n_train_samples],
    dataset[n_train_samples:],
    sklearn.metrics.classification_report
)

print('---- Classification Report -----------------------------------')
print(result)
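# For reference, train_and_classify is roughly equivalent to driving the
# service by hand (a sketch under that assumption, not the library's
# documented internals):
classifier = Classifier.run(cfg)
for _ in classifier.train(dataset[:n_train_samples]):
    pass
y_true, y_pred = [], []
for (idx, label, result) in classifier.classify(dataset[n_train_samples:]):
    y_true.append(label)
    y_pred.append(result[0][0])
classifier.stop()
print(sklearn.metrics.classification_report(y_true, y_pred))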
# Load the built-in `iris` dataset from scikit-learn.
iris = sklearn.datasets.load_iris()

# Convert it into a jubakit Dataset.
#dataset = Dataset.from_array(iris.data, iris.target)

# ... or, optionally, assign feature/label names to improve human-readability.
dataset = Dataset.from_array(iris.data, iris.target,
                             iris.feature_names, iris.target_names)

# Shuffle the dataset, as it is sorted by label.
dataset = dataset.shuffle()

# Create a Classifier Service.
# The Classifier process starts using a default configuration.
classifier = Classifier.run(Config())

# Prepare arrays to keep true/predicted labels to display a report later.
true_labels = []
predicted_labels = []

# Run stratified K-fold validation.
labels = list(dataset.get_labels())
if sklearn_version < 18:
    train_test_indices = StratifiedKFold(labels, n_folds=10)
else:
    skf = StratifiedKFold(n_splits=10)
    train_test_indices = skf.split(labels, labels)

for train_idx, test_idx in train_test_indices:
    # Clear the classifier (call the `clear` RPC).
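    # (Sketch of the loop body; an assumed continuation, not in the original
    # excerpt.)  Reset the model, then train and test on this fold's split.
    classifier.clear()
    (train_ds, test_ds) = (dataset[train_idx], dataset[test_idx])
    for _ in classifier.train(train_ds):
        pass
    for (idx, label, result) in classifier.classify(test_ds):
        true_labels.append(label)
        predicted_labels.append(result[0][0])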
def setUp(self):
    self._service = Classifier.run(Config())
    self._sh = self._service._shell()