Example #1
def jubatus_config(params):
    """
  convert hyperopt config to jubatus config
  """
    if params['classifier_type'] == 'LinearClassifier':
        config = Config(method=params['linear_method'],
                        parameter={
                            'regularization_weight':
                            params['regularization_weight']
                        })

    elif params['classifier_type'] == 'NearestNeighbor':
        config = Config(method='NN',
                        parameter={
                            'method':
                            params['nn_method'],
                            'nearest_neighbor_num':
                            int(params['nearest_neighbor_num']),
                            'local_sensitivity':
                            params['local_sensitivity'],
                            'parameter': {
                                'hash_num': int(params['hash_num'])
                            }
                        })

    else:
        raise NotImplementedError()

    return config
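
# Usage sketch: a hypothetical hyperopt sample whose keys match what
# jubatus_config() reads; the values here are illustrative only.
sample_params = {
    'classifier_type': 'LinearClassifier',
    'linear_method': 'AROW',
    'regularization_weight': 0.1,
}
config = jubatus_config(sample_params)  # -> Config(method='AROW', ...)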
Example #2
def test_method_param(self):
    self.assertTrue('parameter' not in Config(method='PA'))
    self.assertTrue('regularization_weight' in Config(
        method='PA1')['parameter'])
    self.assertTrue('nearest_neighbor_num' in Config(
        method='NN')['parameter'])
    self.assertTrue('nearest_neighbor_num' in Config(
        method='cosine')['parameter'])
    self.assertTrue('nearest_neighbor_num' in Config(
        method='euclidean')['parameter'])
Example #3
# Load a CSV file (filename assumed from Example #14, which uses the same
# iris data).
loader = CSVLoader('iris.csv')

# Define a Schema that defines the type of each column of the CSV file.
schema = Schema({
  'Species': Schema.LABEL,
  'Sepal.Length': Schema.NUMBER,
  'Sepal.Width': Schema.NUMBER,
  'Petal.Length': Schema.NUMBER,
  'Petal.Width': Schema.NUMBER,
})

# Create a Dataset, which is an abstract representation of a set of data
# that can be fed to Services like Classifier.  `shuffle()` returns a new
# Dataset whose order of data is shuffled.  Note that datasets are immutable
# objects.
dataset = Dataset(loader, schema).shuffle()

# Create a Classifier Service.
# Classifier process starts using a default configuration.
cfg = Config.default()
classifier = Classifier.run(cfg)

# You can also connect to an existing service instead.
#classifier = Classifier('127.0.0.1', 9199)

# Train the classifier with every data in the dataset.
for (idx, label) in classifier.train(dataset):
  # You can peek the datum being trained.
  print("Train: {0}".format(dataset[idx]))

# Save the trained model file.
print("Saving model file...")
classifier.save('example_snapshot')

# Classify using the same dataset.
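# (Continuation sketch; the original snippet is cut off here. The loop
# follows the classify pattern shown in Example #4.)
for (idx, label, result) in classifier.classify(dataset):
  # `result` holds (label, score) pairs; the first entry is the prediction.
  print("Classify: {0} (true: {1}, predicted: {2})".format(
      dataset[idx], label, result[0][0]))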
Example #4
# Define a Schema that defines the type of each column of the CSV file.
schema = Schema({
    'family_name': Schema.LABEL,
    'first_name': Schema.STRING,
})

# Create a Dataset.
train_dataset = Dataset(train_loader, schema).shuffle()
test_dataset = Dataset(test_loader, schema)

# Create a Classifier Service.
cfg = Config(method='PA',
             converter={
                 'string_rules': [{
                     'key': 'first_name',
                     'type': 'unigram',
                     'sample_weight': 'bin',
                     'global_weight': 'bin'
                 }]
             })
classifier = Classifier.run(cfg)

# Train the classifier.
for _ in classifier.train(train_dataset):
    pass

# Classify using the classifier.
for (idx, label, result) in classifier.classify(test_dataset):
    true_family_name = label
    pred_family_name = result[0][0]
    first_name = test_dataset.get(idx)['first_name']
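    # (Continuation sketch; the original snippet is cut off here. This just
    # reports each prediction next to the true label.)
    print('{0}: predicted={1}, true={2}'.format(
        first_name, pred_family_name, true_family_name))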
Example #5
def setUp(self):
    self._service = Classifier.run(Config())
Example #6
import sklearn.datasets
import sklearn.metrics

import jubakit
from jubakit.classifier import Classifier, Dataset, Config

# Load the digits dataset.
digits = sklearn.datasets.load_digits()

# Create a Dataset.
dataset = Dataset.from_array(digits.data, digits.target)
n_samples = len(dataset)
n_train_samples = int(n_samples / 2)

# Create a Classifier Service.
cfg = Config(method='AROW', parameter={'regularization_weight': 0.1})
classifier = Classifier.run(cfg)

print("Started Service: {0}".format(classifier))

# Train the classifier using the first half of the dataset.
train_ds = dataset[:n_train_samples]
print("Training...: {0}".format(train_ds))
for _ in classifier.train(train_ds):
    pass

# Test the classifier using the last half of the dataset.
test_ds = dataset[n_train_samples:]
print("Testing...: {0}".format(test_ds))
y_true = []
y_pred = []
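
# (Continuation sketch; the original snippet is cut off here. The loop
# follows the classify pattern of Example #4 and prints a scikit-learn
# classification report.)
for (idx, label, result) in classifier.classify(test_ds):
    y_true.append(label)
    y_pred.append(result[0][0])

print(sklearn.metrics.classification_report(y_true, y_pred))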
Example #7
def test_default(self):
    config = Config.default()
    self.assertEqual('AROW', config['method'])
Example #8
X = scaler.fit_transform(X)

# calculate the domain
X_min = X.min(axis=0)
#X_min = np.ones(X.shape[1])
X_max = X.max(axis=0)
X0, X1 = np.meshgrid(np.linspace(X_min[0], X_max[0], meshsize),
                     np.linspace(X_min[1], X_max[1], meshsize))

# make training dataset
dataset = Dataset.from_array(X, y)
# make mesh dataset to plot decision surface
contourf_dataset = Dataset.from_array(np.c_[X0.ravel(), X1.ravel()])

# setup and run jubatus
config = Config(method=method,
                parameter={'regularization_weight': regularization_weight})
classifier = Classifier.run(config, port=port)

# construct classifier prediction models and dump model weights
for i, _ in enumerate(classifier.train(dataset)):
    model_name = 'decision_surface_{}'.format(i)
    classifier.save(name=model_name)

# prepare figure
fig, ax = plt.subplots()


def draw_decision_surface(i):
    midx = int(i / 2)
    sidx = int(i / 2) + (i % 2)
    # load jubatus prediction model
Example #9
def test_simple(self):
    config = Config()
    self.assertEqual('AROW', config['method'])
Example #10
def test_default(self):
    config = Config.default()
    self.assertEqual('AROW', config['method'])
Example #11
def test_methods(self):
    config = Config()
    self.assertTrue(isinstance(config.methods(), list))
Example #12
    shuffle=True,
    random_state=0,  # fixed seed
)

# Convert arrays into jubakit Dataset.
dataset = Dataset.from_array(X, y)

# Try finding the best classifier parameter.
param2metrics = {}
for method in ['AROW', 'NHERD', 'CW']:
    for rw in [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]:
        print('Running ({0} / regularization_weight = {1})...'.format(
            method, rw))

        # Create a config data structure.
        jubatus_config = Config(method=method,
                                parameter={'regularization_weight': rw})

        # It is equivalent to:
        #jubatus_config = Config.default()
        #jubatus_config['method'] = method
        #jubatus_config['parameter']['regularization_weight'] = rw

        # Launch Jubatus server using the specified configuration.
        classifier = Classifier.run(jubatus_config)

        # Train with the dataset.
        for _ in classifier.train(dataset):
            pass

        # Classify with the same dataset.
        y_true = []
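        y_pred = []

        # (Continuation sketch; the original snippet is cut off here.
        # Assuming sklearn.metrics is imported as in Example #6, record the
        # accuracy achieved by this parameter pair and stop the service.)
        for (idx, label, result) in classifier.classify(dataset):
            y_true.append(label)
            y_pred.append(result[0][0])
        classifier.stop()
        param2metrics[(method, rw)] = sklearn.metrics.accuracy_score(
            y_true, y_pred)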
Example #13
def test_embedded(self):
    classifier = Classifier.run(Config(), embedded=True)
Example #14
# Load a CSV file.
loader = CSVLoader('iris.csv')

# Define a Schema that defines the type of each column of the CSV file.
schema = Schema({
  'Species': Schema.LABEL,
}, Schema.NUMBER)

# Display Schema
print('Schema: {0}'.format(schema))

# Create a Dataset.
dataset = Dataset(loader, schema).shuffle()
n_samples = len(dataset)
n_train_samples = int(n_samples / 2)

# Create a Classifier configuration.
cfg = Config()

# Bulk train-test the classifier.
result = Classifier.train_and_classify(
  cfg,
  dataset[:n_train_samples],
  dataset[n_train_samples:],
  sklearn.metrics.classification_report
)

print('---- Classification Report -----------------------------------')
print(result)
Example #15
def test_methods(self):
    config = Config()
    self.assertTrue(isinstance(config.methods(), list))
Example #16
# Load built-in `iris` dataset from scikit-learn.
iris = sklearn.datasets.load_iris()

# Convert it into jubakit Dataset.
#dataset = Dataset.from_array(iris.data, iris.target)
# ... or, optionally, you can assign feature/label names to improve human-readability.
dataset = Dataset.from_array(iris.data, iris.target, iris.feature_names,
                             iris.target_names)

# Shuffle the dataset, as the dataset is sorted by label.
dataset = dataset.shuffle()

# Create a Classifier Service.
# Classifier process starts using a default configuration.
classifier = Classifier.run(Config())

# Prepare arrays to keep true/predicted labels to display a report later.
true_labels = []
predicted_labels = []

# Run stratified K-fold validation.
labels = list(dataset.get_labels())
if sklearn_version < 18:
    train_test_indices = StratifiedKFold(labels, n_folds=10)
else:
    skf = StratifiedKFold(n_splits=10)
    train_test_indices = skf.split(labels, labels)

for train_idx, test_idx in train_test_indices:
    # Clear the classifier (call `clear` RPC).
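    classifier.clear()

    # (Continuation sketch; the original snippet is cut off here. Train on
    # the train split and classify the test split, as in the other examples.)
    for _ in classifier.train(dataset[train_idx]):
        pass
    for (idx, label, result) in classifier.classify(dataset[test_idx]):
        true_labels.append(label)
        predicted_labels.append(result[0][0])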
Example #17
def setUp(self):
    self._service = Classifier.run(Config())
    self._sh = self._service._shell()