Example #1
  def test_invalid_get_labels(self):
    loader = StubLoader()
    schema = Schema({'v': Schema.LABEL})
    ds = Dataset(loader, schema, static=False)

    # get_labels returns a generator; since a generator is only evaluated
    # when actually iterated over, pass it to list() to trigger the error.
    self.assertRaises(RuntimeError, list, ds.get_labels())
Example #2
 def test_simple(self):
     loader = StubLoader()
     schema = Schema({'v': Schema.LABEL})
     ds = Dataset(loader, schema)
     for (idx, (label, d)) in ds:
         self.assertEqual(unicode_t(idx + 1), label)
         self.assertEqual(0, len(d.string_values))
         self.assertEqual(0, len(d.num_values))
         self.assertEqual(0, len(d.binary_values))
     self.assertEqual(['1', '2', '3'], list(ds.get_labels()))
Example #3
    def test_from_data(self):
        # load from array format
        ds = Dataset.from_data(
            [[10, 20, 30], [20, 10, 50], [40, 10, 30]],  # data
            [0, 1, 0],  # labels
            ['k1', 'k2', 'k3'],  # feature_names
            ['pos', 'neg'],  # label_names
        )

        expected_labels = ['pos', 'neg', 'pos']
        expected_k1s = [10, 20, 40]
        actual_labels = []
        actual_k1s = []
        for (idx, (label, d)) in ds:
            actual_labels.append(label)
            actual_k1s.append(dict(d.num_values)['k1'])

        self.assertEqual(expected_labels, actual_labels)
        self.assertEqual(expected_k1s, actual_k1s)

        # load from scipy.sparse format
        ds = Dataset.from_data(
            self._create_matrix(),  # data
            [0, 1, 0],  # labels
            ['k1', 'k2', 'k3'],  # feature_names
            ['pos', 'neg'],  # label_names
        )

        expected_labels = ['pos', 'neg', 'pos']
        expected_k1s = [1, None, 4]
        expected_k3s = [2, 3, 6]
        actual_labels = []
        actual_k1s = []
        actual_k3s = []
        for (idx, (label, d)) in ds:
            actual_labels.append(label)
            actual_k1s.append(dict(d.num_values).get('k1', None))
            actual_k3s.append(dict(d.num_values).get('k3', None))

        self.assertEqual(expected_labels, actual_labels)
        self.assertEqual(expected_k1s, actual_k1s)
        self.assertEqual(expected_k3s, actual_k3s)
Example #4
  def test_from_array(self):
    ds = Dataset.from_array(
        [ [10,20,30], [20,10,50], [40,10,30] ], # data
        [ 0,          1,          0          ], # labels
        ['k1', 'k2', 'k3'],                     # feature_names
        ['pos', 'neg'],                         # label_names
    )

    expected_labels = ['pos', 'neg', 'pos']
    expected_k1s = [10, 20, 40]
    actual_labels = []
    actual_k1s = []
    for (idx, (label, d)) in ds:
      actual_labels.append(label)
      actual_k1s.append(dict(d.num_values)['k1'])

    self.assertEqual(expected_labels, actual_labels)
    self.assertEqual(expected_k1s, actual_k1s)
Example #5
    def test_from_array_without_label(self):
        ds = Dataset.from_array(
            [[10, 20, 30], [20, 10, 50], [40, 10, 30]],  # data
            None,  # labels
            ['k1', 'k2', 'k3'],  # feature_names
            ['pos', 'neg'],  # label_names
        )

        expected_labels = [None, None, None]
        expected_k1s = [10, 20, 40]
        actual_labels = []
        actual_k1s = []
        for (idx, (label, d)) in ds:
            actual_labels.append(label)
            actual_k1s.append(dict(d.num_values)['k1'])

        self.assertEqual(expected_labels, actual_labels)
        self.assertEqual(expected_k1s, actual_k1s)
Example #6
    def test_from_matrix(self):
        ds = Dataset.from_matrix(
            self._create_matrix(),  # data
            [0, 1, 0],  # labels
            ['k1', 'k2', 'k3'],  # feature_names
            ['pos', 'neg'],  # label_names
        )

        expected_labels = ['pos', 'neg', 'pos']
        expected_k1s = [1, None, 4]
        expected_k3s = [2, 3, 6]
        actual_labels = []
        actual_k1s = []
        actual_k3s = []
        for (idx, (label, d)) in ds:
            actual_labels.append(label)
            actual_k1s.append(dict(d.num_values).get('k1', None))
            actual_k3s.append(dict(d.num_values).get('k3', None))

        self.assertEqual(expected_labels, actual_labels)
        self.assertEqual(expected_k1s, actual_k1s)
        self.assertEqual(expected_k3s, actual_k3s)
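
    # (Sketch of the `_create_matrix` helper these tests assume: a 3x3
    #  scipy.sparse matrix whose nonzero entries match the expected values
    #  above, i.e. row 0 = {k1: 1, k3: 2}, row 1 = {k3: 3},
    #  row 2 = {k1: 4, k2: 5, k3: 6}.)
    def _create_matrix(self):
        import numpy as np
        import scipy.sparse
        row = np.array([0, 0, 1, 2, 2, 2])
        col = np.array([0, 2, 2, 0, 1, 2])
        data = np.array([1, 2, 3, 4, 5, 6])
        return scipy.sparse.csr_matrix((data, (row, col)), shape=(3, 3))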
Example #7
le.fit(labels)
c = le.transform(y)

# scale the dataset to zero mean and unit variance
scaler = StandardScaler()
X = scaler.fit_transform(X)

# calculate the domain
X_min = X.min(axis=0)
#X_min = np.ones(X.shape[1])
X_max = X.max(axis=0)
X0, X1 = np.meshgrid(np.linspace(X_min[0], X_max[0], meshsize),
                     np.linspace(X_min[1], X_max[1], meshsize))

# make training dataset
dataset = Dataset.from_array(X, y)
# make mesh dataset to plot decision surface
contourf_dataset = Dataset.from_array(np.c_[X0.ravel(), X1.ravel()])

# setup and run jubatus
config = Config(method=method,
                parameter={'regularization_weight': regularization_weight})
classifier = Classifier.run(config, port=port)

# construct classifier prediction models and dump model weights
for i, _ in enumerate(classifier.train(dataset)):
    model_name = 'decision_surface_{}'.format(i)
    classifier.save(name=model_name)

# prepare figure
fig, ax = plt.subplots()
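
# (Illustrative continuation, not part of the original snippet: classify the
#  mesh points and draw the decision surface. Assumes predicted labels come
#  back as numeric strings, so they are cast to float before reshaping.)
Z = np.array([float(result[0][0])
              for (_, _, result) in classifier.classify(contourf_dataset)])
ax.contourf(X0, X1, Z.reshape(X0.shape), alpha=0.4)
ax.scatter(X[:, 0], X[:, 1], c=c)
plt.show()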
Example #8
 def test_get_labels(self):
     loader = StubLoader()
     schema = Schema({'v': Schema.LABEL})
     ds = Dataset(loader, schema)
     self.assertEqual(['1', '2', '3'], list(ds.get_labels()))
Example #9
 def test_predict(self):
     loader = StubLoader()
     dataset = Dataset(loader)  # no schema; values become plain features (prediction mode)
     self.assertEqual(['v', 1.0], dataset[0][1].num_values[0])
Example #10
from jubakit.classifier import Classifier, Schema, Dataset, Config
from jubakit.loader.csv import CSVLoader

# Load the shogun dataset.
train_loader = CSVLoader('shogun.train.csv')
test_loader = CSVLoader('shogun.test.csv')

# Define a Schema that specifies the type of each column of the CSV file.
schema = Schema({
  'family_name': Schema.LABEL,
  'first_name': Schema.STRING,
})

# Create a Dataset.
train_dataset = Dataset(train_loader, schema).shuffle()
test_dataset = Dataset(test_loader, schema)

# Create a Classifier Service.
cfg = Config(
  method = 'PA',
  converter = {
    'string_rules': [{'key': 'first_name', 'type': 'unigram', 'sample_weight': 'bin', 'global_weight': 'bin'}]
  }
)
classifier = Classifier.run(cfg)

# Train the classifier.
for _ in classifier.train(train_dataset): pass

# Classify using the classifier.
for (idx, label, result) in classifier.classify(test_dataset):
  # (Minimal sketch of the loop body; `result` is a list of (label, score)
  #  pairs with the best score first.)
  print('predicted: {0} (true: {1})'.format(result[0][0], label))
Example #11
from sklearn.datasets import load_svmlight_files
import sklearn.metrics

import jubakit
from jubakit.classifier import Classifier, Dataset, Config

# Load LIBSVM files.
# Note that these example files are not included in this repository.
# You can fetch them from: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass.html#news20
print("Loading LIBSVM files...")
(train_X, train_y, test_X,
 test_y) = load_svmlight_files(['news20', 'news20.t'])

# Create a Train Dataset.
print("Creating train dataset...")
train_ds = Dataset.from_matrix(train_X, train_y)

# Create a Test Dataset
print("Creating test dataset...")
test_ds = Dataset.from_matrix(test_X, test_y)

# Create a Classifier Service
classifier = Classifier.run(Config())

# Train the classifier.
print("Training...")
for (idx, _) in classifier.train(train_ds):
    if idx % 1000 == 0:
        print("Training... ({0} %)".format(100 * idx / len(train_ds)))

# Test the classifier.
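# (Minimal sketch of the omitted test phase, following the classify() usage
#  in the other examples; `y_true` and `y_pred` are illustrative names.)
print("Testing...")
y_true = []
y_pred = []
for (idx, label, result) in classifier.classify(test_ds):
    y_true.append(label)
    y_pred.append(result[0][0])  # the top-scored label is the prediction

# Print the accuracy report.
print(sklearn.metrics.classification_report(y_true, y_pred))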
Example #12
===================================================

In this example we show classification using the Digits dataset.
"""

import sklearn.datasets
import sklearn.metrics

import jubakit
from jubakit.classifier import Classifier, Dataset, Config

# Load the digits dataset.
digits = sklearn.datasets.load_digits()

# Create a Dataset.
dataset = Dataset.from_array(digits.data, digits.target)
n_samples = len(dataset)
n_train_samples = int(n_samples / 2)

# Create a Classifier Service
cfg = Config(method='AROW', parameter={'regularization_weight': 0.1})
classifier = Classifier.run(cfg)

print("Started Service: {0}".format(classifier))

# Train the classifier using the first half of the dataset.
train_ds = dataset[:n_train_samples]
print("Training...: {0}".format(train_ds))
for _ in classifier.train(train_ds):
    pass
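
# Test the classifier using the last half of the dataset.
# (Sketch of the omitted remainder; it mirrors the training half above.)
test_ds = dataset[n_train_samples:]
print("Testing...: {0}".format(test_ds))
y_true = []
y_pred = []
for (idx, label, result) in classifier.classify(test_ds):
    y_true.append(label)
    y_pred.append(result[0][0])  # the top-scored label is the prediction

# Print the accuracy report.
print(sklearn.metrics.classification_report(y_true, y_pred))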
0
 def test_get_labels(self):
   loader = StubLoader()
   schema = Schema({'v': Schema.LABEL})
   ds = Dataset(loader, schema)
   self.assertEqual(['1', '2', '3'], list(ds.get_labels()))
Example #13
import sklearn.metrics

import jubakit
from jubakit.classifier import Classifier, Schema, Dataset, Config
from jubakit.loader.csv import CSVLoader

# Set up the logger to see what's going on in jubakit.
jubakit.logger.setup_logger(jubakit.logger.INFO)

# Load a CSV file.
loader = CSVLoader('iris.csv')

# Define a Schema that specifies the type of each column of the CSV file.
schema = Schema({
  'Species': Schema.LABEL,
}, Schema.NUMBER)

# Display Schema
print('Schema: {0}'.format(schema))

# Create a Dataset.
dataset = Dataset(loader, schema).shuffle()
n_samples = len(dataset)
n_train_samples = int(n_samples / 2)

# Create a Classifier configuration.
cfg = Config()

# Bulk train-test the classifier.
result = Classifier.train_and_classify(
  cfg,
  dataset[:n_train_samples],
  dataset[n_train_samples:],
  sklearn.metrics.classification_report
)

print('---- Classification Report -----------------------------------')
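# (The value returned by train_and_classify is the output of the metric
#  passed above, i.e. the classification report; printing it is assumed.)
print(result)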
Example #14
  n_redundant=2,
  n_repeated=0,
  n_classes=2,
  n_clusters_per_class=2,
  weights=None,
  flip_y=0.01,
  class_sep=1.0,
  hypercube=True,
  shift=0.0,
  scale=1.0,
  shuffle=True,
  random_state=0,  # fixed seed
)

# Convert arrays into jubakit Dataset.
dataset = Dataset.from_array(X, y)

# Try finding the best classifier parameter.
param2metrics = {}
for method in ['AROW', 'NHERD', 'CW']:
  for rw in [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]:
    print('Running ({0} / regularization_weight = {1})...'.format(method, rw))

    # Create a config data structure.
    jubatus_config = Config(method=method, parameter={'regularization_weight': rw})

    # It is equivalent to:
    #jubatus_config = Config.default()
    #jubatus_config['method'] = method
    #jubatus_config['parameter']['regularization_weight'] = rw
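
    # (Illustrative continuation, not part of the original snippet: evaluate
    #  this configuration with a simple holdout split and record the score.
    #  Assumes `Classifier` and `sklearn.metrics` are imported in the elided
    #  header.)
    n_train = int(len(dataset) * 0.8)
    score = Classifier.train_and_classify(
      jubatus_config,
      dataset[:n_train],
      dataset[n_train:],
      sklearn.metrics.accuracy_score,
    )
    param2metrics['{0} (RW = {1})'.format(method, rw)] = score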
Example #15
import sklearn
import sklearn.datasets

from jubakit.classifier import Classifier, Dataset, Config

# Switch the StratifiedKFold import depending on the scikit-learn version.
# (Note: parsing the minor version assumes 0.x-style version numbering.)
sklearn_version = int(sklearn.__version__.split('.')[1])
if sklearn_version < 18:
    from sklearn.cross_validation import StratifiedKFold
else:
    from sklearn.model_selection import StratifiedKFold

# Load built-in `iris` dataset from scikit-learn.
iris = sklearn.datasets.load_iris()

# Convert it into jubakit Dataset.
#dataset = Dataset.from_array(iris.data, iris.target)
# ... or, optionally, assign feature/label names to improve human-readability.
dataset = Dataset.from_array(iris.data, iris.target, iris.feature_names,
                             iris.target_names)

# Shuffle the dataset, as the dataset is sorted by label.
dataset = dataset.shuffle()

# Create a Classifier Service.
# Classifier process starts using a default configuration.
classifier = Classifier.run(Config())

# Prepare arrays to keep true/predicted labels to display a report later.
true_labels = []
predicted_labels = []

# Run stratified K-fold validation.
labels = list(dataset.get_labels())
if sklearn_version < 18:
    # Old API: the labels and the number of folds go to the constructor.
    # (`folds` is an illustrative name; this completion is a sketch.)
    folds = StratifiedKFold(labels, n_folds=10)
else:
    # New API: construct the splitter, then call split() for fold indices.
    folds = StratifiedKFold(n_splits=10).split(labels, labels)
Example #16
        '.user.lang': Schema.STRING,
        '.user.description': Schema.STRING,
    }, Schema.IGNORE)

# Create a Classifier Service.
classifier = Classifier.run(Config())

# Number of tweets used for training.
n_train = 1000

print('---- Train: {0} tweets -------------------------------------'.format(
    n_train))

# Train the classifier using tweets from Twitter stream.
trained_labels = set()
dataset = Dataset(get_loader(), schema)
for (idx, label) in classifier.train(dataset):
    if idx == n_train: break

    trained_labels.add(label)
    text_summary = dataset.get(idx)['.text'].replace('\n', '')
    print('Train[{0}]: language {1}  >> {2}'.format(idx, label, text_summary))

print('Languages Trained: {0}'.format(str(trained_labels)))

print('---- Prediction (Ctrl-C to stop) -------------------------------------')

try:
    # Classify tweets using the classifier.
    (y_true, y_pred) = ([], [])
    dataset = Dataset(get_loader(), schema)
    # (Minimal completion sketch: record the true vs. predicted language;
    #  the top-scored label is the prediction.)
    for (idx, label, result) in classifier.classify(dataset):
        y_true.append(label)
        y_pred.append(result[0][0])
except KeyboardInterrupt:
    pass