Example #1
 def test_cp932(self):
     with TempFile() as f:
         f.write("テスト1,テスト2".encode('cp932'))
         f.flush()
         loader = CSVLoader(f.name, None, 'cp932')
         for row in loader:
             self.assertEqual('テスト1', row['c0'])
             self.assertEqual('テスト2', row['c1'])
Example #2
 def test_restkey(self):
   with TempFile() as f:
     f.write("テスト1,テスト2\nテスト1,テスト2".encode('utf-8'))
     f.flush()
     loader = CSVLoader(f.name, ['c1'], 'utf-8', restkey='garbage')
     for row in loader:
       self.assertEqual('テスト1', row['c1'])
       self.assertEqual(['テスト2'], row['garbage'])
Example #3
 def test_restval(self):
   with TempFile() as f:
     f.write("テスト1,テスト2\nテスト1,テスト2".encode('utf-8'))
     f.flush()
     loader = CSVLoader(f.name, ['c1', 'c2', 'c3'], 'utf-8', restval='<blank>')
     for row in loader:
       self.assertEqual('テスト1', row['c1'])
       self.assertEqual('テスト2', row['c2'])
       self.assertEqual('<blank>', row['c3'])
Example #4
 def test_unicode_separator(self):
   with TempFile() as f:
     f.write("v1★v2\ns1★s2\n".encode('utf-8'))
     f.flush()
     loader = CSVLoader(f.name, delimiter='★')
     lines = 0
     for row in loader:
       lines += 1
       self.assertEqual('s1', row['v1'])
       self.assertEqual('s2', row['v2'])
     self.assertEqual(1, lines)
Example #5
 def test_cp932_seq_fieldnames(self):
   with TempFile() as f:
     f.write("テスト1,テスト2\nテスト1,テスト2".encode('cp932'))
     f.flush()
     # assign sequential field names
     loader = CSVLoader(f.name, False, 'cp932', delimiter=',')
     lines = 0
     for row in loader:
       lines += 1
       self.assertEqual('テスト1', row['c0'])
       self.assertEqual('テスト2', row['c1'])
     self.assertEqual(2, lines)
Example #6
 def test_cp932(self):
   with TempFile() as f:
     f.write("列1,列2\nテスト1,テスト2\n".encode('cp932'))
     f.flush()
     # predict field names from 1st row
     loader = CSVLoader(f.name, None, 'cp932', delimiter=',')
     lines = 0
     for row in loader:
       lines += 1
       self.assertEqual('テスト1', row['列1'])
       self.assertEqual('テスト2', row['列2'])
     self.assertEqual(1, lines)
Example #7
 def test_cp932_manual_fieldnames(self):
   with TempFile() as f:
     f.write("テスト1,テスト2\nテスト1,テスト2".encode('cp932'))
     f.flush()
     # assign field names statically
     loader = CSVLoader(f.name, ['列1', '列2'], 'cp932', delimiter=',')
     lines = 0
     for row in loader:
       lines += 1
       self.assertEqual('テスト1', row['列1'])
       self.assertEqual('テスト2', row['列2'])
     self.assertEqual(2, lines)
Example #8
 def test_simple(self):
     with TempFile() as f:
         f.write("k1,\"k2\",k3\n1,2,3\n4,5,6".encode('utf-8'))
         f.flush()
         loader = CSVLoader(f.name)
         for row in loader:
             self.assertEqual(set(['k1', 'k2', 'k3']), set(row.keys()))
             if row['k1'] == '1':
                 self.assertEqual('2', row['k2'])
                 self.assertEqual('3', row['k3'])
             elif row['k1'] == '4':
                 self.assertEqual('5', row['k2'])
                 self.assertEqual('6', row['k3'])
             else:
                 self.fail('unexpected row')
Example #9
 def test_noheader(self):
     with TempFile() as f:
         f.write("1,\"2\",3\n\"4\",5,\"6\"".encode('utf-8'))
         f.flush()
         loader = CSVLoader(f.name, False)
         for row in loader:
             self.assertEqual(set(['c0', 'c1', 'c2']), set(row.keys()))
             if row['c0'] == '1':
                 self.assertEqual('2', row['c1'])
                 self.assertEqual('3', row['c2'])
             elif row['c0'] == '4':
                 self.assertEqual('5', row['c1'])
                 self.assertEqual('6', row['c2'])
             else:
                 self.fail('unexpected row')
Example #10
 def test_guess_header(self):
   with TempFile() as f:
     f.write("k1|k2|k3\n1|2|3".encode())
     f.flush()
     loader = CSVLoader(f.name, fieldnames=True, delimiter='|')
     self.assertEqual([{'k1': '1', 'k2': '2', 'k3': '3'}], list(loader))
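Taken together, the tests above cover the whole constructor surface: the second argument selects how field names are assigned (an explicit list, False for sequential names c0, c1, ..., or True/None to take them from the header row; the snippets come from different revisions, so the exact behavior of None varies between them), the third argument is the file encoding, and keyword arguments such as delimiter, restkey and restval mirror Python's csv.DictReader options. A minimal sketch combining these options; 'data.csv' is a hypothetical file:

loader = CSVLoader(
    'data.csv',          # path to the CSV file
    ['c1', 'c2', 'c3'],  # field names; True/False/None also work
    'utf-8',             # file encoding
    delimiter=',',       # forwarded to the underlying csv reader
    restkey='garbage',   # collects extra values from overlong rows
    restval='<blank>',   # fills missing values in short rows
)
for row in loader:
    print(row['c1'], row['c2'], row['c3'])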
Example #11
This example uses the bulk train-test method of Classifier.
"""

import sklearn.metrics

from jubakit.classifier import Classifier, Schema, Dataset, Config
from jubakit.loader.csv import CSVLoader
import jubakit.logger

# In this example, we enable the logging mechanism to show you
# what's going on in jubakit.
jubakit.logger.setup_logger(jubakit.logger.INFO)

# Load a CSV file.
loader = CSVLoader('iris.csv')

# Define a Schema that defines types for each column of the CSV file.
schema = Schema({
  'Species': Schema.LABEL,
}, Schema.NUMBER)

# Display the Schema.
print('Schema: {0}'.format(schema))

# Create a Dataset.
dataset = Dataset(loader, schema).shuffle()
n_samples = len(dataset)
n_train_samples = int(n_samples / 2)

# Create a Classifier configuration.
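The snippet breaks off at the configuration step. A minimal sketch of how the bulk train-test call might continue, assuming the Classifier.train_and_classify helper that the docstring alludes to (the 'AROW' method name and the exact signature are assumptions; check the jubakit examples for the real code):

cfg = Config(method='AROW')

# Bulk train-test: train on the first half of the shuffled dataset,
# classify the second half, and score it with a scikit-learn metric.
result = Classifier.train_and_classify(
    cfg,
    dataset[:n_train_samples],
    dataset[n_train_samples:],
    sklearn.metrics.accuracy_score,
)
print('Accuracy: {0}'.format(result))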
Example #12
We use the AAAI 2014 Accepted Papers Data Set.

    Lichman, M. (2013).
    UCI Machine Learning Repository [http://archive.ics.uci.edu/ml].
    Irvine, CA: University of California, School of Information and
    Computer Science.

Please download the CSV file from the URL below and rename it to "aaai.csv":
https://archive.ics.uci.edu/ml/datasets/AAAI+2014+Accepted+Papers
"""

from jubakit.nearest_neighbor import NearestNeighbor, Schema, Dataset, Config
from jubakit.loader.csv import CSVLoader

# Load a CSV file.
loader = CSVLoader('aaai.csv')

# Define a Schema that defines types for each column of the CSV file.
# In this example, we use "abstract" and "keyword" to calculate neighbor scores.
schema = Schema(
    {
        'title': Schema.ID,
        'abstract': Schema.STRING,
        'keyword': Schema.STRING
    }, Schema.IGNORE)
print('Schema:', schema)

# Create a Dataset.
dataset = Dataset(loader, schema)

# Create a nearest neighbor configuration.
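The snippet stops at the configuration comment. Following the Config/run pattern used by the other services in this collection, a minimal continuation might look like this (the 'lsh' method name is an assumption borrowed from the recommender example below):

cfg = Config(method='lsh')
nearest_neighbor = NearestNeighbor.run(cfg)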
Example #13
# -*- coding: utf-8 -*-

from __future__ import absolute_import, division, print_function, unicode_literals
"""
Using Classifier and String Features
========================================

This is the famous `shogun` classifier example that predicts the family
name of a shogun from his first name.
"""

from jubakit.classifier import Classifier, Schema, Dataset, Config
from jubakit.loader.csv import CSVLoader

# Load the shogun dataset.
train_loader = CSVLoader('shogun.train.csv')
test_loader = CSVLoader('shogun.test.csv')

# Define a Schema that defines types for each column of the CSV file.
schema = Schema({
    'family_name': Schema.LABEL,
    'first_name': Schema.STRING,
})

# Create a Dataset.
train_dataset = Dataset(train_loader, schema).shuffle()
test_dataset = Dataset(test_loader, schema)

# Create a Classifier Service.
cfg = Config(method='PA',
             converter={
Example #14
# -*- coding: utf-8 -*-

from __future__ import absolute_import, division, print_function, unicode_literals
"""
Using Weight Service
====================

This example illustrates how to use Weight engine to debug fv_converter
behavior (i.e. `converter` section of the config file).
"""

from jubakit.weight import Weight, Schema, Dataset, Config
from jubakit.loader.csv import CSVLoader

# Load a CSV file.
loader = CSVLoader('shogun.train.csv')

# Create a Dataset; schema will be auto-predicted.
dataset = Dataset(loader)

# Create a Weight Service.
cfg = Config()
weight = Weight.run(cfg)

# Show extracted features.  As we use the `update` method, weights are
# updated incrementally.
print('==== Features (online TF-IDF) ========================')
for (idx, result) in weight.update(dataset):
    print('Raw Data:')
    print('\tfamily_name: {0}'.format(dataset.get(idx)['family_name']))
    print('\tfirst_name: {0}'.format(dataset.get(idx)['first_name']))
Example #15
# -*- coding: utf-8 -*-

from __future__ import absolute_import, division, print_function, unicode_literals
"""
Using Clustering
========================================

This is a simple example that illustrates basic usage of the Clustering service.

"""

from jubakit.clustering import Clustering, Schema, Dataset, Config
from jubakit.loader.csv import CSVLoader

# Load a CSV file.
loader = CSVLoader('blobs.csv')

# Define a Schema that defines types for each column of the CSV file.
schema = Schema({
    'cluster': Schema.ID,
}, Schema.NUMBER)

# Create a Dataset.
dataset = Dataset(loader, schema)

# Create a Clustering Service.
cfg = Config(method='kmeans')
clustering = Clustering.run(cfg)

# Update the Clustering model.
for (idx, row_id, result) in clustering.push(dataset):
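    # A minimal loop body (an assumption; the original example may do
    # more here): consuming the generator is what pushes each row.
    pass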
Example #16
Using Burst
========================================

This is a simple example that illustrates:

* How to load CSV files and convert them into Jubakit datasets.
* How to register keywords with the burst client using the keyword dataset.
* How to add documents to the burst client using the document dataset.
* How to get the burst result (sketched after the snippet below).
"""

from jubakit.burst import KeywordSchema, KeywordDataset
from jubakit.burst import DocumentSchema, DocumentDataset
from jubakit.burst import Burst, Config
from jubakit.loader.csv import CSVLoader

keyword_loader = CSVLoader('burst_keywords.csv')
keyword_schema = KeywordSchema({
    'keyword': KeywordSchema.KEYWORD,
    'scaling': KeywordSchema.SCALING,
    'gamma': KeywordSchema.GAMMA
})
keyword_dataset = KeywordDataset(keyword_loader, keyword_schema)

document_loader = CSVLoader('burst_documents.csv')
document_schema = DocumentSchema({
    'position': DocumentSchema.POSITION,
    'text': DocumentSchema.TEXT
})
document_dataset = DocumentDataset(document_loader, document_schema)

burst = Burst.run(Config())
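A rough sketch of the remaining steps listed in the docstring. The method names below mirror the Jubatus burst RPC (add_keyword, add_documents, get_all_bursted_results) and are assumptions about the jubakit wrappers; verify them against the jubakit.burst API:

# Register keywords, then feed documents to the burst client.
for _ in burst.add_keyword(keyword_dataset):
    pass
for _ in burst.add_documents(document_dataset):
    pass

# Fetch burst detection results for all registered keywords.
print(burst.get_all_bursted_results())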
Example #17
Using Regression and CSV file
==================================================

This is a simple example that illustrates:

* How to load a CSV file and convert it into a Jubakit dataset.
* How to train the regression service using the dataset.
* How to get regression results (sketched after the snippet below).
"""

import numpy as np
from jubakit.regression import Regression, Schema, Dataset, Config
from jubakit.loader.csv import CSVLoader

# Load a CSV file.
loader = CSVLoader('wine.csv')

# Define a Schema that defines types for each column of the CSV file.
schema = Schema({
    'quality': Schema.TARGET,
}, Schema.NUMBER)

# Create a Dataset.
dataset = Dataset(loader, schema).shuffle()
n_samples = len(dataset)
n_train_samples = int(n_samples * 0.75)

# Create a Regression Service.
cfg = Config.default()
regression = Regression.run(cfg)
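The snippet ends as the service starts. A minimal sketch of the training and estimation steps, assuming Regression exposes `train` and `estimate` generators in the same style as the Classifier service (the yield signature below is also an assumption):

# Train on the first 75% of the shuffled dataset.
for _ in regression.train(dataset[:n_train_samples]):
    pass

# Estimate targets for the held-out 25% and measure the error.
y_true, y_pred = [], []
for (idx, target, result) in regression.estimate(dataset[n_train_samples:]):
    y_true.append(target)
    y_pred.append(result)
print('MAE: {0}'.format(np.mean(np.abs(np.array(y_true) - np.array(y_pred)))))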
Example #18
# -*- coding: utf-8 -*-

from __future__ import absolute_import, division, print_function, unicode_literals
"""
Using Recommender
========================================

This is a simple example that illustrates basic usage of the Recommender service.

"""

from jubakit.recommender import Recommender, Schema, Dataset, Config
from jubakit.loader.csv import CSVLoader

# Load a CSV file.
loader = CSVLoader('npb.csv')

# Define a Schema that defines types for each column of the CSV file.
schema = Schema({
    'name': Schema.ID,
    'team': Schema.STRING,
}, Schema.NUMBER)

# Create a Dataset.
dataset = Dataset(loader, schema)

# Create a Recommender Service.
cfg = Config(method='lsh')
recommender = Recommender.run(cfg)

# Update the Recommender model.
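The snippet ends at the update comment. A minimal continuation, assuming an `update_row` generator in the style of the other jubakit services (the method name is an assumption; check the jubakit recommender example for the exact API):

for (idx, row_id) in recommender.update_row(dataset):
    pass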