def test_cp932(self):
    with TempFile() as f:
        f.write("テスト1,テスト2".encode('cp932'))
        f.flush()
        loader = CSVLoader(f.name, None, 'cp932')
        for row in loader:
            self.assertEqual('テスト1', row['c0'])
            self.assertEqual('テスト2', row['c1'])
def test_restkey(self):
    with TempFile() as f:
        f.write("テスト1,テスト2\nテスト1,テスト2".encode('utf-8'))
        f.flush()
        loader = CSVLoader(f.name, ['c1'], 'utf-8', restkey='garbage')
        for row in loader:
            self.assertEqual('テスト1', row['c1'])
            self.assertEqual(['テスト2'], row['garbage'])
def test_restval(self):
    with TempFile() as f:
        f.write("テスト1,テスト2\nテスト1,テスト2".encode('utf-8'))
        f.flush()
        loader = CSVLoader(f.name, ['c1', 'c2', 'c3'], 'utf-8', restval='<blank>')
        for row in loader:
            self.assertEqual('テスト1', row['c1'])
            self.assertEqual('テスト2', row['c2'])
            self.assertEqual('<blank>', row['c3'])
def test_unicode_separator(self):
    with TempFile() as f:
        f.write("v1★v2\ns1★s2\n".encode('utf-8'))
        f.flush()
        loader = CSVLoader(f.name, delimiter='★')
        lines = 0
        for row in loader:
            lines += 1
            self.assertEqual('s1', row['v1'])
            self.assertEqual('s2', row['v2'])
        self.assertEqual(1, lines)
def test_cp932_seq_fieldnames(self):
    with TempFile() as f:
        f.write("テスト1,テスト2\nテスト1,テスト2".encode('cp932'))
        f.flush()
        # assign sequential field names
        loader = CSVLoader(f.name, False, 'cp932', delimiter=',')
        lines = 0
        for row in loader:
            lines += 1
            self.assertEqual('テスト1', row['c0'])
            self.assertEqual('テスト2', row['c1'])
        self.assertEqual(2, lines)
def test_cp932(self): with TempFile() as f: f.write("列1,列2\nテスト1,テスト2\n".encode('cp932')) f.flush() # predict field names from 1st row loader = CSVLoader(f.name, None, 'cp932', delimiter=',') lines = 0 for row in loader: lines += 1 self.assertEqual('テスト1', row['列1']) self.assertEqual('テスト2', row['列2']) self.assertEqual(1, lines)
def test_cp932_manual_fieldnames(self):
    with TempFile() as f:
        f.write("テスト1,テスト2\nテスト1,テスト2".encode('cp932'))
        f.flush()
        # assign field names statically
        loader = CSVLoader(f.name, ['列1', '列2'], 'cp932', delimiter=',')
        lines = 0
        for row in loader:
            lines += 1
            self.assertEqual('テスト1', row['列1'])
            self.assertEqual('テスト2', row['列2'])
        self.assertEqual(2, lines)
def test_simple(self):
    with TempFile() as f:
        f.write("k1,\"k2\",k3\n1,2,3\n4,5,6".encode('utf-8'))
        f.flush()
        loader = CSVLoader(f.name)
        for row in loader:
            self.assertEqual(set(['k1', 'k2', 'k3']), set(row.keys()))
            if row['k1'] == '1':
                self.assertEqual('2', row['k2'])
                self.assertEqual('3', row['k3'])
            elif row['k1'] == '4':
                self.assertEqual('5', row['k2'])
                self.assertEqual('6', row['k3'])
            else:
                self.fail('unexpected row')
def test_noheader(self):
    with TempFile() as f:
        f.write("1,\"2\",3\n\"4\",5,\"6\"".encode('utf-8'))
        f.flush()
        loader = CSVLoader(f.name, False)
        for row in loader:
            self.assertEqual(set(['c0', 'c1', 'c2']), set(row.keys()))
            if row['c0'] == '1':
                self.assertEqual('2', row['c1'])
                self.assertEqual('3', row['c2'])
            elif row['c0'] == '4':
                self.assertEqual('5', row['c1'])
                self.assertEqual('6', row['c2'])
            else:
                self.fail('unexpected row')
def test_guess_header(self):
    with TempFile() as f:
        f.write("k1|k2|k3\n1|2|3".encode())
        f.flush()
        loader = CSVLoader(f.name, fieldnames=True, delimiter='|')
        self.assertEqual([{'k1': '1', 'k2': '2', 'k3': '3'}], list(loader))
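# A minimal standalone sketch of the pattern the tests above exercise, using
# the standard library's tempfile module in place of the suite's TempFile
# helper. The CSVLoader arguments follow the calls seen in the tests (header
# row predicted by default); verify the details against your jubakit version.
import tempfile

from jubakit.loader.csv import CSVLoader

with tempfile.NamedTemporaryFile(suffix='.csv') as f:
    f.write('k1,k2\nv1,v2\n'.encode('utf-8'))
    f.flush()
    for row in CSVLoader(f.name):
        print(row)  # expected: {'k1': 'v1', 'k2': 'v2'}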
This example uses the bulk train-test method of Classifier.
"""

import sklearn.metrics

from jubakit.classifier import Classifier, Schema, Dataset, Config
from jubakit.loader.csv import CSVLoader
import jubakit.logger

# In this example, we enable the logging mechanism to show
# what's going on in jubakit.
jubakit.logger.setup_logger(jubakit.logger.INFO)

# Load a CSV file.
loader = CSVLoader('iris.csv')

# Define a Schema that defines types for each column of the CSV file.
schema = Schema({
    'Species': Schema.LABEL,
}, Schema.NUMBER)

# Display the Schema.
print('Schema: {0}'.format(schema))

# Create a Dataset.
dataset = Dataset(loader, schema).shuffle()
n_samples = len(dataset)
n_train_samples = int(n_samples / 2)

# Create a Classifier configuration.
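# --- Hedged continuation (not part of the original snippet) -----------------
# The snippet above stops at the configuration step. A minimal sketch of how
# the bulk train-test could proceed, assuming the default Config() and the
# Classifier.train_and_classify() helper, and that Dataset supports slice
# indexing; verify both against your jubakit version.
cfg = Config()

# Train on the first half of the shuffled dataset, classify the second half,
# and summarize the predictions with scikit-learn's classification report.
result = Classifier.train_and_classify(
    cfg,
    dataset[:n_train_samples],
    dataset[n_train_samples:],
    sklearn.metrics.classification_report,
)
print(result)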
We use the AAAI 2014 Accepted Papers Data Set:

    Lichman, M. (2013). UCI Machine Learning Repository
    [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
    School of Information and Computer Science.

Please download the CSV file from the following URL and rename it to "aaai.csv":
https://archive.ics.uci.edu/ml/datasets/AAAI+2014+Accepted+Papers
"""

from jubakit.nearest_neighbor import NearestNeighbor, Schema, Dataset, Config
from jubakit.loader.csv import CSVLoader

# Load a CSV file.
loader = CSVLoader('aaai.csv')

# Define a Schema that defines types for each column of the CSV file.
# In this example, we use "abstract" and "keyword" to calculate neighbor scores.
schema = Schema({
    'title': Schema.ID,
    'abstract': Schema.STRING,
    'keyword': Schema.STRING,
}, Schema.IGNORE)
print('Schema:', schema)

# Create a Dataset.
dataset = Dataset(loader, schema)

# Create a nearest neighbor configuration.
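# --- Hedged continuation (not part of the original snippet) -----------------
# A sketch of one way the example could continue. The method names mirror the
# Jubatus nearest_neighbor RPC API (set_row / neighbor_row_from_id) as
# presumably wrapped by jubakit, and the yielded tuples follow the
# (idx, row_id, result) convention seen in the other services; treat all of
# this as assumptions and verify against your jubakit version.
cfg = Config(method='lsh')
nn = NearestNeighbor.run(cfg)

# Register every row of the dataset with the nearest neighbor model.
for (idx, row_id, success) in nn.set_row(dataset):
    pass

# For each paper, look up similar papers by row ID.
for (idx, row_id, result) in nn.neighbor_row_from_id(dataset, size=4):
    print(row_id, result)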
# -*- coding: utf-8 -*- from __future__ import absolute_import, division, print_function, unicode_literals """ Using Classifier and String Features ======================================== This is a famous `shogun` classifier example that predicts family name of Shogun from his first name. """ from jubakit.classifier import Classifier, Schema, Dataset, Config from jubakit.loader.csv import CSVLoader # Load the shogun dataset. train_loader = CSVLoader('shogun.train.csv') test_loader = CSVLoader('shogun.test.csv') # Define a Schema that defines types for each columns of the CSV file. schema = Schema({ 'family_name': Schema.LABEL, 'first_name': Schema.STRING, }) # Create a Dataset. train_dataset = Dataset(train_loader, schema).shuffle() test_dataset = Dataset(test_loader, schema) # Create a Classifier Service. cfg = Config(method='PA', converter={
# -*- coding: utf-8 -*- from __future__ import absolute_import, division, print_function, unicode_literals """ Using Weight Service ==================== This example illustrates how to use Weight engine to debug fv_converter behavior (i.e. `converter` section of the config file). """ from jubakit.weight import Weight, Schema, Dataset, Config from jubakit.loader.csv import CSVLoader # Load a CSV file. loader = CSVLoader('shogun.train.csv') # Create a Dataset; schema will be auto-predicted. dataset = Dataset(loader) # Create a Weight Service. cfg = Config() weight = Weight.run(cfg) # Show extracted features. As we use `update` method, weights are # updated incrementally. print('==== Features (online TF-IDF) ========================') for (idx, result) in weight.update(dataset): print('Raw Data:') print('\tfamily_name: {0}'.format(dataset.get(idx)['family_name'])) print('\tfirst_name: {0}'.format(dataset.get(idx)['first_name']))
# -*- coding: utf-8 -*- from __future__ import absolute_import, division, print_function, unicode_literals """ Using Clustering ======================================== This is a simple example that illustrates Clustering service usage. """ from jubakit.clustering import Clustering, Schema, Dataset, Config from jubakit.loader.csv import CSVLoader # Load a CSV file. loader = CSVLoader('blobs.csv') # Define a Schema that defines types for each columns of the CSV file. schema = Schema({ 'cluster': Schema.ID, }, Schema.NUMBER) # Create a Dataset. dataset = Dataset(loader, schema) # Create an Clustering Service. cfg = Config(method='kmeans') clustering = Clustering.run(cfg) # Update the Clustering model. for (idx, row_id, result) in clustering.push(dataset):
========================================

This is a simple example that illustrates:

* How to load CSV files and convert them into Jubakit datasets.
* How to register keywords with the Burst client using the keyword dataset.
* How to add documents to the Burst client using the document dataset.
* How to get Burst results.
"""

from jubakit.burst import KeywordSchema, KeywordDataset
from jubakit.burst import DocumentSchema, DocumentDataset
from jubakit.burst import Burst, Config
from jubakit.loader.csv import CSVLoader

keyword_loader = CSVLoader('burst_keywords.csv')
keyword_schema = KeywordSchema({
    'keyword': KeywordSchema.KEYWORD,
    'scaling': KeywordSchema.SCALING,
    'gamma': KeywordSchema.GAMMA,
})
keyword_dataset = KeywordDataset(keyword_loader, keyword_schema)

document_loader = CSVLoader('burst_documents.csv')
document_schema = DocumentSchema({
    'position': DocumentSchema.POSITION,
    'text': DocumentSchema.TEXT,
})
document_dataset = DocumentDataset(document_loader, document_schema)

burst = Burst.run(Config())
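# --- Hedged continuation (not part of the original snippet) -----------------
# A sketch of the steps promised in the docstring: register keywords, add
# documents, then fetch results. The method names mirror the Jubatus burst RPC
# API (add_keyword / add_documents / get_all_bursted_results) as presumably
# wrapped by jubakit, and consuming them as generators is an assumption;
# verify against your jubakit version.
for _ in burst.add_keyword(keyword_dataset):
    pass
for _ in burst.add_documents(document_dataset):
    pass

# Show the burst detection results for all registered keywords.
print(burst.get_all_bursted_results())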
Using Regression and CSV file
==================================================

This is a simple example that illustrates:

* How to load CSV files and convert them into a Jubakit dataset.
* Training the regression using the dataset.
* Getting regression results.
"""

import numpy as np

from jubakit.regression import Regression, Schema, Dataset, Config
from jubakit.loader.csv import CSVLoader

# Load a CSV file.
loader = CSVLoader('wine.csv')

# Define a Schema that defines types for each column of the CSV file.
schema = Schema({
    'quality': Schema.TARGET,
}, Schema.NUMBER)

# Create a Dataset.
dataset = Dataset(loader, schema).shuffle()
n_samples = len(dataset)
n_train_samples = int(n_samples * 0.75)

# Create a Regression Service.
cfg = Config.default()
regression = Regression.run(cfg)
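# --- Hedged continuation (not part of the original snippet) -----------------
# A sketch of the train/estimate flow. It assumes Dataset supports slice
# indexing and that the Regression service exposes train() and estimate()
# generators, the latter yielding (idx, target, result) tuples; verify both
# against your jubakit version.
train_dataset = dataset[:n_train_samples]
test_dataset = dataset[n_train_samples:]

# Train with the training split (the generator must be consumed).
for _ in regression.train(train_dataset):
    pass

# Estimate targets for the test split and compute the mean absolute error.
y_true, y_pred = [], []
for (idx, target, result) in regression.estimate(test_dataset):
    y_true.append(target)
    y_pred.append(result)
print('MAE: {0}'.format(np.mean(np.abs(np.array(y_true) - np.array(y_pred)))))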
# -*- coding: utf-8 -*- from __future__ import absolute_import, division, print_function, unicode_literals """ Using Recommender ======================================== This is a simple example that illustrates Recommender service usage. """ from jubakit.recommender import Recommender, Schema, Dataset, Config from jubakit.loader.csv import CSVLoader # Load a CSV file. loader = CSVLoader('npb.csv') # Define a Schema that defines types for each columns of the CSV file. schema = Schema({ 'name': Schema.ID, 'team': Schema.STRING, }, Schema.NUMBER) # Create a Dataset. dataset = Dataset(loader, schema) # Create an Recommender Service. cfg = Config(method='lsh') recommender = Recommender.run(cfg) # Update the Recommender model.