def evaluate(X_train, X_test, y_train, y_test, n_trials=4):
    """Benchmark Jubatus and scikit-learn linear classifiers.

    Each classifier is trained ``n_trials`` times on a reshuffled copy of
    the training data; per-method test accuracies are accumulated and
    averaged.

    Args:
        X_train, X_test: iterables of text messages.
        y_train, y_test: corresponding labels.
        n_trials: number of shuffled training runs to average over.

    Returns:
        dict: method name -> mean test accuracy over ``n_trials`` runs.
    """
    jubatus_methods = ['perceptron', 'PA', 'PA1', 'PA2', 'CW', 'AROW', 'NHERD']
    sklearn_methods = [
        'Perceptron(sk)', 'PA1(sk)', 'PA2(sk)', 'LSVM(sk)', 'LR(sk)'
    ]
    results = dict.fromkeys(jubatus_methods + sklearn_methods, 0)
    vectorizer = TfidfVectorizer()
    # The Jubatus test data never changes between trials; build it once.
    test_data = [Datum({'message': xi}) for xi in X_test]
    for i in range(n_trials):
        # BUG FIX: the original passed random_state=42 here, so every trial
        # shuffled the data identically and averaging over n_trials was a
        # no-op.  Seed with the trial index: runs differ but the whole
        # evaluation stays reproducible.
        X_train, y_train = shuffle(X_train, y_train, random_state=i)
        vec_X_train = vectorizer.fit_transform(X_train)
        vec_X_test = vectorizer.transform(X_test)
        # Training data depends only on the shuffle, not on the method;
        # build it once per trial instead of once per method.
        train_data = [(yi, Datum({'message': xi}))
                      for (xi, yi) in zip(X_train, y_train)]
        for method in jubatus_methods:
            clf = linear_classifier(method=method)
            clf.train(train_data)
            predictions = clf.classify(test_data)
            # Pick the highest-scoring label for each test message.
            y_pred = [max(pred, key=lambda x: x.score).label
                      for pred in predictions]
            test_score = accuracy_score(y_test, y_pred)
            print('{0:.3f}\t{1}'.format(test_score, method))
            results[method] += test_score
        for method in sklearn_methods:
            clf = sklearn_linear_classifier(method=method)
            clf.fit(vec_X_train, y_train)
            test_score = accuracy_score(y_test, clf.predict(vec_X_test))
            print('{0:.3f}\t{1}'.format(test_score, method))
            results[method] += test_score
    results = {k: v / n_trials for k, v in results.items()}
    return results
def test_get_nearest_center(self):
    """Nearest-center query over 100 pushed points yields a Datum."""
    points = [
        IndexedPoint(str(n), Datum({"nkey1": n, "nkey2": -n}))
        for n in range(100)
    ]
    for point in points:
        self.cli.push([point])
    query = Datum({"nkey1": 2.0, "nkey2": 1.0})
    center = self.cli.get_nearest_center(query)
    self.assertTrue(isinstance(center, Datum))
def test_loadsave(self):
    """A model written by save() can be restored by load() and reused."""
    clf = Classifier(CONFIG)
    clf.train([
        LabeledDatum('Y', Datum({'x': 'y'})),
        LabeledDatum('N', Datum({'x': 'n'})),
    ])
    path = '/tmp/127.0.0.1_0_classifier_hoge.jubatus'

    def _remove_model():
        # Best-effort cleanup: the model file may not exist yet.
        try:
            os.remove(path)
        except Exception:
            pass

    _remove_model()
    try:
        self.assertEqual(
            {'127.0.0.1_0': '/tmp/127.0.0.1_0_classifier_hoge.jubatus'},
            clf.save('hoge'))
        self.assertTrue(os.path.isfile(path))
        clf = Classifier(CONFIG)
        self.assertTrue(clf.load('hoge'))
        result = clf.classify([Datum({'x': 'y'}), Datum({'x': 'n'})])
        top_labels = [max(candidates, key=lambda c: c.score).label
                      for candidates in result]
        self.assertEqual(['Y', 'N'], top_labels)
    finally:
        _remove_model()
def test(self):
    """Feature extraction via update()/calc_weight() plus save/load."""
    w = Weight(CONFIG)
    d = Datum({'n0': 1, 'n1': 2, 'n2': 3, 't0': 'hello world'})
    for features in [w.update(d), w.calc_weight(d)]:
        self.assertEqual(5, len(features))
        for feature in features:
            self.assertTrue(isinstance(feature, Feature))
        weights = {f.key: f.value for f in features}
        self.assertEqual(5, len(weights))
        self.assertEqual(1.0, weights['n0@num'])
        self.assertAlmostEqual(math.log(2), weights['n1@log'])
        self.assertEqual(1.0, weights['n2@str$3'])
    # Feed a few documents so idf-style weights become non-trivial.
    w.update(Datum({'t1': 'hello world'}))
    w.update(Datum({'t1': 'foo bar'}))
    w.update(Datum({'t1': 'hello'}))
    d = Datum({'t1': 'foo bar hello world hoge'})
    before = {f.key: f.value for f in w.calc_weight(d)}
    model = w.save_bytes()
    w = Weight(CONFIG)
    w.load_bytes(model)
    self.assertEqual(CONFIG, json.loads(w.get_config()))
    after = {f.key: f.value for f in w.calc_weight(d)}
    # Weights must survive the serialization round trip unchanged.
    self.assertEqual(before, after)
def test_num(self):
    """Numeric-feature training, label bookkeeping, and save/load."""
    clf = Classifier(CONFIG)
    self.assertEqual(2, clf.train([
        ('Y', Datum({'x': 1})),
        ('N', Datum({'x': -1})),
    ]))

    def _test_classify(target):
        result = target.classify([Datum({'x': 1}), Datum({'x': -1})])
        top = [max(cands, key=lambda c: c.score).label for cands in result]
        self.assertEqual(['Y', 'N'], top)
        self.assertEqual(target.get_labels(), {'N': 1, 'Y': 1})

    _test_classify(clf)
    model = clf.save_bytes()
    clf.clear()
    # clear() wipes both the model and the label registry.
    self.assertEqual({}, clf.get_labels())
    clf.set_label('Y')
    clf.set_label('N')
    self.assertEqual({'N': 0, 'Y': 0}, clf.get_labels())
    clf.delete_label(u'Y')
    self.assertEqual({'N': 0}, clf.get_labels())
    clf = Classifier(CONFIG)
    clf.load_bytes(model)
    _test_classify(clf)
    self.assertEqual(CONFIG, json.loads(clf.get_config()))
def test_get_nearest_members_light(self):
    """Light nearest-member query returns WeightedIndex entries."""
    for idx in range(100):
        point = IndexedPoint(str(idx), Datum({"nkey1": idx, "nkey2": -idx}))
        self.cli.push([point])
    query = Datum({"nkey1": 2.0, "nkey2": 1.0})
    members = self.cli.get_nearest_members_light(query)
    self.assertTrue(isinstance(members[0], WeightedIndex))
def test():
    """Smoke-test the embedded Recommender API end to end."""
    def _valid_result(row):
        # A completed row must be a Datum carrying both 'x' and 'y'.
        assert isinstance(row, Datum)
        values = dict(row.num_values)
        assert values.get('x', None) is not None and values.get('y', None) is not None

    rec = Recommender(RECOMMENDER_CONFIG)
    rec.update_row('0', Datum({'x': 0.9, 'y': 4.9}))
    rec.update_row('1', Datum({'x': 1, 'y': 5}))
    rec.update_row('2', Datum({'x': 1.15, 'y': 5.15}))
    rec.update_row('3', Datum({'x': 1.2, 'y': 5.1}))
    rec.update_row('4', Datum({'x': 1.05}))
    _valid_result(rec.complete_row_from_datum(Datum({'x': 1.1})))
    _valid_result(rec.complete_row_from_id('4'))
    similar = rec.similar_row_from_id('2', 3)
    assert isinstance(similar, list)
    assert isinstance(similar[0], IdWithScore)
    similar = rec.similar_row_from_datum(Datum({'y': 5.05}), 3)
    assert isinstance(similar, list)
    assert isinstance(similar[0], IdWithScore)
    _valid_result(rec.decode_row('0'))
    assert set(rec.get_all_rows()) == {str(i) for i in range(5)}
    score = rec.calc_similarity(Datum({'x': 1}), Datum({'y': 5}))
    assert isinstance(score, float)
    norm = rec.calc_l2norm(Datum({'x': 1, 'y': 5}))
    assert isinstance(norm, float)
    rec.clear()
    assert len(rec.get_all_rows()) == 0
    assert json.loads(rec.get_config())
    # Only checks that dump/load do not raise.
    model = rec.dump()
    rec.load(model)
def _test_classify(x):
    """Assert *x* classifies +1/-1 as Y/N and reports one example per label."""
    result = x.classify([Datum({'x': 1}), Datum({'x': -1})])
    top = [max(candidates, key=lambda c: c.score).label for candidates in result]
    self.assertEqual(['Y', 'N'], top)
    self.assertEqual(x.get_labels(), {'N': 1, 'Y': 1})
def test_add_string(self):
    """add_string() matches dict construction for str and unicode args."""
    # Serialized form of the dict-constructed reference datum.
    expected = Datum({'key': 'value'}).to_msgpack()
    built = Datum()
    built.add_string('key', 'value')
    self.assertEqual(expected, built.to_msgpack())
    built = Datum()
    built.add_string(u('key'), u('value'))
    self.assertEqual(expected, built.to_msgpack())
def test(self):
    """Regression train/estimate plus a save/load round trip."""
    reg = Regression(CONFIG)
    # Scores 0..4 paired with x = 1, 2, 4, 8, 16 (powers of two).
    self.assertEqual(5, reg.train([
        ScoredDatum(float(i), Datum({'x': float(2 ** i)}))
        for i in range(5)
    ]))
    ret = reg.estimate([
        Datum({'x': 32.0}),
        Datum({'x': 1.5}),
    ])
    self.assertEqual(2, len(ret))
    self.assertTrue(8.0 <= ret[0] < 9.0)
    self.assertTrue(0.0 <= ret[1] < 1.0)
    self.assertEqual(CONFIG, json.loads(reg.get_config()))
    model = reg.save_bytes()
    reg = Regression(CONFIG)
    reg.load_bytes(model)
    # A restored model must reproduce the same estimates.
    self.assertEqual(ret, reg.estimate([
        Datum({'x': 32.0}),
        Datum({'x': 1.5}),
    ]))
def test(self):
    """Regression train/estimate, save/load, and get_status() shape."""
    reg = Regression(CONFIG)
    # Scores 0..4 paired with x = 1, 2, 4, 8, 16 (powers of two).
    self.assertEqual(5, reg.train([
        ScoredDatum(float(i), Datum({'x': float(2 ** i)}))
        for i in range(5)
    ]))
    ret = reg.estimate([
        Datum({'x': 32.0}),
        Datum({'x': 1.5}),
    ])
    self.assertEqual(2, len(ret))
    self.assertTrue(8.0 <= ret[0] < 9.0)
    self.assertTrue(0.0 <= ret[1] < 1.0)
    self.assertEqual(CONFIG, json.loads(reg.get_config()))
    model = reg.save_bytes()
    reg = Regression(CONFIG)
    reg.load_bytes(model)
    # A restored model must reproduce the same estimates.
    self.assertEqual(ret, reg.estimate([
        Datum({'x': 32.0}),
        Datum({'x': 1.5}),
    ]))
    status = reg.get_status()
    self.assertTrue(isinstance(status, dict))
    # Embedded engines expose exactly one status entry.
    self.assertEqual(['embedded'], list(status.keys()))
    self.assertTrue(isinstance(status['embedded'], dict))
def test_classifier_str():
    """Unicode-labelled string features classify back to their labels."""
    clf = Classifier(CLASSIFIER_CONFIG)
    clf.train([
        (u'Y', Datum({'x': u'y'})),
        (u'N', Datum({'x': u'n'})),
    ])
    result = clf.classify([Datum({'x': 'y'}), Datum({'x': 'n'})])
    top = [max(candidates, key=lambda c: c.score).label for candidates in result]
    assert top == ['Y', 'N']
def test_str(self):
    """String features train and classify back to their labels."""
    clf = Classifier(CONFIG)
    self.assertEqual(2, clf.train([
        ('Y', Datum({'x': 'y'})),
        ('N', Datum({'x': 'n'})),
    ]))
    result = clf.classify([Datum({'x': 'y'}), Datum({'x': 'n'})])
    top = [max(candidates, key=lambda c: c.score).label for candidates in result]
    self.assertEqual(['Y', 'N'], top)
def test_types(self):
    """classify() yields EstimateResult objects with the right labels."""
    clf = Classifier(CONFIG)
    clf.train([
        LabeledDatum('Y', Datum({'x': 'y'})),
        LabeledDatum('N', Datum({'x': 'n'})),
    ])
    result = clf.classify([Datum({'x': 'y'}), Datum({'x': 'n'})])
    self.assertTrue(isinstance(result[0][0], EstimateResult))
    top = [max(candidates, key=lambda c: c.score).label for candidates in result]
    self.assertEqual(['Y', 'N'], top)
def predict(client):
    """Classify three first names and emit '<shogun> <name>' per line."""
    # predict the last shogun
    for datum in (Datum({'name': '慶喜'}),
                  Datum({'name': '義昭'}),
                  Datum({'name': '守時'})):
        res = client.classify([datum])
        # Label of the highest-scoring candidate is the predicted shogun.
        shogun_name = max(res[0], key=lambda c: c.score).label
        first_name = datum.string_values[0][1]
        _output('{0} {1}\n'.format(shogun_name, first_name))
def test():
    """Smoke-test the embedded Clustering API with two obvious clusters."""
    clu = Clustering(CLUSTERING_CONFIG)
    assert clu.get_revision() == 0
    # Two tight groups around 1.0 and 5.0.
    assert clu.push([Datum({'x': v})
                     for v in (1.0, 0.9, 1.1, 5.0, 4.9, 5.1)])
    assert clu.get_revision() == 1
    centers = clu.get_k_center()
    assert isinstance(centers, list) and len(centers) == 2
    assert isinstance(centers[0], Datum)
    members = clu.get_core_members()
    assert isinstance(members, list) and len(members) == 2
    assert isinstance(members[0], list)
    assert isinstance(members[0][0], WeightedDatum)
    center = clu.get_nearest_center(Datum({'x': 1.05}))
    assert isinstance(center, Datum)
    # The nearest center to 1.05 must lie inside the first group.
    assert 0.9 <= center.num_values[0][1] <= 1.1
    near = clu.get_nearest_members(Datum({'x': 1.05}))
    assert isinstance(near, list)
    assert isinstance(near[0], WeightedDatum)
    assert json.loads(clu.get_config())
    # Only checks that dump/load do not raise.
    model = clu.dump()
    clu.load(model)
def predict(client):
    """Classify three first names and print '<shogun> <name>' per line.

    NOTE(review): ``sys.stdout.write`` is handed ``bytes`` (the result of
    ``.encode('utf-8')``), which only works on Python 2; under Python 3
    this raises TypeError.  Confirm the intended runtime before changing.
    """
    # predict the last shogun
    data = [
        Datum({'name': u'慶喜'}),
        Datum({'name': u'義昭'}),
        Datum({'name': u'守時'}),
    ]
    for d in data:
        res = client.classify([d])
        # get the predicted shogun name (highest-scoring candidate)
        sys.stdout.write(max(res[0], key=lambda x: x.score).label)
        sys.stdout.write(' ')
        sys.stdout.write(d.string_values[0][1].encode('utf-8'))
        sys.stdout.write('\n')
def test_decode_row(self):
    """decode_row() returns the same string/num values that were stored."""
    self.cli.clear_row("decode_row")
    original = Datum({"skey1": "val1", "skey2": "val2",
                      "nkey1": 1.0, "nkey2": 2.0})
    self.cli.update_row("decode_row", original)
    decoded = self.cli.decode_row("decode_row")
    # Compare JSON renderings so ordering/typing differences surface.
    self.assertEqual(json.dumps(original.string_values),
                     json.dumps(decoded.string_values))
    self.assertEqual(json.dumps(original.num_values),
                     json.dumps(decoded.num_values))
def test_pack(self):
    """to_msgpack() serializes as (string_values, num_values, binaries)."""
    expected = ([['name', 'Taro']], [['age', 20.0]], [])
    actual = Datum({'name': 'Taro', 'age': 20}).to_msgpack()
    self.assertEqual(msgpack.packb(expected), msgpack.packb(actual))
def test_get_k_center(self):
    """get_k_center() returns 10 Datum centers for 100 pushed points."""
    for idx in range(100):
        point = IndexedPoint(str(idx), Datum({"nkey1": idx, "nkey2": -idx}))
        self.cli.push([point])
    centers = self.cli.get_k_center()
    self.assertEqual(len(centers), 10)
    self.assertTrue(isinstance(centers[0], Datum))
def test_get_core_members_light(self):
    """get_core_members_light() returns per-cluster WeightedIndex lists."""
    for idx in range(100):
        point = IndexedPoint(str(idx), Datum({"nkey1": idx, "nkey2": -idx}))
        self.cli.push([point])
    clusters = self.cli.get_core_members_light()
    self.assertEqual(len(clusters), 10)
    self.assertTrue(isinstance(clusters[0][0], WeightedIndex))
def predict(self, client):
    """Classify every value fetched from Mongo and return the last label.

    NOTE(review): only the result of the final datum survives the loop,
    and an empty input makes the trailing lookup raise KeyError --
    presumably the caller guarantees at least one record.
    """
    getpre = preMongo()
    dic_pre = getpre.getDic()
    data = [Datum({'Value': dic_pre[line]['Value']}) for line in dic_pre]
    predict_result = {}
    for d in data:
        res = client.classify([d])
        value_str = str(d.num_values[0][1])
        label = max(res[0], key=lambda c: c.score).label
        # Each iteration overwrites the previous entry.
        predict_result.update({'Result': label, 'Value': value_str})
    return predict_result['Result']
def add_data(num=10):
    """Generate *num* 3-D points and feed each to the anomaly server."""
    data = generate_data(num)
    client = jubatus.Anomaly(HOST, PORT_P, NAME)
    for d in data:
        point = Datum({"x": d[0], "y": d[1], "z": d[2]})
        added = client.add(point)
        print('Added {0}, score = {1}'.format(added.id, added.score))
def on_status(self, status):
    """Train the classifier on a geotagged tweet's de-hashtagged text."""
    # Skip statuses missing text or usable coordinates.
    if not hasattr(status, 'text'):
        return
    if not hasattr(status, 'coordinates'):
        return
    if not status.coordinates or 'coordinates' not in status.coordinates:
        return
    coordinates = status.coordinates['coordinates']
    # Find the first configured location containing this point.
    loc = next((candidate for candidate in self.locations
                if candidate.is_inside(coordinates[0], coordinates[1])),
               None)
    if not loc:
        # Unknown location
        return
    hashtags = status.entities['hashtags']
    detagged_text = remove_hashtags_from_tweet(status.text, hashtags)
    # Create datum for Jubatus
    d = Datum({'text': detagged_text})
    # Send training data to Jubatus
    self.classifier.train([(loc.name, d)])
    # Print trained tweet
    print_green(loc.name, ' ')
    print(detagged_text)
def make_datum(row, args):
    """Build a Datum from a 6-field paper row (title plus, optionally,
    the abstract when ``args.abstract`` is set)."""
    title, _authors, _groups, _keywords, _topics, abstract = row
    datum = Datum()
    datum.add_string("title", title)
    if args.abstract:
        datum.add_string("abstract", abstract)
    return datum
def train_and_predict(client, file):
    """Leave-one-out style train/predict over a HapMap-like TSV file.

    Rows whose ID column marks the CHB or JPT population are kept; the
    classifier is trained on every kept row except those matching the
    module-level ``exclude`` prefix, and the held-out sample is then
    classified, printing whether the prediction matches its true label.

    NOTE(review): relies on module-level globals ``number_of_epoch``,
    ``exclude``, ``numpy``, and ``random``.  If no row matches
    ``exclude``, ``predict_data``/``answer`` are never bound and the
    final classify raises NameError -- confirm the caller guarantees a
    match.  The parameter name ``file`` shadows the builtin.
    """
    input_data = []
    number_of_samples = 0
    with open(file) as tsv:
        # First line is the header (feature name per column).
        line = tsv.readline()
        header = line[:-1].split("\t")
        for line in tsv:
            # Characters 8-10 of the line encode the population code.
            if line[8:11] == 'CHB':
                input_data.append(line)
                number_of_samples += 1
            elif line[8:11] == 'JPT':
                input_data.append(line)
                number_of_samples += 1
            else:
                continue
    shuffled_numbers = numpy.arange(number_of_samples)
    for epoch in range(number_of_epoch):
        # Reshuffle sample order each epoch (online learners are
        # order-sensitive).  NOTE(review): shuffling three times has no
        # effect beyond a single shuffle.
        random.shuffle(shuffled_numbers)
        random.shuffle(shuffled_numbers)
        random.shuffle(shuffled_numbers)
        for i in shuffled_numbers:
            fields = input_data[i][:-1].split("\t")
            if fields[0][0:7] == exclude:
                # Held-out sample: build its Datum but never train on it.
                predict_data = []
                predict = {}
                answer = fields[0][8:11]
                for j in range(1, len(fields)):
                    fields[j] = float(fields[j])
                    predict.update({header[j]: fields[j]})
                predict_data.append((Datum(predict)))
            else:
                # Train immediately on this single sample.
                train_data = []
                trains = {}
                for j in range(1, len(fields)):
                    fields[j] = float(fields[j])
                    trains.update({header[j]: fields[j]})
                train_data.append((fields[0][8:11], Datum(trains)))
                client.train(train_data)
    # After all epochs, classify the held-out sample once.
    result = client.classify([predict_data[0]])
    predicted = max(result[0], key=lambda x: x.score).label
    if answer == predicted:
        print('correct', end="\t")
    else:
        print('wrong', end="\t")
    print(answer, predicted, result, sep="\t")
def test(self):
    """Neighbor/similarity queries and save/load for NearestNeighbor."""
    nn = NearestNeighbor(CONFIG)
    # Two well-separated groups: 'a*' near 0, 'b*' near 10.
    for row_id, value in [('a0', 0), ('a1', 0.25), ('a2', 0.5), ('a3', 1),
                          ('b0', 10), ('b1', 10.25), ('b2', 10.5),
                          ('b3', 11)]:
        self.assertTrue(nn.set_row(row_id, Datum({'x': value})))

    def _check_prefix(expected, results):
        for item in results:
            self.assertTrue(isinstance(item, IdWithScore))
            self.assertTrue(item.id.startswith(expected))

    all_ids = set(['a0', 'a1', 'a2', 'a3', 'b0', 'b1', 'b2', 'b3'])
    # Every query should return 3 hits, all from the queried group.
    queries = [
        ('a', nn.neighbor_row_from_id("a0", 3)),
        ('a', nn.neighbor_row_from_datum(Datum({'x': 0.25}), 3)),
        ('b', nn.similar_row_from_id("b3", 3)),
        ('b', nn.similar_row_from_datum(Datum({'x': 11}), 3)),
    ]
    for prefix, hits in queries:
        self.assertEqual(3, len(hits))
        _check_prefix(prefix, hits)
    self.assertEqual(all_ids, set(nn.get_all_rows()))
    self.assertEqual(CONFIG, json.loads(nn.get_config()))
    model = nn.save_bytes()
    nn = NearestNeighbor(CONFIG)
    nn.load_bytes(model)
    # Rows must survive the serialization round trip.
    self.assertEqual(all_ids, set(nn.get_all_rows()))
    status = nn.get_status()
    self.assertTrue(isinstance(status, dict))
    self.assertEqual(['embedded'], list(status.keys()))
    self.assertTrue(isinstance(status['embedded'], dict))
def test_add_bulk(self):
    """add_bulk() assigns sequential string ids; fit() stores the same rows."""
    anomaly = Anomaly(CONFIG)
    data = [Datum({'x': v})
            for v in (0.0999, 0.1, -0.1009, -0.101, 0.1011)]
    ret = anomaly.add_bulk(data)
    expected_ids = [str(i) for i in range(5)]
    self.assertEqual(expected_ids, ret)
    self.assertEqual(set(ret), set(anomaly.get_all_rows()))
    # Same points via the sklearn-style fit() entry point.
    anomaly = Anomaly(CONFIG)
    anomaly.fit(np.array([[d.num_values[0][1]] for d in data]))
    self.assertEqual(expected_ids, ret)
    self.assertEqual(set(ret), set(anomaly.get_all_rows()))
def test(self):
    """Clustering push/query round trip, save/load, and status shape."""
    clu = Clustering(CONFIG)
    self.assertEqual(0, clu.get_revision())
    # Two tight groups: a-c around 1.0, d-f around 5.0.
    points = [IndexedPoint(name, Datum({'x': value}))
              for name, value in zip('abcdef',
                                     [1.0, 0.9, 1.1, 5.0, 4.9, 5.1])]
    self.assertTrue(clu.push(points))
    self.assertEqual(1, clu.get_revision())
    centers = clu.get_k_center()
    self.assertTrue(isinstance(centers, list))
    self.assertEqual(2, len(centers))
    self.assertTrue(isinstance(centers[0], Datum))
    members = clu.get_core_members()
    self.assertTrue(isinstance(members, list))
    self.assertEqual(2, len(members))
    self.assertTrue(isinstance(members[0], list))
    self.assertTrue(isinstance(members[0][0], WeightedDatum))
    center = clu.get_nearest_center(Datum({'x': 1.05}))
    self.assertTrue(isinstance(center, Datum))
    # Nearest center to 1.05 must come from the first group.
    self.assertTrue(0.9 <= center.num_values[0][1] <= 1.1)
    near = clu.get_nearest_members(Datum({'x': 1.05}))
    self.assertTrue(isinstance(near, list))
    self.assertTrue(isinstance(near[0], WeightedDatum))
    light = clu.get_core_members_light()
    self.assertTrue(isinstance(light, list))
    self.assertTrue(isinstance(light[0], list))
    self.assertTrue(isinstance(light[0][0], WeightedIndex))
    near_light = clu.get_nearest_members_light(Datum({'x': 1.05}))
    self.assertTrue(isinstance(near_light, list))
    self.assertTrue(isinstance(near_light[0], WeightedIndex))
    model = clu.save_bytes()
    clu = Clustering(CONFIG)
    clu.load_bytes(model)
    # Revision and centers must survive the serialization round trip.
    self.assertEqual(CONFIG, json.loads(clu.get_config()))
    self.assertEqual(1, clu.get_revision())
    self.assertEqual(len(centers), len(clu.get_k_center()))
    status = clu.get_status()
    self.assertTrue(isinstance(status, dict))
    self.assertEqual(['embedded'], list(status.keys()))
    self.assertTrue(isinstance(status['embedded'], dict))
def predict_min(l):
    """Classify record *l* and return (label, score) of the LOWEST-scoring
    candidate (note: min, not max, by design of this helper)."""
    datum = Datum({
        u'article': l['article'],
        u'HeadLine': l['HeadLine'],
    })
    res = client.classify([datum])
    worst = min(res[0], key=lambda c: c.score)
    return worst.label, worst.score