def evaluate(X_train, X_test, y_train, y_test, n_trials=4):
    """Compare Jubatus and scikit-learn linear classifiers on text data.

    For each of ``n_trials`` rounds the training set is reshuffled, the
    TF-IDF vectorizer is refitted, and every classifier is trained from
    scratch and scored on the test set. Each score is printed as soon as
    it is measured.

    Returns:
        dict mapping method name to its accuracy averaged over the trials.
    """
    jubatus_methods = ['perceptron', 'PA', 'PA1', 'PA2', 'CW', 'AROW', 'NHERD']
    sklearn_methods = [
        'Perceptron(sk)', 'PA1(sk)', 'PA2(sk)', 'LSVM(sk)', 'LR(sk)'
    ]
    totals = {name: 0 for name in jubatus_methods + sklearn_methods}
    tfidf = TfidfVectorizer()

    def _record(name, score):
        # Print the score and add it to the running total for this method.
        print('{0:.3f}\t{1}'.format(score, name))
        totals[name] += score

    for _ in range(n_trials):
        X_train, y_train = shuffle(X_train, y_train, random_state=42)
        train_matrix = tfidf.fit_transform(X_train)
        test_matrix = tfidf.transform(X_test)

        # Jubatus classifiers consume the raw text wrapped in Datum objects.
        for name in jubatus_methods:
            model = linear_classifier(method=name)
            labeled = [
                (label, Datum({'message': text}))
                for text, label in zip(X_train, y_train)
            ]
            queries = [Datum({'message': text}) for text in X_test]
            model.train(labeled)
            ranked = model.classify(queries)
            guesses = [
                max(candidates, key=lambda c: c.score).label
                for candidates in ranked
            ]
            _record(name, accuracy_score(y_test, guesses))

        # The other classifiers consume the TF-IDF matrices.
        for name in sklearn_methods:
            model = sklearn_linear_classifier(method=name)
            model.fit(train_matrix, y_train)
            _record(name, accuracy_score(y_test, model.predict(test_matrix)))

    return {name: total / n_trials for name, total in totals.items()}
def test():
    """Exercise the Recommender API: update, complete, similarity, dump/load."""
    def _assert_completed(row):
        # A completed/decoded row must be a Datum holding both numeric keys.
        assert isinstance(row, Datum)
        numbers = dict(row.num_values)
        assert numbers.get('x', None) is not None and numbers.get('y', None) is not None

    rec = Recommender(RECOMMENDER_CONFIG)
    rec.update_row('0', Datum({'x': 0.9, 'y': 4.9}))
    rec.update_row('1', Datum({'x': 1, 'y': 5}))
    rec.update_row('2', Datum({'x': 1.15, 'y': 5.15}))
    rec.update_row('3', Datum({'x': 1.2, 'y': 5.1}))
    rec.update_row('4', Datum({'x': 1.05}))

    _assert_completed(rec.complete_row_from_datum(Datum({'x': 1.1})))
    _assert_completed(rec.complete_row_from_id('4'))

    similar = rec.similar_row_from_id('2', 3)
    assert isinstance(similar, list)
    assert isinstance(similar[0], IdWithScore)

    similar = rec.similar_row_from_datum(Datum({'y': 5.05}), 3)
    assert isinstance(similar, list)
    assert isinstance(similar[0], IdWithScore)

    _assert_completed(rec.decode_row('0'))

    assert set(rec.get_all_rows()) == {str(n) for n in range(5)}

    similarity = rec.calc_similarity(Datum({'x': 1}), Datum({'y': 5}))
    assert isinstance(similarity, float)

    norm = rec.calc_l2norm(Datum({'x': 1, 'y': 5}))
    assert isinstance(norm, float)

    rec.clear()
    assert len(rec.get_all_rows()) == 0

    assert json.loads(rec.get_config())

    # Only check that dump/load completes without raising.
    snapshot = rec.dump()
    rec.load(snapshot)
def test_get_nearest_members_light(self):
    """After pushing 100 points, the light nearest-members query returns WeightedIndex items."""
    for n in range(100):
        point = IndexedPoint(str(n), Datum({"nkey1": n, "nkey2": -n}))
        self.cli.push([point])
    query = Datum({"nkey1": 2.0, "nkey2": 1.0})
    members = self.cli.get_nearest_members_light(query)
    self.assertTrue(isinstance(members[0], WeightedIndex))
def test_num(self):
    """Numeric features: train, classify, manage labels and round-trip via bytes."""
    clf = Classifier(CONFIG)
    trained = clf.train([
        ('Y', Datum({'x': 1})),
        ('N', Datum({'x': -1})),
    ])
    self.assertEqual(2, trained)

    def _check_classification(model):
        # Positive x must rank 'Y' first, negative x must rank 'N' first.
        results = model.classify([Datum({'x': 1}), Datum({'x': -1})])
        top_labels = [
            sorted(candidates, key=lambda c: c.score, reverse=True)[0].label
            for candidates in results
        ]
        self.assertEqual(['Y', 'N'], top_labels)
        self.assertEqual(model.get_labels(), {'N': 1, 'Y': 1})

    _check_classification(clf)
    saved = clf.save_bytes()

    # Clearing drops all labels; they can then be re-registered and deleted.
    clf.clear()
    self.assertEqual({}, clf.get_labels())
    clf.set_label('Y')
    clf.set_label('N')
    self.assertEqual({'N': 0, 'Y': 0}, clf.get_labels())
    clf.delete_label(u'Y')
    self.assertEqual({'N': 0}, clf.get_labels())

    # A fresh instance restored from the saved bytes behaves like the original.
    clf = Classifier(CONFIG)
    clf.load_bytes(saved)
    _check_classification(clf)
    self.assertEqual(CONFIG, json.loads(clf.get_config()))
def test(self):
    """Weight: update/calc_weight return the expected features and survive save/load."""
    weight = Weight(CONFIG)
    sample = Datum({'n0': 1, 'n1': 2, 'n2': 3, 't0': 'hello world'})
    for features in [weight.update(sample), weight.calc_weight(sample)]:
        self.assertEqual(5, len(features))
        for feature in features:
            self.assertTrue(isinstance(feature, Feature))

        by_key = {feature.key: feature.value for feature in features}
        self.assertEqual(5, len(by_key))
        self.assertEqual(1.0, by_key['n0@num'])
        self.assertAlmostEqual(math.log(2), by_key['n1@log'])
        self.assertEqual(1.0, by_key['n2@str$3'])

    for text in ('hello world', 'foo bar', 'hello'):
        weight.update(Datum({'t1': text}))

    probe = Datum({'t1': 'foo bar hello world hoge'})
    before = {f.key: f.value for f in weight.calc_weight(probe)}

    # Restore into a fresh instance and expect identical weights.
    saved = weight.save_bytes()
    weight = Weight(CONFIG)
    weight.load_bytes(saved)
    self.assertEqual(CONFIG, json.loads(weight.get_config()))
    after = {f.key: f.value for f in weight.calc_weight(probe)}
    self.assertEqual(before, after)
def test_get_nearest_center(self):
    """After pushing 100 points, the nearest center of a query is a Datum."""
    for n in range(100):
        point = IndexedPoint(str(n), Datum({"nkey1": n, "nkey2": -n}))
        self.cli.push([point])
    query = Datum({"nkey1": 2.0, "nkey2": 1.0})
    center = self.cli.get_nearest_center(query)
    self.assertTrue(isinstance(center, Datum))
def test_loadsave(self):
    """Save a trained classifier to /tmp, load it into a new instance and classify."""
    clf = Classifier(CONFIG)
    clf.train([
        LabeledDatum('Y', Datum({'x': 'y'})),
        LabeledDatum('N', Datum({'x': 'n'})),
    ])

    path = '/tmp/127.0.0.1_0_classifier_hoge.jubatus'

    def _cleanup():
        # Best-effort removal: the file may legitimately not exist.
        try:
            os.remove(path)
        except Exception:
            pass

    _cleanup()
    try:
        saved_paths = clf.save('hoge')
        self.assertEqual(
            {'127.0.0.1_0': '/tmp/127.0.0.1_0_classifier_hoge.jubatus'},
            saved_paths)
        self.assertTrue(os.path.isfile(path))

        clf = Classifier(CONFIG)
        self.assertTrue(clf.load('hoge'))
        results = clf.classify([Datum({'x': 'y'}), Datum({'x': 'n'})])
        top_labels = [
            sorted(candidates, key=lambda c: c.score, reverse=True)[0].label
            for candidates in results
        ]
        self.assertEqual(['Y', 'N'], top_labels)
    finally:
        _cleanup()
def _test_classify(x):
    # Helper used as a closure: `self` comes from the enclosing test method.
    # Checks that x=1 ranks 'Y' first, x=-1 ranks 'N' first, and that each
    # label has a count of one.
    results = x.classify([Datum({'x': 1}), Datum({'x': -1})])
    top_labels = [
        sorted(candidates, key=lambda c: c.score, reverse=True)[0].label
        for candidates in results
    ]
    self.assertEqual(['Y', 'N'], top_labels)
    self.assertEqual(x.get_labels(), {'N': 1, 'Y': 1})
def make_datum(data, headers):
    """Build a Datum from one data row and a list of headers.

    Every header is added as a numeric feature whose value is
    ``data[header]``.
    """
    datum = Datum()
    for header in headers:
        datum.add_number(header, data[header])
    return datum
def test(self):
    """Regression: train, estimate, round-trip via bytes and inspect get_status()."""
    reg = Regression(CONFIG)
    samples = [
        ScoredDatum(0.0, Datum({'x': 1.0})),
        ScoredDatum(1.0, Datum({'x': 2.0})),
        ScoredDatum(2.0, Datum({'x': 4.0})),
        ScoredDatum(3.0, Datum({'x': 8.0})),
        ScoredDatum(4.0, Datum({'x': 16.0})),
    ]
    self.assertEqual(5, reg.train(samples))

    estimates = reg.estimate([
        Datum({'x': 32.0}),
        Datum({'x': 1.5}),
    ])
    self.assertEqual(2, len(estimates))
    self.assertTrue(8.0 <= estimates[0] < 9.0)
    self.assertTrue(0.0 <= estimates[1] < 1.0)

    self.assertEqual(CONFIG, json.loads(reg.get_config()))

    # A model restored from bytes must reproduce the same estimates.
    saved = reg.save_bytes()
    reg = Regression(CONFIG)
    reg.load_bytes(saved)
    restored = reg.estimate([
        Datum({'x': 32.0}),
        Datum({'x': 1.5}),
    ])
    self.assertEqual(estimates, restored)

    # The status is a dict with a single 'embedded' entry that is itself a dict.
    status = reg.get_status()
    self.assertTrue(isinstance(status, dict))
    self.assertEqual(len(status), 1)
    self.assertEqual(list(status.keys())[0], 'embedded')
    self.assertTrue(isinstance(status['embedded'], dict))
def test(self):
    """Regression: train on five samples, estimate, and round-trip via bytes."""
    reg = Regression(CONFIG)
    samples = [
        ScoredDatum(0.0, Datum({'x': 1.0})),
        ScoredDatum(1.0, Datum({'x': 2.0})),
        ScoredDatum(2.0, Datum({'x': 4.0})),
        ScoredDatum(3.0, Datum({'x': 8.0})),
        ScoredDatum(4.0, Datum({'x': 16.0})),
    ]
    self.assertEqual(5, reg.train(samples))

    estimates = reg.estimate([
        Datum({'x': 32.0}),
        Datum({'x': 1.5}),
    ])
    self.assertEqual(2, len(estimates))
    self.assertTrue(8.0 <= estimates[0] < 9.0)
    self.assertTrue(0.0 <= estimates[1] < 1.0)

    self.assertEqual(CONFIG, json.loads(reg.get_config()))

    # A model restored from bytes must reproduce the same estimates.
    saved = reg.save_bytes()
    reg = Regression(CONFIG)
    reg.load_bytes(saved)
    restored = reg.estimate([
        Datum({'x': 32.0}),
        Datum({'x': 1.5}),
    ])
    self.assertEqual(estimates, restored)
def test_classifier_str():
    """Train on two string-valued datums and expect each to get its own label back."""
    clf = Classifier(CLASSIFIER_CONFIG)
    clf.train([
        (u'Y', Datum({'x': u'y'})),
        (u'N', Datum({'x': u'n'})),
    ])
    results = clf.classify([Datum({'x': 'y'}), Datum({'x': 'n'})])
    top_labels = [
        sorted(candidates, key=lambda c: c.score, reverse=True)[0].label
        for candidates in results
    ]
    assert top_labels == ['Y', 'N']
def test_types(self):
    """classify() returns EstimateResult items when trained with LabeledDatum."""
    clf = Classifier(CONFIG)
    clf.train([
        LabeledDatum('Y', Datum({'x': 'y'})),
        LabeledDatum('N', Datum({'x': 'n'})),
    ])
    results = clf.classify([Datum({'x': 'y'}), Datum({'x': 'n'})])
    self.assertTrue(isinstance(results[0][0], EstimateResult))
    top_labels = [
        sorted(candidates, key=lambda c: c.score, reverse=True)[0].label
        for candidates in results
    ]
    self.assertEqual(['Y', 'N'], top_labels)
def test_str(self):
    """String features: train() reports two samples and classify() ranks the right label first."""
    clf = Classifier(CONFIG)
    trained = clf.train([
        ('Y', Datum({'x': 'y'})),
        ('N', Datum({'x': 'n'})),
    ])
    self.assertEqual(2, trained)
    results = clf.classify([Datum({'x': 'y'}), Datum({'x': 'n'})])
    top_labels = [
        sorted(candidates, key=lambda c: c.score, reverse=True)[0].label
        for candidates in results
    ]
    self.assertEqual(['Y', 'N'], top_labels)
def main():
    """Classify one randomly chosen datum and print the result and the label set.

    Connects to the classifier at 127.0.0.1 on the port given by the
    parsed command-line options.
    """
    args = parse_options()
    client = Classifier('127.0.0.1', args.port, 'test', 0)
    d = Datum()

    # Pick one of the two fixed feature values at random.
    rand = random.randint(0, 1)
    d.add_number('key', 1.0 if rand else 2.0)

    # print() with a single argument behaves the same on Python 2 and 3.
    print(client.classify([d]))
    print(client.get_labels())
def get_traindata(labels):
    """Load the first ``n_train`` JPEG files as labelled binary datums.

    File ``<index>.jpg`` is read from the module-level ``dir`` directory
    and paired with ``labels[index][1]``.

    Returns:
        list of ``[label, Datum]`` pairs.
    """
    pairs = []
    name_template = "{}.jpg"
    for idx in range(n_train):
        path = os.path.join(dir, name_template.format(idx))
        with open(path, "rb") as image_file:
            payload = image_file.read()
        tag = labels[idx][1]
        datum = Datum()
        datum.add_binary("image", payload)
        pairs.append([tag, datum])
    print("num of train data :", len(pairs))
    return pairs
def convert(self, args):
    """Turn a flat ``[key, value, key, value, ...]`` list into a Datum.

    A value that parses as a float is added as a number; anything else is
    added as a string.

    Returns:
        tuple ``(len(args), datum)``.

    Raises:
        ValueError: if ``args`` has an odd length, i.e. the last key has
            no value.
    """
    count = len(args)
    if count % 2 != 0:
        raise ValueError('value for the last datum key ({0}) is missing'.format(args[count - 1]))

    datum = Datum()
    for pos in range(0, int(count / 2) * 2, 2):
        key, raw = args[pos], args[pos + 1]
        try:
            datum.add_number(key, float(raw))
        except ValueError:
            # Not numeric: keep the raw value as a string feature.
            datum.add_string(key, raw)
    return (count, datum)
def get_traindata(labels):
    """Load the first ``n_train`` JPEG files as labelled binary datums.

    File ``<index>.jpg`` is located by appending the name to the
    module-level ``dir`` prefix and paired with ``labels[index][1]``.
    Each label/index pair is printed while loading.

    Returns:
        list of ``[label, Datum]`` pairs.
    """
    traindata = []
    for index in range(n_train):
        img = dir + str(index) + ".jpg"
        with open(img, "rb") as f:
            binary = f.read()
        label = labels[index][1]
        # Formatted print() calls give the same output on Python 2 and 3,
        # unlike the former print statements, which are Python 2 only.
        print("{0} {1}".format(label, index))
        d = Datum()
        d.add_binary("image", binary)
        traindata.append([label, d])
    print("num of train data : {0}".format(len(traindata)))
    return traindata
def get_testdata(labels):
    """Load images ``n_train`` .. ``n_train + n_test - 1`` as binary datums.

    File ``<index>.jpg`` is located by appending the name to the
    module-level ``dir`` prefix.

    Returns:
        list of single-element lists ``[Datum]``.
    """
    testdata = []
    # NOTE(review): the labels are collected but never returned — confirm
    # whether callers were meant to receive them.
    testlabels = []
    for index in range(n_train, n_train + n_test):
        img = dir + str(index) + ".jpg"
        with open(img, "rb") as f:
            binary = f.read()
        d = Datum()
        d.add_binary("image", binary)
        testdata.append([d])
        testlabels.append(labels[index][1])
    # A formatted print() call gives the same output on Python 2 and 3,
    # unlike the former print statement, which is Python 2 only.
    print("num of test data : {0}".format(len(testdata)))
    return testdata
def get_traindata(labels):
    """Read ``n_train`` images from ``dir`` and pair each with its label.

    The image for position ``i`` is ``<i>.jpg`` and its label is
    ``labels[i][1]``.

    Returns:
        list of ``[label, Datum]`` pairs, where each Datum holds the raw
        file bytes under the ``"image"`` key.
    """
    samples = []
    pattern = "{}.jpg"
    for position in range(n_train):
        image_path = os.path.join(dir, pattern.format(position))
        with open(image_path, "rb") as handle:
            raw = handle.read()
        label = labels[position][1]
        datum = Datum()
        datum.add_binary("image", raw)
        samples.append([label, datum])
    print("num of train data :", len(samples))
    return samples
def predict(client):
    """Classify three given names and output the best label for each.

    For every name, a ``"<label> <name>"`` line is emitted through
    ``_output``.
    """
    # Given names of the last shoguns to predict.
    queries = [
        Datum({'name': '慶喜'}),
        Datum({'name': '義昭'}),
        Datum({'name': '守時'}),
    ]
    for query in queries:
        ranked = client.classify([query])
        # The highest-scoring estimate is the predicted shogun name.
        best = max(ranked[0], key=lambda estimate: estimate.score)
        given_name = query.string_values[0][1]
        _output('{0} {1}\n'.format(best.label, given_name))
def test():
    """Exercise the Clustering API: push, centers, members, nearest queries, dump/load."""
    clustering = Clustering(CLUSTERING_CONFIG)

    # The revision goes from 0 to 1 after one push.
    assert clustering.get_revision() == 0
    points = [Datum({'x': value}) for value in (1.0, 0.9, 1.1, 5.0, 4.9, 5.1)]
    assert clustering.push(points)
    assert clustering.get_revision() == 1

    centers = clustering.get_k_center()
    assert isinstance(centers, list) and len(centers) == 2
    assert isinstance(centers[0], Datum)

    members = clustering.get_core_members()
    assert isinstance(members, list) and len(members) == 2
    assert isinstance(members[0], list)
    assert isinstance(members[0][0], WeightedDatum)

    nearest = clustering.get_nearest_center(Datum({'x': 1.05}))
    assert isinstance(nearest, Datum)
    assert 0.9 <= nearest.num_values[0][1] <= 1.1

    nearest = clustering.get_nearest_members(Datum({'x': 1.05}))
    assert isinstance(nearest, list)
    assert isinstance(nearest[0], WeightedDatum)

    assert json.loads(clustering.get_config())

    # Only check that dump/load completes without raising.
    snapshot = clustering.dump()
    clustering.load(snapshot)
def get_testdata(labels, data):
    """Read one 28*28-byte record per label from ``data`` as binary datums.

    Args:
        labels: only its length is used, to decide how many records to read.
        data: path of the binary file to read.

    Returns:
        list of single-element lists ``[Datum]``.
    """
    testdata = []
    with open(data, "rb") as f:
        for _ in range(len(labels)):
            binary = f.read(28 * 28)
            d = Datum()
            d.add_binary("image", binary)
            testdata.append([d])
    # A formatted print() call gives the same output on Python 2 and 3,
    # unlike the former print statement, which is Python 2 only.
    print("num of test data : {0}".format(len(testdata)))
    return testdata
def predict(client):
    """Classify three given names and write ``"<label> <name>"`` lines to stdout."""
    # Given names of the last shoguns to predict.
    given_names = [u'慶喜', u'義昭', u'守時']
    queries = [Datum({'name': given_name}) for given_name in given_names]
    for query in queries:
        ranked = client.classify([query])
        # The highest-scoring estimate is the predicted shogun name.
        best = max(ranked[0], key=lambda estimate: estimate.score)
        sys.stdout.write(best.label)
        sys.stdout.write(' ')
        sys.stdout.write(query.string_values[0][1].encode('utf-8'))
        sys.stdout.write('\n')
def get_testdata(labels):
    """Load images ``n_train`` .. ``n_train + n_test - 1`` from ``dir`` as binary datums.

    The labels ``labels[index][1]`` are gathered alongside but only the
    datums are returned.

    Returns:
        list of single-element lists ``[Datum]``.
    """
    batches = []
    seen_labels = []
    pattern = "{}.jpg"
    first, last = n_train, n_train + n_test
    for idx in range(first, last):
        path = os.path.join(dir, pattern.format(idx))
        with open(path, "rb") as image_file:
            payload = image_file.read()
        datum = Datum()
        datum.add_binary("image", payload)
        batches.append([datum])
        seen_labels.append(labels[idx][1])
    print("num of test data :", len(batches))
    return batches
def get_testdata(labels):
    """Read the ``n_test`` images that follow the training range as binary datums.

    Image ``<index>.jpg`` is read from ``dir`` for every index in
    ``[n_train, n_train + n_test)``. The matching ``labels[index][1]``
    values are looked up but not returned.

    Returns:
        list of single-element lists ``[Datum]``.
    """
    wrapped = []
    label_log = []
    name_template = "{}.jpg"
    for position in range(n_train, n_train + n_test):
        image_path = os.path.join(dir, name_template.format(position))
        with open(image_path, "rb") as handle:
            raw = handle.read()
        datum = Datum()
        datum.add_binary("image", raw)
        wrapped.append([datum])
        label_log.append(labels[position][1])
    print("num of test data :", len(wrapped))
    return wrapped
def get_traindata(labels, data):
    """Read one 28*28-byte record per label from ``data`` as labelled datums.

    Args:
        labels: sequence of labels; the i-th label is paired with the i-th
            record of the file.
        data: path of the binary file to read.

    Returns:
        list of ``[label, Datum]`` pairs.
    """
    traindata = []
    with open(data, "rb") as f:
        for label in labels:
            binary = f.read(28 * 28)
            d = Datum()
            d.add_binary("image", binary)
            traindata.append([label, d])
    # A formatted print() call gives the same output on Python 2 and 3,
    # unlike the former print statement, which is Python 2 only.
    print("num of train data : {0}".format(len(traindata)))
    return traindata
def test_get_core_members_light(self):
    """After pushing 100 points there are 10 core-member groups of WeightedIndex items."""
    for n in range(100):
        point = IndexedPoint(str(n), Datum({"nkey1": n, "nkey2": -n}))
        self.cli.push([point])
    groups = self.cli.get_core_members_light()
    self.assertEqual(len(groups), 10)
    self.assertTrue(isinstance(groups[0][0], WeightedIndex))
def test_get_k_center(self):
    """After pushing 100 points, get_k_center() returns 10 Datum centers."""
    for n in range(100):
        point = IndexedPoint(str(n), Datum({"nkey1": n, "nkey2": -n}))
        self.cli.push([point])
    centers = self.cli.get_k_center()
    self.assertEqual(len(centers), 10)
    self.assertTrue(isinstance(centers[0], Datum))
def test_decode_row(self):
    """A row written with update_row() is returned unchanged by decode_row()."""
    row_id = "decode_row"
    self.cli.clear_row(row_id)
    original = Datum({"skey1": "val1", "skey2": "val2", "nkey1": 1.0, "nkey2": 2.0})
    self.cli.update_row(row_id, original)
    decoded = self.cli.decode_row(row_id)
    # Compare through JSON so the string and numeric parts are checked by value.
    self.assertEqual(json.dumps(original.string_values),
                     json.dumps(decoded.string_values))
    self.assertEqual(json.dumps(original.num_values),
                     json.dumps(decoded.num_values))
def test_pack(self):
    """to_msgpack() splits a Datum into (strings, numbers, binaries), with ints stored as floats."""
    expected = msgpack.packb(([['name', 'Taro']], [['age', 20.0]], []))
    datum = Datum({
        'name': 'Taro',
        'age': 20
    })
    actual = msgpack.packb(datum.to_msgpack())
    self.assertEqual(expected, actual)
def on_status(self, status):
    """Train the classifier with a geotagged tweet labelled by its location.

    The status is ignored when it has no text, no usable coordinates, or
    when none of ``self.locations`` contains its coordinates. Otherwise
    the hashtag-stripped text is sent as training data under the name of
    the first matching location, and the trained tweet is printed.
    """
    # Guard clauses: only geotagged statuses with text are usable.
    if not (hasattr(status, 'text') and hasattr(status, 'coordinates')):
        return
    geo = status.coordinates
    if not geo or 'coordinates' not in geo:
        return

    # The first location containing the point wins.
    point = geo['coordinates']
    loc = next(
        (candidate for candidate in self.locations
         if candidate.is_inside(point[0], point[1])),
        None)
    if not loc:
        # Unknown location
        return

    hashtags = status.entities['hashtags']
    detagged_text = remove_hashtags_from_tweet(status.text, hashtags)

    # Create the datum and send it to Jubatus as training data.
    datum = Datum({'text': detagged_text})
    self.classifier.train([(loc.name, datum)])

    # Print the trained tweet.
    print_green(loc.name, ' ')
    print(detagged_text)
def add_data(num=10):
    """Generate ``num`` points, add each to the anomaly server and print its id and score."""
    points = generate_data(num)
    anomaly = jubatus.Anomaly(HOST, PORT_P, NAME)
    for point in points:
        datum = Datum({"x": point[0], "y": point[1], "z": point[2]})
        added = anomaly.add(datum)
        print('Added {0}, score = {1}'.format(added.id, added.score))
def predict(self, client):
    """Classify every ``'Value'`` obtained from ``preMongo`` and return the last label.

    Each record is classified in turn; only the best label of the final
    record is returned.

    Raises:
        KeyError: if ``preMongo().getDic()`` yields no records, since no
            result has been stored.
    """
    source = preMongo()
    records = source.getDic()
    queries = [Datum({'Value': records[key]['Value']}) for key in records]

    outcome = {}
    for query in queries:
        ranked = client.classify([query])
        value_text = str(query.num_values[0][1])
        best_label = max(ranked[0], key=lambda estimate: estimate.score).label
        # Later records overwrite earlier ones.
        outcome['Result'] = best_label
        outcome['Value'] = value_text
    return outcome['Result']
def main():
    """Train the classifier with one million random two-class samples.

    Each sample has a single numeric feature ``'key'`` that is 1.0 for
    label ``'Pos'`` and 2.0 for label ``'Neg'``. Progress is printed every
    10000 samples.
    """
    args = parse_options()
    client = Classifier('127.0.0.1', args.port, 'test', 0)

    for i in range(0, 1000000):
        d = Datum()

        # The label and the feature value are derived from the same coin flip.
        rand = random.randint(0, 1)
        d.add_number('key', 1.0 if rand else 2.0)
        ld = LabeledDatum('Pos' if rand else 'Neg', d)

        client.train([ld])

        if not i % 10000:
            # print() with a single argument behaves the same on Python 2 and 3.
            print('train ' + str(i) + ' data')
def test_add_string(self):
    """add_string() matches dict construction, for both str and unicode arguments."""
    expected = Datum({'key': 'value'}).to_msgpack()

    d = Datum()
    d.add_string('key', 'value')
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expected, d.to_msgpack())

    d = Datum()
    d.add_string(u'key', u'value')
    self.assertEqual(expected, d.to_msgpack())
def test_str(self):
    """str() of a Datum lists its string, numeric and binary values."""
    d = Datum()
    d.add_string('name', 'john')
    d.add_number('age', 20)
    d.add_binary('image', '0101')
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual('datum{string_values: [[\'name\', \'john\']], num_values: [[\'age\', 20.0]], binary_values: [[\'image\', \'0101\']]}', str(d))
def test_str(self):
    """str() of a Datum lists its values; the binary value may print with or without a b prefix."""
    datum = Datum()
    datum.add_string('name', 'john')
    datum.add_number('age', 20)
    datum.add_binary('image', b('0101'))
    rendered = str(datum)
    plain_form = 'datum{string_values: [[\'name\', \'john\']], num_values: [[\'age\', 20.0]], binary_values: [[\'image\', \'0101\']]}'
    bytes_form = 'datum{string_values: [[\'name\', \'john\']], num_values: [[\'age\', 20.0]], binary_values: [[\'image\', b\'0101\']]}'
    self.assertTrue(rendered in (plain_form, bytes_form))
def test_unpack(self):
    """from_msgpack() restores string, numeric and binary values as lists of tuples."""
    d = Datum.from_msgpack(([['name', 'Taro']], [['age', 20.0]], [['img', '0101']]))
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        [('name', 'Taro')],
        d.string_values)
    self.assertEqual(
        [('age', 20.0)],
        d.num_values)
    self.assertEqual(
        [('img', '0101')],
        d.binary_values)
def main():
    """Train a regression server ten times on the same datum, then print one estimate."""
    cl = Regression("127.0.0.1", 9199, "test")

    d = Datum()
    # range() instead of xrange(): xrange does not exist on Python 3, and
    # the rest of this function already uses the print() function.
    for i in range(10):
        # NOTE(review): the same Datum is reused, so 'x' and 'y' are added
        # again on every iteration — confirm this accumulation is intended.
        d.add_number('x', 1)
        d.add_number('y', 4)
        cl.train([[10.0, d]])

    d = Datum()
    d.add_number('x', 1)
    d.add_number('y', 4)
    result = cl.estimate([d])
    print("{0:30.30f}".format(result[0]))
def _juba_proc(self, clock, datadict, method="add"):
    """Convert ``datadict`` to a Datum and send it to the anomaly server.

    Args:
        clock: timestamp passed to ``cf.clock2strjst`` for log messages.
        datadict: mapping of feature name to value. ``hostid`` is scaled by
            ``ZBX_ITEMID_DIGITS``, ``weekday`` and ``hour`` are used as-is,
            and every other key goes through ``self.norm``.
        method: ``"add"`` adds the datum to the model; ``"calc"`` scores it
            and updates ``self.alarm_on``.

    Transport and timeout errors are retried, reconnecting between
    attempts; once ``JUBA_RETRY_MAX`` attempts have failed the last error
    is re-raised.
    """
    datum = Datum()
    for k in datadict:
        if k == "hostid":
            datum.add_number(str(k), int(datadict[k])*1.0/ZBX_ITEMID_DIGITS)
        elif k == "weekday" or k == "hour":
            datum.add_number(str(k), datadict[k])
        else:
            datum.add_number(str(k), self.norm(k, datadict[k]))

    # Fixed: the counter was initialised as `retry_cnt` but decremented as
    # `retry_count`, so the first transport error raised UnboundLocalError
    # instead of retrying.
    retry_count = JUBA_RETRY_MAX
    while True:
        try:
            if method == "add":
                print(datum)
                self.anom.add(datum)
                # NOTE(review): exit() ends the process right after the first
                # add; this looks like leftover debugging — confirm.
                exit()
            if method == "calc":
                print(datum)
                score = self.anom.calc_score(datum)
                if score == float('Inf') or score > ML_LIMIT:
                    # NOTE(review): assumes logging happens only when the
                    # alarm state changes — confirm against the original layout.
                    if self.alarm_on == False:
                        self.alarm_on = True
                        cf.log("[%s] score=%f" % (cf.clock2strjst(clock), score))
                else:
                    if self.alarm_on == True:
                        self.alarm_on = False
                        cf.log("[%s] score recovered to normal:score=%f" % (cf.clock2strjst(clock), score))
            break
        except (msgpackrpc.error.TransportError, msgpackrpc.error.TimeoutError) as e:
            retry_count -= 1
            if retry_count <= 0:
                raise
            # Drop the broken connection and reconnect before retrying.
            self.anom.get_client().close()
            self.set_anom()
            print(e)
            time.sleep(JUBA_RETRY_INTERVAL)
            continue
def main():
    """Store ten rows in a recommender server and print the rows most similar to a query."""
    cl = client.Recommender("localhost", 9199, "test")

    cl.clear()

    d = Datum()
    # range() instead of xrange(): xrange does not exist on Python 3, and
    # the rest of this function already uses the print() function.
    for i in range(10):
        # NOTE(review): the same Datum is reused, so 'x' and 'y' are added
        # again on every iteration — confirm this accumulation is intended.
        d.add_number('x', 1)
        d.add_number('y', 4)
        cl.update_row(str(i), d)

    d = Datum()
    d.add_number('x', 1)
    d.add_number('y', 4)
    predicts = cl.similar_row_from_datum(d, 5)
    print(predicts)
    for predict in predicts:
        print("{0} {1:30.30f}".format(predict.id, predict.score))
def main():
    """Train a classifier server ten times with one label, then print a classification."""
    cl = Classifier("localhost", 9199, "test")

    cl.clear()

    d = Datum()
    # range() instead of xrange(): xrange does not exist on Python 3, and
    # the rest of this function already uses the print() function.
    for i in range(10):
        # NOTE(review): the same Datum is reused, so 'x' and 'y' are added
        # again on every iteration — confirm this accumulation is intended.
        d.add_number('x', 1)
        d.add_number('y', 4)
        cl.train([LabeledDatum("label1", d)])

    d = Datum()
    d.add_number('x', 1)
    d.add_number('y', 4)
    predict = cl.classify([d])
    print(predict)
    # Only the first estimate of each result is printed.
    for score in predict:
        print("{0} {1:30.30f}".format(score[0].label, score[0].score))
print('Stop running the job.') sys.exit(0) if __name__ == '__main__': # 0. set KeyboardInterrupt handler signal.signal(signal.SIGINT, do_exit) # 1. set jubatus server anom = client.Anomaly("127.0.0.1", 9199, NAME) # 2. prepare training data with open('../kddcup.data_10_percent.txt', mode='r') as file: for line in file: duration, protocol_type, service, flag, src_bytes, dst_bytes, land, wrong_fragment, urgent, hot, num_failed_logins, logged_in, num_compromised, root_shell, su_attempted, num_root, num_file_creations, num_shells, num_access_files, num_outbound_cmds, is_host_login, is_guest_login, count, srv_count, serror_rate, srv_serror_rate, rerror_rate, srv_rerror_rate, same_srv_rate, diff_srv_rate, srv_diff_host_rate, dst_host_count, dst_host_srv_count, dst_host_same_srv_rate, dst_host_diff_srv_rate, dst_host_same_src_port_rate, dst_host_srv_diff_host_rate, dst_host_serror_rate, dst_host_srv_serror_rate, dst_host_rerror_rate, dst_host_srv_rerror_rate, label = line[:-1].split(",") datum = Datum() for (k, v) in [ ["protocol_type", protocol_type], ["service", service], ["flag", flag], ["land", land], ["logged_in", logged_in], ["is_host_login", is_host_login], ["is_guest_login", is_guest_login], ]: datum.add_string(k, v) for (k, v) in [ ["duration",float(duration)], ["src_bytes", float(src_bytes)], ["dst_bytes", float(dst_bytes)],
from jubatus.common import Datum
import jubatus

# Connect to the weight server.
client = jubatus.Weight("127.0.0.1", 9199, "")

# Build a datum with two numeric and two string features.
d = Datum()
for number_key, number in (("user/age", 25), ("user/income", 1000)):
    d.add_number(number_key, number)
for string_key, text in (("user/name", "Loren"), ("message", "Hello")):
    d.add_string(string_key, text)

# Ask the server for the feature weights and show them.
res = client.calc_weight(d)
print(res)
import jubatus
from jubatus.common import Datum
import random

cl = jubatus.Recommender('127.0.0.1', 9199, 'test', 0)

# Fixed seed so the random feature values are reproducible.
random.seed(1)
datum_length = 100


def _print_row_ids():
    # Show the ids of all rows currently stored on the server.
    print('ids:{}'.format(','.join(cl.get_all_rows())))


# Insert three rows of random features.
for i in range(3):
    d = Datum()
    for x in range(datum_length):
        d.add_number("{}".format(x), random.random())
    cl.update_row(str(i), d)
_print_row_ids()  # 1, 2, 3

# Insert a fourth row.
d = Datum()
for x in range(datum_length):
    d.add_number("{}".format(x), random.random())
cl.update_row('3', d)
_print_row_ids()  # unlearn 1 id

# Round-trip the model through save/clear/load.
cl.save('test')
cl.clear()
cl.load('test')
_print_row_ids()  # should be same as before `save`

d = Datum()
def gen_datum(filename):
    """Return a Datum holding the raw bytes of ``filename`` under the ``"image"`` key."""
    with open(filename, "rb") as image_file:
        payload = image_file.read()
    datum = Datum()
    datum.add_binary("image", payload)
    return datum
#-*- coding:utf-8 -*-
"""Try out the image feature extraction plugin."""
import jubatus
from jubatus.common import Datum

# Open the file in binary mode. "rb" is the conventional spelling; the
# former "br" is rejected by Python 2's open().
with open("./dataset/Lenna.jpg", "rb") as f:
    data = f.read()

# Wrap the raw image bytes in a datum and ask the server for its features.
d = Datum()
d.add_binary("image", data)
client = jubatus.Weight("127.0.0.1", 9199, "")
res = client.calc_weight(d)
print(res)
def make_datum():
    """Return a Datum with one string, one numeric and one binary value."""
    datum = Datum()
    datum.add_string('string-key', 'str')
    datum.add_number('number-key', 1.0)
    datum.add_binary('binary-key', b'bin')
    return datum
import sys, json
from jubatus.clustering import client
from jubatus.clustering import types
from jubatus.common import Datum

NAME = "clustering_compounds"

if __name__ == '__main__':
    clustering = client.Clustering("127.0.0.1", 9199, NAME)

    # Push every SMILES string of the input file to the clustering server.
    # The context manager closes the file, which the bare open() never did.
    with open("../../bench_data/demo4096.smi") as smiles_file:
        for line in smiles_file:
            # Each line is "<smiles> <id>"; the id is not used and is named
            # so that it no longer shadows the builtin id().
            smiles, _compound_id = line.split(" ")
            datum = Datum()
            datum.add_string("SMILES", smiles)
            clustering.push([datum])

    center_list = clustering.get_k_center()
    members = clustering.get_core_members()

    # Print the core members of the first four clusters.
    for i in range(0, 4):
        for j, member in enumerate(members[i]):
            # print() with a single argument behaves the same on Python 2 and 3.
            print("%d, %d, %s " % (i, j, member))
#!/usr/bin/env python
import random
import time

from jubatus.classifier.client import Classifier
from jubatus.classifier.types import LabeledDatum
from jubatus.common import Datum

# Build 100000 randomly labelled samples with 20 numeric features each.
# range() replaces xrange(), which does not exist on Python 3.
data = []
for i in range(0, 100000):
    d = Datum()
    for j in range(0, 20):
        # The sample index is part of the key, so feature keys differ per sample.
        d.add_number(str(j) + "-" + str(i), random.random() + 1.0)
    ld = LabeledDatum("Pos" if random.randint(0, 1) else "Neg", d)
    data.append(ld)

client = Classifier("127.0.0.1", 9199, "test", 0)

# Time a single bulk train() call.
start_time = time.time()
client.train(data)
end_time = time.time()

# print() with a single argument behaves the same on Python 2 and 3.
print(str(len(data)) + " ... " + str((end_time - start_time) * 1000) + " msec")
def test_add_binary(self):
    """add_binary() stores the pair in the third (binary) slot of to_msgpack()."""
    d = Datum()
    d.add_binary('key', b('value'))
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        ([], [], [['key', b('value')]]),
        d.to_msgpack())
def test_add_int(self):
    """add_number() with an int packs the same as a Datum built from the float value."""
    d = Datum()
    d.add_number('key', 1)
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(Datum({'key': 1.0}).to_msgpack(), d.to_msgpack())
# MongoDB connection and the collection handle for anomaly records.
con = MongoClient('172.16.4.84', 27017)
db = con.sensordb
col = db.anomaly

# 1. Connection settings for the Jubatus server
anom = client.Anomaly("127.0.0.1", 9199, NAME)

# 2. Prepare the training data
mongo_dic = convertMongo()
dic = mongo_dic.getDic()

name = ''
value = 0
for line in dic:
    record = dic[line]
    name = record['name']
    value = record['value']

    # Only the numeric 'value' field is sent as a feature.
    datum = Datum()
    for (k, v) in (('value', value),):
        datum.add_number(k, v)

    # 3. Train on the datum (update the learning model)
    ret = anom.add(datum)

    # 4. Output the result
import sys, json
from jubatus.clustering import client
from jubatus.clustering import types
from jubatus.common import Datum

NAME = "clustering_compounds"

if __name__ == '__main__':
    clustering = client.Clustering("127.0.0.1", 9199, NAME)

    # Look up the cluster center nearest to a fixed SMILES string.
    datum = Datum()
    datum.add_string("SMILES", "cccccccc")
    # print() with a single argument behaves the same on Python 2 and 3.
    print(clustering.get_nearest_center(datum))
#-*- coding: utf-8 -*-
import json, sys
import jubatus
from jubatus.common import Datum

# Maps NewsItemId to HeadLine for every feed that has been loaded.
headlines = {}
#keys = ["HeadLine", "DateLine", "Language", "DateId", "NewsItemId", "article", "Genre1", "Genre2"]
with open(sys.argv[1], "r") as f:
    client = jubatus.Recommender("127.0.0.1", 9199, "hoge", 0)
    # The `encoding` keyword was removed from json.load in Python 3.9 and
    # raised TypeError there; UTF-8 was already the default on Python 2.
    feeds = json.load(f)
    for feed in feeds:
        d = Datum()
        keys = list(feed.keys())
        headlines[feed["NewsItemId"]] = feed["HeadLine"]
        for key in keys:
            try:
                if key == "article":
                    # The article is a sequence of strings; store it as one text.
                    d.add_string(key, " ".join(feed[key]))
                elif key == "NewsItemId":
                    # The id becomes the row key instead of a feature.
                    article_id = feed[key].encode('utf-8')
                else:
                    d.add_string(key, feed[key].encode('utf-8'))
            except TypeError:
                print("ignore", key, " ".join(feed[key]))
            except AttributeError:
                # Values without .encode() (non-strings) are skipped.
                print("ignore", key, feed[key])
        client.update_row(article_id, d)
        res = client.similar_row_from_id(article_id, 10)
client.save("jubatus_hackathon")