def __init__(self, host="localhost", port=8080, sname="", description="",
             mllib="caffe", service_parameters_input=None,
             service_parameters_mllib=None, service_parameters_output=None,
             model=None, tmp_dir=None):
    """Register (or re-register) a service on a DeepDetect server.

    Parameters
    ----------
    host, port : location of the DeepDetect server.
    sname : service name; if empty a unique "pyDD_<timestamp>" name is
        generated and also used as the description.
    description : free-text description of the service.
    mllib : backend library name (e.g. "caffe", "xgboost").
    service_parameters_input / service_parameters_mllib /
    service_parameters_output : dicts forwarded verbatim to the
        service-creation call.
    model : dict describing the model; if it has no (non-empty)
        "repository" entry, a temporary repository directory is created.
        Defaults to an empty dict.
    tmp_dir : parent directory for an auto-created repository.
    """
    self.host = host
    self.port = port
    self.sname = sname
    # BUGFIX: the documented default (model=None) crashed below on
    # `"repository" not in self.model`; normalize to an empty dict.
    self.model = model if model is not None else {}
    self.description = description
    self.mllib = mllib
    self.tmp_dir = tmp_dir
    self.service_parameters_input = service_parameters_input
    self.service_parameters_mllib = service_parameters_mllib
    self.service_parameters_output = service_parameters_output
    self.n_pred = 0
    self.n_fit = 0
    self.calls = []
    self.answers = []
    # self.train_logs = None
    super(AbstractModels, self).__init__(self.host, self.port)

    if self.sname:
        # A service with the same name must be dropped before re-creation.
        for service in self.get_info()['head']['services']:
            if service['name'] == self.sname.lower():  # DD lowercases services' name
                self.delete_service(self.sname, clear="mem")
    else:
        self.sname = "pyDD_{}".format(time_utils.fulltimestamp())
        self.description = self.sname

    # Check if a repository is given otherwise creates one
    if "repository" not in self.model or not self.model["repository"]:
        self.repository = tempfile.mkdtemp(prefix="pydd_", dir=self.tmp_dir)
        self.model["repository"] = self.repository
        os_utils._create_dirs([self.model["repository"]])
    else:
        assert os.path.exists(self.model["repository"]), \
            "{} does not exist".format(self.model["repository"])

    json_dump = self.create_service(self.sname, self.model, self.description,
                                    self.mllib,
                                    self.service_parameters_input,
                                    self.service_parameters_mllib,
                                    self.service_parameters_output)
    self.answers.append(json_dump)

    # BUGFIX: json.loads() lost its `encoding` parameter in Python 3.9;
    # the file is one JSON document per line.
    with open("{}/model.json".format(self.model["repository"])) as f:
        self.calls = [json.loads(line) for line in f]
class XGB(AbstractModels):
    """XGBoost service wrapper.

    # General parameters:
    Parameter      Type    Optional  Default         Description
    objective      string  yes       multi:softprob  objective function, among multi:softprob, binary:logistic, reg:linear, reg:logistic
    booster        string  yes       gbtree          which booster to use, gbtree or gblinear
    num_feature    int     yes       set by xgbbost  maximum dimension of the feature
    eval_metric    string  yes       obj dependant   evaluation metric internal to xgboost
    base_score     double  yes       0.5             initial prediction score, global bias
    seed           int     yes       0               random number seed
    iterations     int     no        N/A             number of boosting iterations
    test_interval  int     yes       1               number of iterations between each testing pass
    save_period    int     yes       0               number of iterations between model saving to disk

    # Booster parameters
    Parameter         Type    Optional  Default  Description
    eta               double  yes       0.3      step size shrinkage
    gamma             double  yes       0        minimum loss reduction
    max_depth         int     yes       6        maximum depth of a tree
    min_child_weight  int     yes       1        minimum sum of instance weight
    max_delta_step    int     yes       0        maximum delta step
    subsample         double  yes       1.0      subsample ratio of traning instance
    colsample         double  yes       1.0      subsample ratio of columns when contructing each tree
    lambda            double  yes       1.0      L2 regularization term on weights
    alpha             double  yes       0.0      L1 regularization term on weights
    lambda_bias       double  yes       0.0      L2 regularization for linear booster
    tree_method       string  yes       auto     tree construction algorithm, from auto, exact, approx

    # Example calls
    curl -X PUT "http://localhost:8080/services/affxgb" -d "{\"mllib\":\"xgboost\",\"description\":\"classification service\",\"type\":\"supervised\",\"parameters\":{\"input\":{\"connector\":\"svm\"},\"mllib\":{\"nclasses\":2}},\"model\":{\"repository\":\"/path/to/model\"}}"
    curl -X POST "http://localhost:8080/train" -d
    "{\"service\":\"testxgb\",\"async\":true,\"parameters\":{\"mllib\":{\"iterations\":100,\"test_interval\":10,\"objective\":\"binary:logistic\",\"booster_params\":{\"max_depth\":30}},\"input\":{},\"output\":{\"measure\":[\"auc\",\"mcll\",\"f1\"]}},\"data\":[\"/path/to/X_train.svm\",\"/path/to/X_test.svm\"]}"
    """

    def __init__(self, host="localhost", port=8080, sname="", description="",
                 repository="", connector="svm", mllib="xgboost", nclasses=2,
                 ntargets=None, tmp_dir=None):
        """Build the service parameters and register the service."""
        self.host = host
        self.port = port
        self.sname = sname
        self.mllib = mllib
        self.description = description
        self.repository = repository
        self.connector = connector
        self.nclasses = nclasses
        self.ntargets = ntargets
        self.tmp_dir = tmp_dir

        self.model = {"repository": self.repository}
        self.service_parameters_mllib = {"nclasses": self.nclasses,
                                         "ntargets": self.ntargets}
        self.service_parameters_input = {"connector": self.connector}
        self.service_parameters_output = {}

        super(XGB, self).__init__(host=self.host, port=self.port,
                                  sname=self.sname,
                                  description=self.description,
                                  mllib=self.mllib,
                                  service_parameters_input=self.service_parameters_input,
                                  service_parameters_output=self.service_parameters_output,
                                  service_parameters_mllib=self.service_parameters_mllib,
                                  model=self.model, tmp_dir=self.tmp_dir)

    def fit(self, train_data, validation_data=None, objective="multi:softprob",
            booster="gbtree", eval_metric="auc", base_score=0.5, seed=0,
            nthread=-1, iterations=10, test_interval=1, save_period=0,
            metrics=None, async_=True, display_metric_interval=1,
            **booster_params):
        """Launch a training call on `train_data` (plus optional validation sets).

        `train_data` is a pyDD connector: either an "svm" connector (has a
        `.path`) or an "array" connector (has `.X`, `.Y`), as shown by the
        dispatch on `train_data.name` below.  Returns the training logs.

        BUGFIX: the parameter was originally named `async`, which is a
        reserved keyword since Python 3.7 (SyntaxError); renamed `async_`.
        """
        # BUGFIX: avoid shared mutable default arguments.
        validation_data = validation_data if validation_data is not None else []
        metrics = metrics if metrics is not None else ["auc", "acc"]

        # BUGFIX: trailing commas originally turned the input/output
        # parameter dicts into 1-tuples.
        self.train_parameters_input = {}
        self.train_parameters_output = {"measure": metrics}
        self.train_parameters_mllib = {
            "objective": objective,
            "booster": booster,
            "nthread": nthread,
            "eval_metric": eval_metric,
            "base_score": base_score,
            "seed": seed,
            "iterations": iterations,
            "test_interval": test_interval,
            "save_period": save_period,
            "booster_params": booster_params,
        }

        self.data = []
        if train_data.name == "svm":
            # svm connectors already point at files on disk.
            self.data.append(train_data.path)
            if validation_data:
                for connector in validation_data:
                    self.data.append(connector.path)
        elif train_data.name == "array":
            # Array connectors are dumped to svmlight files inside the
            # model repository before training.
            train_f = os.path.join(
                self.repository,
                "x_train_{}.svm".format(time_utils.fulltimestamp()))
            dump_svmlight_file(train_data.X, train_data.Y, train_f)
            self.data.append(train_f)
            if validation_data:
                for i, connector in enumerate(validation_data):
                    valid_f = os.path.join(
                        self.repository,
                        "x_val{}_{}.svm".format(i, time_utils.fulltimestamp()))
                    dump_svmlight_file(connector.X, connector.Y, valid_f)
                    self.data.append(valid_f)

        self.train_logs = self._train(self.data, self.train_parameters_input,
                                      self.train_parameters_mllib,
                                      self.train_parameters_output,
                                      display_metric_interval, async_)
        return self.train_logs
def __init__(self, host='localhost', port=8080, sname='', mllib='caffe',
             description='', repository='', templates='../templates/caffe',
             connector='svm', nclasses=None, ntargets=None, gpu=False,
             gpuid=0, template='mlp', layers=None, activation='relu',
             dropout=0.5, regression=False, finetuning=False, db=True):
    """Create a caffe MLP service on a DeepDetect server.

    All constructor arguments are kept both as attributes and in
    `self.params` (used elsewhere, e.g. for cloning).  If `sname` is
    empty, a unique "pyDD_MLP_<timestamp>" name is generated; otherwise
    any existing service of that name is deleted first.  When no
    `repository` is supplied, a temporary model/data directory pair is
    created.
    """
    # BUGFIX: `layers=[50]` was a shared mutable default argument.
    layers = layers if layers is not None else [50]

    self.host = host
    self.port = port
    self.sname = sname
    self.mllib = mllib
    self.description = description
    self.repository = repository
    self.templates = templates
    self.connector = connector
    self.nclasses = nclasses
    self.ntargets = ntargets
    self.gpu = gpu
    self.gpuid = gpuid
    self.template = template
    self.layers = layers
    self.activation = activation
    self.dropout = dropout
    self.regression = regression
    self.finetuning = finetuning
    self.db = db

    self.params = {
        'host': self.host,
        'port': self.port,
        'sname': self.sname,
        'mllib': self.mllib,
        'description': self.description,
        'repository': self.repository,
        'templates': self.templates,
        'connector': self.connector,
        'nclasses': self.nclasses,
        'ntargets': self.ntargets,
        'gpu': self.gpu,
        'gpuid': self.gpuid,
        'template': self.template,
        'layers': self.layers,
        'activation': self.activation,
        'dropout': self.dropout,
        'regression': self.regression,
        'finetuning': self.finetuning,
        'db': self.db,
    }

    self.n_pred = 0
    self.n_fit = 0
    self.calls = []
    self.answers = []

    self.model = {
        'templates': self.templates,
        'repository': self.repository
    }
    self.service_parameters_mllib = {
        'nclasses': self.nclasses,
        'ntargets': self.ntargets,
        'gpu': self.gpu,
        'gpuid': self.gpuid,
        'template': self.template,
        'layers': self.layers,
        'activation': self.activation,
        'dropout': self.dropout,
        'regression': self.regression,
        'finetuning': self.finetuning,
        'db': self.db
    }
    self.service_parameters_input = {'connector': self.connector}
    self.service_parameters_output = {}

    super(genericMLP, self).__init__(self.host, self.port)

    if self.sname == '':
        self.sname = "pyDD_MLP_{}".format(time_utils.fulltimestamp())
        self.description = self.sname
    else:
        # Re-using a name: drop the previous service (from memory only).
        self.delete_service(self.sname, "mem")

    tmp_dir = tempfile.mkdtemp()
    self.data_folder = "{}/data".format(tmp_dir)
    if self.model['repository'] == '':
        self.model['repository'] = "{}/model".format(tmp_dir)
    os_utils._create_dirs([self.model['repository'], self.data_folder])

    json_dump = self.create_service(self.sname, self.model, self.description,
                                    self.mllib,
                                    self.service_parameters_input,
                                    self.service_parameters_mllib,
                                    self.service_parameters_output)
    self.answers.append(json_dump)

    # BUGFIX: json.loads() lost its `encoding` parameter in Python 3.9.
    with open("{}/model.json".format(self.model['repository'])) as f:
        self.calls = [json.loads(line) for line in f]
# NOTE(review): this is an interior fragment of an LMDB-aware fit() method —
# its `def` line and the body of the trailing `if train_data.lmdb_path:` are
# outside this view, so it cannot be safely reformatted or rewritten in
# isolation; left byte-identical.  It mirrors XGB.fit's svm/array dispatch,
# additionally symlinking validation lmdb folders into the repository, and
# the trailing `async` positional argument is a reserved keyword on
# Python >= 3.7 — presumably fixed wherever the enclosing def is edited.
if validation_data: if len(validation_data) == 1: test_data = validation_data[0] self.data.append(test_data.path) if test_data.lmdb_path: os.symlink(test_data.lmdb_path, os.path.join(self.repository, "test.lmdb")) else: for i, connector in enumerate(validation_data): self.data.append(connector.path) if connector.lmdb_path: os.symlink(connector.lmdb_path, os.path.join(self.repository, "test_{}.lmdb".format(i+1))) elif train_data.name == "array": train_f = os.path.join(self.repository, "x_train_{}.svm".format(time_utils.fulltimestamp())) dump_svmlight_file(train_data.X, train_data.Y, train_f) self.data.append(train_f) if validation_data: for i, conn in enumerate(validation_data): valid_f = os.path.join(self.repository, "x_val{}_{}.svm".format(i, time_utils.fulltimestamp())) dump_svmlight_file(conn.X, conn.Y, valid_f) self.data.append(valid_f) self.train_logs = self._train(self.data, self.train_parameters_input, self.train_parameters_mllib, self.train_parameters_output, display_metric_interval, async) if train_data.lmdb_path:
def fit(self, X, Y=None, validation_data=None, iterations=100,
        test_interval=None, solver_type='SGD', base_lr=0.1, lr_policy=None,
        stepsize=None, momentum=None, weight_decay=None, power=None,
        gamma=None, iter_size=1, batch_size=128, metrics=None,
        class_weights=None):
    """Train the service on X/Y and poll the server until training ends.

    X may be a numpy array / scipy sparse matrix (dumped to an svmlight
    file together with Y), a list of svm file paths, or a single path.
    `validation_data` is a list of (x_val, y_val) pairs when X is an
    array, otherwise the extra paths are expected inside X itself.
    """
    # BUGFIX: avoid shared mutable default arguments.
    validation_data = validation_data if validation_data is not None else []
    metrics = metrics if metrics is not None else ['mcll', 'accp']

    self.filepaths = []
    if isinstance(X, np.ndarray) or sparse.issparse(X):
        train_f = os.path.join(
            self.data_folder,
            "x_train_{}.svm".format(time_utils.fulltimestamp()))
        dump_svmlight_file(X, Y, train_f)
        self.filepaths.append(train_f)
        if len(validation_data) > 0:
            for i, (x_val, y_val) in enumerate(validation_data):
                valid_f = os.path.join(
                    self.data_folder,
                    "x_val{}_{}.svm".format(i, time_utils.fulltimestamp()))
                dump_svmlight_file(x_val, y_val, valid_f)
                self.filepaths.append(valid_f)
    elif isinstance(X, list):
        self.filepaths = X
    elif isinstance(X, str):
        self.filepaths = [X]
    else:
        # BUGFIX: a bare `raise` with no active exception is a
        # RuntimeError; raise a meaningful error instead.
        raise TypeError(
            "X must be an array, a sparse matrix, a list of svm file "
            "paths or a single path, got {}".format(type(X)))

    # db: True otherwise core dump when training on svm data
    # BUGFIX: trailing commas originally turned these dicts into 1-tuples.
    self.train_parameters_input = {'db': True}
    self.train_parameters_output = {"measure": metrics}
    self.train_parameters_mllib = {
        'gpu': self.service_parameters_mllib['gpu'],
        'solver': {
            'iterations': iterations,
            'test_interval': test_interval,
            'base_lr': base_lr,
            'solver_type': solver_type,
            'lr_policy': lr_policy,
            'stepsize': stepsize,
            'momentum': momentum,
            'weight_decay': weight_decay,
            'power': power,
            'gamma': gamma,
            'iter_size': iter_size
        },
        'net': {
            'batch_size': batch_size
        },
        # Uniform weights by default.
        'class_weights': class_weights if class_weights
        else [1.] * self.service_parameters_mllib['nclasses']
    }

    if self.n_fit > 0:
        # Refitting: the service must be recreated, and 'template' may
        # only be sent on the very first creation.
        self.delete_service(self.sname, "mem")
        if 'template' in self.service_parameters_mllib:
            self.service_parameters_mllib.pop('template')
        self.create_service(self.sname, self.model, self.description,
                            self.mllib,
                            self.service_parameters_input,
                            self.service_parameters_mllib,
                            self.service_parameters_output)

    # BUGFIX: `async` is a reserved keyword since Python 3.7; pass it via
    # **kwargs so the server-side parameter name is preserved.
    json_dump = self.post_train(self.sname, self.filepaths,
                                self.train_parameters_input,
                                self.train_parameters_mllib,
                                self.train_parameters_output,
                                **{"async": True})
    time.sleep(1)
    self.answers.append(json_dump)

    # BUGFIX: json.loads() lost its `encoding` parameter in Python 3.9.
    with open("{}/model.json".format(self.model['repository'])) as f:
        self.calls = [json.loads(line) for line in f]

    self.n_fit += 1

    # Poll the asynchronous training job until it stops running.
    train_status = ''
    while True:
        train_status = self.get_train(self.sname, job=1, timeout=2)
        if train_status['head']['status'] == 'running':
            print(train_status['body']['measure'])
        else:
            print(train_status)
            break