def _fit_local(params, model_factory, list_of_parts, worker_addresses, return_model, local_listen_port=12400, listen_time_out=120, **kwargs):
    """Train a LightGBM model on this worker's local data parts.

    Joins the LightGBM socket network described by ``worker_addresses``,
    concatenates the local parts into single data/label (and optional
    sample-weight) objects, fits the model, and always frees the network.

    Returns the fitted model when ``return_model`` is true, else ``None``.
    """
    network_params = build_network_params(worker_addresses, get_worker().address, local_listen_port, listen_time_out)
    # Non-mutating merge: keep the caller's params dict untouched.
    params = {**params, **network_params}

    # Split the (X, y[, w]) tuples into parallel sequences of chunks.
    has_weight = len(list_of_parts[0]) == 3
    if has_weight:
        data_parts, label_parts, weight_parts = zip(*list_of_parts)
        weight = concat(weight_parts)
    else:
        data_parts, label_parts = zip(*list_of_parts)
        weight = None

    # Concatenate many parts into one training matrix / label vector.
    data = concat(data_parts)
    labels = concat(label_parts)

    try:
        classifier = model_factory(**params)
        classifier.fit(data, labels, sample_weight=weight)
    finally:
        # Release the distributed-training sockets even if fit() raises.
        _safe_call(_LIB.LGBM_NetworkFree())

    return classifier if return_model else None
def _train_part(params, model_factory, list_of_parts, worker_addresses, return_model, local_listen_port=12400, time_out=120, **kwargs):
    """Train a LightGBM model on this worker's local (X, y[, w]) parts.

    Joins the LightGBM socket network described by ``worker_addresses``,
    concatenates the local chunks, fits the model, and always frees the
    network afterwards.

    Returns the fitted model when ``return_model`` is true, else ``None``.
    """
    network_params = build_network_params(worker_addresses, get_worker().address, local_listen_port, time_out)
    # FIX: merge instead of params.update(network_params) — updating in
    # place mutated the caller's shared dict, leaking per-worker network
    # settings across calls (the sibling _fit_local already merges).
    params = {**params, **network_params}

    # Concatenate many parts into one
    parts = tuple(zip(*list_of_parts))
    data = concat(parts[0])
    label = concat(parts[1])
    # A third element, when present, carries per-sample weights.
    weight = concat(parts[2]) if len(parts) == 3 else None

    try:
        model = model_factory(**params)
        model.fit(data, label, sample_weight=weight)
    finally:
        # Release the distributed-training sockets even if fit() raises.
        _safe_call(_LIB.LGBM_NetworkFree())

    return model if return_model else None
def _train_part(params, model_factory, list_of_parts, worker_addresses, return_model, local_listen_port=12400, time_out=120, **kwargs):
    """Train a LightGBM model on local parts whose roles are named by
    ``kwargs['parts_list']`` (e.g. 'X', 'y', 'weight', 'valid_X', ...).

    Joins the LightGBM socket network, assembles training and (optional)
    first evaluation set from the named parts, fits the model, and always
    frees the network. Returns the model when ``return_model`` is true.
    """
    network_params = build_network_params(worker_addresses, get_worker().address, local_listen_port, time_out)
    # FIX: merge instead of params.update(network_params) — updating in
    # place mutated the caller's shared dict, leaking per-worker network
    # settings across calls.
    params = {**params, **network_params}

    # Concatenate many parts into one
    parts = tuple(zip(*list_of_parts))
    parts_list = kwargs['parts_list']  # hoisted: avoid repeated dict/index lookups

    def _concat_named(name):
        # Concatenate the chunks for a named role, or None when absent.
        return concat(parts[parts_list.index(name)]) if name in parts_list else None

    # 'X' and 'y' are required roles; the rest are optional.
    data = concat(parts[parts_list.index('X')])
    label = concat(parts[parts_list.index('y')])
    weight = _concat_named('weight')
    valid_X = _concat_named('valid_X')
    valid_y = _concat_named('valid_y')
    eval_sample_weight = _concat_named('eval_sample_weight')

    # only first eval_set supported
    kwargs = kwargs.copy()  # avoid contaminating upstream
    if valid_X is not None and valid_y is not None:
        kwargs['eval_set'] = [(valid_X, valid_y)]
        kwargs['eval_sample_weight'] = [eval_sample_weight]
    # parts_list is bookkeeping only — never forward it to model.fit().
    kwargs.pop('parts_list', None)

    try:
        model = model_factory(**params)
        model.fit(data, label, sample_weight=weight, **kwargs)
    finally:
        # Release the distributed-training sockets even if fit() raises.
        _safe_call(_LIB.LGBM_NetworkFree())

    return model if return_model else None
def _fit_local(params, model_factory, list_of_parts, worker_addresses, local_listen_port=12400, listen_time_out=120, **kwargs):
    """Fit a LightGBM model on this worker's local (X, y) parts.

    Joins the LightGBM socket network, concatenates the local chunks into
    one data/label pair, trains, frees the network, and returns the model.
    """
    network_params = build_network_params(worker_addresses, get_worker().address, local_listen_port, listen_time_out)
    merged_params = {**params, **network_params}

    # Prepare data: merge the many local chunks into single objects.
    data_parts, label_parts = zip(*list_of_parts)
    data = concat(data_parts)
    labels = concat(label_parts)

    try:
        classifier = model_factory(**merged_params)
        classifier.fit(data, labels)
    finally:
        # Always tear the socket network down, even when fit() raises.
        _safe_call(_LIB.LGBM_NetworkFree())

    return classifier
def execute(cls, ctx, op: "LGBMTrain"):
    """Execute one LGBMTrain chunk: fit a LightGBM model on this worker's data.

    Merge chunks delegate to the parent implementation. Otherwise the
    worker reads its chunk data from ``ctx``, joins the LightGBM socket
    network when running distributed, fits the model and stores the
    pickled result under the output key. The network is always freed.
    """
    if op.merge:
        # Merge stage is handled by the base class, not trained here.
        return super().execute(ctx, op)

    from lightgbm.basic import _safe_call, _LIB

    data_val = ctx[op.data.key]
    # Unwrap the sparse wrapper to the raw scipy matrix when present.
    data_val = data_val.spmatrix if hasattr(data_val, 'spmatrix') else data_val
    label_val = ctx[op.label.key]
    sample_weight_val = ctx[
        op.sample_weight.key] if op.sample_weight is not None else None
    init_score_val = ctx[
        op.init_score.key] if op.init_score is not None else None

    if op.eval_datas is None:
        eval_set, eval_sample_weight, eval_init_score = None, None, None
    else:
        # Collect evaluation (data, label) pairs plus per-set weights
        # and initial scores; missing entries stay None.
        eval_set, eval_sample_weight, eval_init_score = [], [], []
        for data, label in zip(op.eval_datas, op.eval_labels):
            data_eval = ctx[data.key]
            data_eval = data_eval.spmatrix if hasattr(
                data_eval, 'spmatrix') else data_eval
            eval_set.append((data_eval, ctx[label.key]))
        for weight in op.eval_sample_weights:
            eval_sample_weight.append(
                ctx[weight.key] if weight is not None else None)
        for score in op.eval_init_scores:
            eval_init_score.append(
                ctx[score.key] if score is not None else None)
        # Normalize empty lists to None so LightGBM skips evaluation.
        eval_set = eval_set or None
        eval_sample_weight = eval_sample_weight or None
        eval_init_score = eval_init_score or None

    params = op.params.copy()
    # if model is trained, remove unsupported parameters
    params.pop('out_dtype_', None)

    if ctx.running_mode == RunningMode.distributed:
        # Build the "machines" endpoint list LightGBM needs for
        # socket-based distributed training.
        worker_ports = ctx[op.worker_ports.key]
        worker_ips = [worker.split(':', 1)[0] for worker in op.workers]
        worker_endpoints = [
            f'{worker}:{port}'
            for worker, port in zip(worker_ips, worker_ports)
        ]
        params['machines'] = ','.join(worker_endpoints)
        params['time_out'] = op.timeout
        params['num_machines'] = len(worker_endpoints)
        params['local_listen_port'] = worker_ports[op.worker_id]

        # Distributed training requires a valid tree_learner; fall back
        # to "data" parallelism when unset or invalid.
        if (op.tree_learner or '').lower() not in {'data', 'feature', 'voting'}:
            logger.warning(
                'Parameter tree_learner not set or set to incorrect value '
                f'{op.tree_learner}, using "data" as default')
            params['tree_learner'] = 'data'
        else:
            params['tree_learner'] = op.tree_learner

    try:
        model_cls = get_model_cls_from_type(op.model_type)
        model = model_cls(**params)
        model.fit(data_val, label_val, sample_weight=sample_weight_val,
                  init_score=init_score_val, eval_set=eval_set,
                  eval_sample_weight=eval_sample_weight,
                  eval_init_score=eval_init_score, **op.kwds)

        # Record the prediction dtype so downstream predict() knows the
        # output type: float for ranker/regressor, else the label dtype.
        if op.model_type == LGBMModelType.RANKER or \
                op.model_type == LGBMModelType.REGRESSOR:
            model.set_params(out_dtype_=np.dtype('float'))
        elif hasattr(label_val, 'dtype'):
            model.set_params(out_dtype_=label_val.dtype)
        else:
            # DataFrame-like labels expose dtypes instead of dtype.
            model.set_params(out_dtype_=label_val.dtypes[0])
        ctx[op.outputs[0].key] = pickle.dumps(model)
    finally:
        # Always free the LightGBM network, even if fit() raises.
        _safe_call(_LIB.LGBM_NetworkFree())
def execute(cls, ctx, op: "LGBMTrain"):
    """Execute one LGBMTrain chunk: fit a LightGBM model on this worker's data.

    Merge chunks delegate to the parent implementation. Otherwise the
    worker reads its chunk data from ``ctx``, joins the LightGBM socket
    network, fits the model and stores the pickled result under the
    output key. The network is always freed afterwards.
    """
    if op.merge:
        # Merge stage is handled by the base class, not trained here.
        return super().execute(ctx, op)

    from lightgbm.basic import _safe_call, _LIB

    data_val = ctx[op.data.key]
    # Unwrap the sparse wrapper to the raw scipy matrix when present.
    data_val = data_val.spmatrix if hasattr(data_val, "spmatrix") else data_val
    label_val = ctx[op.label.key]
    sample_weight_val = (ctx[op.sample_weight.key]
                         if op.sample_weight is not None else None)
    init_score_val = ctx[
        op.init_score.key] if op.init_score is not None else None

    if op.eval_datas is None:
        eval_set, eval_sample_weight, eval_init_score = None, None, None
    else:
        # Collect evaluation (data, label) pairs plus per-set weights
        # and initial scores; missing entries stay None.
        eval_set, eval_sample_weight, eval_init_score = [], [], []
        for data, label in zip(op.eval_datas, op.eval_labels):
            data_eval = ctx[data.key]
            data_eval = (data_eval.spmatrix if hasattr(
                data_eval, "spmatrix") else data_eval)
            eval_set.append((data_eval, ctx[label.key]))
        for weight in op.eval_sample_weights:
            eval_sample_weight.append(
                ctx[weight.key] if weight is not None else None)
        for score in op.eval_init_scores:
            eval_init_score.append(
                ctx[score.key] if score is not None else None)
        # Normalize empty lists to None so LightGBM skips evaluation.
        eval_set = eval_set or None
        eval_sample_weight = eval_sample_weight or None
        eval_init_score = eval_init_score or None

    params = op.params.copy()
    # if model is trained, remove unsupported parameters
    params.pop("out_dtype_", None)

    # Build the "machines" endpoint list LightGBM needs for
    # socket-based distributed training.
    worker_ports = ctx[op.worker_ports.key]
    worker_ips = [worker.split(":", 1)[0] for worker in op.workers]
    worker_endpoints = [
        f"{worker}:{port}"
        for worker, port in zip(worker_ips, worker_ports)
    ]
    params["machines"] = ",".join(worker_endpoints)
    params["time_out"] = op.timeout
    params["num_machines"] = len(worker_endpoints)
    params["local_listen_port"] = worker_ports[op.worker_id]

    # Distributed training requires a valid tree_learner; fall back
    # to "data" parallelism when unset or invalid.
    if (op.tree_learner or "").lower() not in {"data", "feature", "voting"}:
        logger.warning(
            "Parameter tree_learner not set or set to incorrect value "
            f'{op.tree_learner}, using "data" as default')
        params["tree_learner"] = "data"
    else:
        params["tree_learner"] = op.tree_learner

    try:
        model_cls = get_model_cls_from_type(op.model_type)
        model = model_cls(**params)
        model.fit(
            data_val,
            label_val,
            sample_weight=sample_weight_val,
            init_score=init_score_val,
            eval_set=eval_set,
            eval_sample_weight=eval_sample_weight,
            eval_init_score=eval_init_score,
            **op.kwds,
        )

        # Record the prediction dtype so downstream predict() knows the
        # output type: float for ranker/regressor, else the label dtype.
        if (op.model_type == LGBMModelType.RANKER
                or op.model_type == LGBMModelType.REGRESSOR):
            model.set_params(out_dtype_=np.dtype("float"))
        elif hasattr(label_val, "dtype"):
            model.set_params(out_dtype_=label_val.dtype)
        else:
            # DataFrame-like labels expose dtypes instead of dtype.
            model.set_params(out_dtype_=label_val.dtypes[0])
        ctx[op.outputs[0].key] = pickle.dumps(model)
    finally:
        # Always free the LightGBM network, even if fit() raises.
        _safe_call(_LIB.LGBM_NetworkFree())
def execute(cls, ctx, op: "LGBMTrain"):
    """Execute one LGBMTrain chunk: fit a LightGBM model on this worker's data.

    Merge chunks delegate to the parent implementation. Otherwise the
    worker reads its chunk data from ``ctx``, joins the LightGBM socket
    network when running distributed, fits the model and stores the
    pickled result under the output key. The network is always freed.
    """
    if op.merge:
        # Merge stage is handled by the base class, not trained here.
        return super().execute(ctx, op)

    from lightgbm.basic import _safe_call, _LIB

    data_val = ctx[op.data.key]
    label_val = ctx[op.label.key]
    sample_weight_val = ctx[
        op.sample_weight.key] if op.sample_weight is not None else None
    init_score_val = ctx[
        op.init_score.key] if op.init_score is not None else None

    if op.eval_datas is None:
        eval_set, eval_sample_weight, eval_init_score = None, None, None
    else:
        # Collect evaluation (data, label) pairs plus per-set weights
        # and initial scores; missing entries stay None.
        eval_set, eval_sample_weight, eval_init_score = [], [], []
        for data, label in zip(op.eval_datas, op.eval_labels):
            eval_set.append((ctx[data.key], ctx[label.key]))
        for weight in op.eval_sample_weights:
            eval_sample_weight.append(
                ctx[weight.key] if weight is not None else None)
        for score in op.eval_init_scores:
            eval_init_score.append(
                ctx[score.key] if score is not None else None)
        # Normalize empty lists to None so LightGBM skips evaluation.
        eval_set = eval_set or None
        eval_sample_weight = eval_sample_weight or None
        eval_init_score = eval_init_score or None

    params = op.params.copy()
    if ctx.running_mode == RunningMode.distributed:
        # Endpoints/port pre-computed by the operator for socket training.
        params['machines'] = ','.join(op.lgbm_endpoints)
        params['time_out'] = op.timeout
        params['num_machines'] = len(op.lgbm_endpoints)
        params['local_listen_port'] = op.lgbm_port

        # Distributed training requires a valid tree_learner; fall back
        # to "data" parallelism when unset or invalid.
        if (op.tree_learner or '').lower() not in {'data', 'feature', 'voting'}:
            # FIX: pass the argument lazily to the logger instead of
            # eagerly %-formatting the message string at the call site.
            logger.warning(
                'Parameter tree_learner not set or set to incorrect value %s, '
                'using "data" as default', op.tree_learner)
            params['tree_learner'] = 'data'
        else:
            params['tree_learner'] = op.tree_learner

    try:
        model_cls = get_model_cls_from_type(op.model_type)
        model = model_cls(**params)
        model.fit(data_val, label_val, sample_weight=sample_weight_val,
                  init_score=init_score_val, eval_set=eval_set,
                  eval_sample_weight=eval_sample_weight,
                  eval_init_score=eval_init_score, **op.kwds)

        # Record the prediction dtype so downstream predict() knows the
        # output type (DataFrame-like labels expose dtypes, not dtype).
        if hasattr(label_val, 'dtype'):
            model.set_params(out_dtype_=label_val.dtype)
        else:
            model.set_params(out_dtype_=label_val.dtypes[0])
        ctx[op.outputs[0].key] = pickle.dumps(model)
    finally:
        # Always free the LightGBM network, even if fit() raises.
        _safe_call(_LIB.LGBM_NetworkFree())