def save(self, dirname):
    self._check_if_fitted()
    mkdir(dirname)
    # Persist the tree structure together with the class bookkeeping.
    save_tree(self.tree, self.node_to_class, self.node_to_classes,
              self.class_maps, join(dirname, 'tree'))
    # Save each per-node model and record its file name.
    models_dct = {}
    models_dirname = join(dirname, 'models')
    mkdir(models_dirname)
    for node_id, model in self.models.items():
        fname = f'model{node_id}'
        model.save(join(models_dirname, fname))
        models_dct[node_id] = fname
    with open(join(models_dirname, 'models_fnames.yaml'), 'w', encoding='utf-8') as file:
        yaml_dump(models_dct, file)
    # Save each per-node encoder with joblib and record its file name.
    encoders_dct = {}
    encoders_dirname = join(dirname, 'encoders')
    mkdir(encoders_dirname)
    for node_id, encoder in self.encoders.items():
        fname = f'encoder{node_id}.sav'
        joblib_dump(encoder, join(encoders_dirname, fname))
        encoders_dct[node_id] = fname
    with open(join(encoders_dirname, 'encoders_fnames.yaml'), 'w', encoding='utf-8') as file:
        yaml_dump(encoders_dct, file)
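
# A minimal load() counterpart sketch, assuming the on-disk layout written by
# save() above. load_tree, load_model, yaml_load, and joblib_load are
# hypothetical mirrors of the aliases used in save(); only the YAML and
# joblib round-trips themselves are standard behavior.
def load(self, dirname):
    # Restore the tree structure and the class bookkeeping.
    self.tree, self.node_to_class, self.node_to_classes, self.class_maps = \
        load_tree(join(dirname, 'tree'))
    # Restore per-node models from the recorded file names.
    models_dirname = join(dirname, 'models')
    with open(join(models_dirname, 'models_fnames.yaml'), encoding='utf-8') as file:
        models_dct = yaml_load(file)
    self.models = {node_id: load_model(join(models_dirname, fname))
                   for node_id, fname in models_dct.items()}
    # Restore per-node encoders with joblib.
    encoders_dirname = join(dirname, 'encoders')
    with open(join(encoders_dirname, 'encoders_fnames.yaml'), encoding='utf-8') as file:
        encoders_dct = yaml_load(file)
    self.encoders = {node_id: joblib_load(join(encoders_dirname, fname))
                     for node_id, fname in encoders_dct.items()}
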
def main():
    print("[INFO] Starting train script...")
    print("[INFO] Loading the data...")
    X, y = load_data()
    print("[INFO] Training the model...")
    pipeline = get_model_pipeline()
    pipeline.fit(X, y)
    print("[INFO] Saving the model pipeline artifact...")
    joblib_dump(pipeline, 'model/model_v1.joblib')
    print("[INFO] Train script ended")
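
# A minimal inference-side sketch, assuming the artifact written by main()
# above. joblib.load restores the fitted pipeline; X_new is a placeholder for
# whatever feature matrix the pipeline was trained on.
from joblib import load as joblib_load

def predict(X_new):
    pipeline = joblib_load('model/model_v1.joblib')
    return pipeline.predict(X_new)
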
def saveDependencies(cls, model, filename, joblib=False):
    # Export the installed library versions; assumes every frozen
    # requirement is pinned in name==version form.
    lib = list(freeze(local_only=True))
    dependencies = {li.split('==')[0]: li.split('==')[1] for li in lib}
    with open('dependencies.json', 'w') as outfile:
        json.dump(dependencies, outfile)
    if joblib:
        # Serialize objects holding NumPy arrays efficiently via joblib.
        joblib_dump(model, filename)
    else:
        # Serialize standard Python objects with pickle; use a context
        # manager so the file handle is closed deterministically.
        with open(filename, 'wb') as f:
            dump(model, f)
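
# A minimal load counterpart sketch for saveDependencies(): the joblib branch
# round-trips with joblib.load, the default branch with pickle.load. The name
# loadModel is hypothetical.
import pickle
from joblib import load as joblib_load

def loadModel(filename, joblib=False):
    if joblib:
        return joblib_load(filename)
    with open(filename, 'rb') as infile:
        return pickle.load(infile)
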
def train_one_model_per_frequency(model_dirname, frequency,
                                  num_epochs=DEFAULT_EPOCHS,
                                  batch_size=DEFAULT_BATCH_SIZE):
    model_frequency_dirname = os_path_join(model_dirname, 'k_' + str(frequency))
    os_mkdir(model_frequency_dirname)
    print('train_one_model_per_frequency. model_dirname={}, frequency={}'
          .format(model_dirname, frequency))
    X_all, y_all = get_training_data()
    # Select the slice that belongs to this frequency bin.
    X_all_frequency = X_all[:, frequency, :, :]
    y_all_frequency = y_all[:, frequency, :, :]
    print('train_one_model_per_frequency: got data')
    (X_frequency_train, X_frequency_valid, _), \
        (y_frequency_train, y_frequency_valid, _) = \
        split_examples(X_all_frequency, y_all_frequency)
    # Flatten the two trailing axes into a single feature vector per example.
    X_frequency_train = X_frequency_train.reshape(
        -1, X_frequency_train.shape[-2] * X_frequency_train.shape[-1])
    X_frequency_valid = X_frequency_valid.reshape(
        -1, X_frequency_valid.shape[-2] * X_frequency_valid.shape[-1])
    y_frequency_train = y_frequency_train.reshape(
        -1, y_frequency_train.shape[-2] * y_frequency_train.shape[-1])
    y_frequency_valid = y_frequency_valid.reshape(
        -1, y_frequency_valid.shape[-2] * y_frequency_valid.shape[-1])
    print('X_frequency_train.shape =', X_frequency_train.shape)
    print('y_frequency_train.shape =', y_frequency_train.shape)
    # Scale inputs to [0, 1] ahead of the Keras regressor in the pipeline.
    min_max_scaler = MinMaxScaler()
    model = KerasRegressor(build_fn=get_model_keras, epochs=num_epochs,
                           batch_size=batch_size, verbose=1)
    estimators = []
    # estimators.append(('standardize', StandardScaler()))
    estimators.append(('standardize', min_max_scaler))
    estimators.append(('mlp', model))
    pipeline = Pipeline(estimators)
    # kfold = KFold(n_splits=10, random_state=DEFAULT_RANDOM_SEED)
    print('train_one_model_per_frequency: begin training frequency={}'.format(frequency))
    # results = cross_val_score(pipeline, X_frequency_train, y_frequency_train,
    #                           cv=kfold, verbose=1, error_score='raise')
    # print("Larger: %.4f (%.4f) MSE" % (results.mean(), results.std()))
    # Fit, then export the regressor pipeline to a file.
    pipeline.fit(X_frequency_train, y_frequency_train)
    model_k_save_path = os_path_join(model_frequency_dirname, MODEL_DATA_SAVE_FNAME)
    joblib_dump(pipeline, model_k_save_path)
    prediction = pipeline.predict(X_frequency_valid)
    print('mean_squared_error(y_valid, prediction) =',
          mean_squared_error(y_frequency_valid, prediction))
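
# A minimal sketch of reading one per-frequency pipeline back, assuming the
# k_<frequency>/MODEL_DATA_SAVE_FNAME layout written above. Note that
# Keras-backed estimators do not always survive a joblib round-trip; whether
# this works depends on the KerasRegressor wrapper in use.
from joblib import load as joblib_load

def load_one_model_per_frequency(model_dirname, frequency):
    model_k_save_path = os_path_join(model_dirname, 'k_' + str(frequency),
                                     MODEL_DATA_SAVE_FNAME)
    return joblib_load(model_k_save_path)
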
def serialize(filename, obj, format=DEFAULT_FORMAT):
    if format & JOBLIB_FORMAT:
        if not has_joblib:
            raise RuntimeError(
                'Missing library. Format (JOBLIB_FORMAT) not available.')
        # joblib handles compression itself; combining with BZIP2_FORMAT
        # requests compression level 3.
        joblib_dump(obj, filename, compress=3 if format & BZIP2_FORMAT else 0)
        return
    if format & BZIP2_FORMAT:
        open_fn = bz2.BZ2File
    else:
        open_fn = open
    with open_fn(filename, 'wb') as f:
        if format & PICKLE_FORMAT:
            pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
        elif format & YAML_FORMAT:
            if not has_yaml:
                raise RuntimeError(
                    'Missing library. Format (YAML_FORMAT) not available.')
            # The stream is opened in binary mode, so ask PyYAML to encode.
            yaml.dump(obj, stream=f, encoding='utf-8')
        else:
            raise ValueError('Unknown format value.')
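
# A minimal deserialize() counterpart sketch mirroring the flag logic of
# serialize() above; the format constants and has_joblib/has_yaml guards are
# the same ones serialize() assumes, and joblib_load stands for joblib.load.
def deserialize(filename, format=DEFAULT_FORMAT):
    if format & JOBLIB_FORMAT:
        if not has_joblib:
            raise RuntimeError(
                'Missing library. Format (JOBLIB_FORMAT) not available.')
        # joblib detects compression from the file itself.
        return joblib_load(filename)
    open_fn = bz2.BZ2File if format & BZIP2_FORMAT else open
    with open_fn(filename, 'rb') as f:
        if format & PICKLE_FORMAT:
            return pickle.load(f)
        if format & YAML_FORMAT:
            if not has_yaml:
                raise RuntimeError(
                    'Missing library. Format (YAML_FORMAT) not available.')
            return yaml.safe_load(f)
        raise ValueError('Unknown format value.')
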
def write_graph_stats_neig_lists(G, inputs):
    out_comp_nm = inputs['dir_nm'] + inputs['out_comp_nm']
    n_nodes_net = nx.number_of_nodes(G)
    n_edges_net = nx.number_of_edges(G)
    with open(out_comp_nm + '_metrics.out', "a") as fid:
        print("No. of nodes in network = ", n_nodes_net, file=fid)
        print("No. of edges in network = ", n_edges_net, file=fid)
    myGraphdict = nx.to_dict_of_dicts(G)
    folNm = inputs['dir_nm'] + inputs['graph_files_dir'] + "/neig_dicts"
    if not os_path.exists(folNm):
        os_mkdir(folNm)
    neig_lens = []
    for node, val in myGraphdict.items():
        with open(folNm + "/" + node, 'wb') as f:
            joblib_dump(val, f)
        neig_lens.append(len(val))
    with open(out_comp_nm + '_metrics.out', "a") as fid:
        print("Max number of neighbors = ", max(neig_lens), file=fid)
        print("Avg number of neighbors = %.2f" % np_mean(neig_lens), file=fid)
    logging_info("Finished writing neighbor lists.")
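
# A minimal read-back sketch for the per-node neighbor files written above,
# assuming joblib_load is joblib.load and read_neig_list is a hypothetical
# name; each file holds one node's adjacency dict as produced by
# nx.to_dict_of_dicts.
def read_neig_list(folNm, node):
    with open(folNm + "/" + node, 'rb') as f:
        return joblib_load(f)
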
# The built-in round() does not accept an ndarray; use ndarray.round instead.
pred = clf.predict(X_test).round(2)
pred

# In[ ]:

savetxt(fname='pred.csv', X=pred, fmt='%f', delimiter=',')

# # Saving the model

# In[ ]:

joblib_dump(value=clf, filename='model.pkl', compress=9)
joblib_dump(value=enc_categ, filename='enc_categ.pkl', compress=9)
joblib_dump(value=enc_text, filename='enc_text.pkl', compress=9)

# In[ ]:

valid_sets = {
    'location': list(set(data_train[strCols[0]].value_counts(dropna=False).index)),
    'contract': list(set(data_train[strCols[1]].value_counts(dropna=False).index))
}
with open('valid_sets.json', 'w') as fp:
    dump(valid_sets, fp)
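
# In[ ]:

# A minimal serving-side sketch, assuming the artifacts saved above. It
# restores the model, both encoders, and the valid_sets.json whitelist; the
# feature-assembly step is elided because it depends on the encoders' APIs.
import json
from joblib import load as joblib_load

clf = joblib_load('model.pkl')
enc_categ = joblib_load('enc_categ.pkl')
enc_text = joblib_load('enc_text.pkl')
with open('valid_sets.json') as fp:
    valid_sets = json.load(fp)
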
def dump_model_joblib(self, model, name):
    model_path = os.path.join(self.output_dir, name + ".joblib")
    joblib_dump(model, model_path)
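
# A minimal load counterpart sketch for dump_model_joblib(), assuming the same
# output_dir layout; load_model_joblib is a hypothetical name and joblib_load
# stands for joblib.load.
from joblib import load as joblib_load

def load_model_joblib(self, name):
    model_path = os.path.join(self.output_dir, name + ".joblib")
    return joblib_load(model_path)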