def which_load(training_paths, validation_paths):
    training_results = ['result', 'result1', 'result2', 'result3',
                        'result10', 'result11', 'result12', 'result13']
    validation_results = ['result4', 'result5', 'result6', 'result7',
                          'result14', 'result15', 'result16', 'result17']
    # pair each result name with its pickle path
    for result, pickle_path in zip(training_results, training_paths):
        load(result, pickle_path)
    for result, pickle_path in zip(validation_results, validation_paths):
        load(result, pickle_path)
    '''load('result', 'resized_training_set/circle.pkl')
    load('result1', 'resized_training_set/triangle.pkl')
    load('result2', 'resized_training_set/rectangle.pkl')
    load('result3', 'resized_training_set/square.pkl')'''

    # merge datasets
    train_data = (circle_dataset + triangle_dataset + rectangle_dataset + square_dataset +
                  circle_dataset1 + triangle_dataset1 + rectangle_dataset1 + square_dataset1)
    validation_data = (valid_circle_dataset + valid_triangle_dataset + valid_rectangle_dataset +
                       valid_square_dataset + valid_circle_dataset1 + valid_triangle_dataset1 +
                       valid_rectangle_dataset1 + valid_square_dataset1)

    pickle_file = 'data_shapes.pkl'
    try:
        with open(pickle_file, 'wb') as f:
            save = {'train_data': train_data, 'validation_data': validation_data}
            joblib.dump(save, f, compress=True)
    except Exception as e:
        print('Unable to save data to', pickle_file, ':', e)
        raise
def dump_schema(self, f):
    """ Dumps the current schema to a file with joblib """
    if not self.feature_schema:
        raise ValueError("schema is not present")
    joblib.dump(self.feature_schema, f)
def _split_and_dump(self, X, y, valid_X, valid_y):
    if not hasattr(self, '_dm'):
        raise ValueError("It should be called after the dump manager _dm is set")
    if self.resampling == 'cv':
        pass
    elif self.resampling == 'holdout':
        if not self._has_valid_data:
            data_size = y.shape[0]
            if data_size >= 100000:
                valid_ratio = 0.3
            elif 15000 <= data_size < 100000:
                valid_ratio = 0.2
            else:
                valid_ratio = 0.15
            valid_size = int(data_size * valid_ratio)
            X, valid_X = X[valid_size:], X[:valid_size]
            y, valid_y = y[valid_size:], y[:valid_size]
    else:
        raise NotImplementedError()
    pkl = {"resampling": self.resampling,
           "X": X, "y": y,
           "valid_X": valid_X, "valid_y": valid_y}
    datafile = os.path.join(self._dm.dir, "data.pkl")
    joblib.dump(pkl, datafile, protocol=-1)
    self._datafile = datafile
    return datafile
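# The holdout branch above chooses the validation fraction from the dataset size. A minimal
# standalone sketch of that rule, using the same thresholds as _split_and_dump (the helper
# name is hypothetical):
def holdout_ratio(n_samples):
    # larger datasets reserve a larger validation share
    if n_samples >= 100000:
        return 0.3
    elif n_samples >= 15000:
        return 0.2
    return 0.15

assert holdout_ratio(200000) == 0.3
assert holdout_ratio(20000) == 0.2
assert holdout_ratio(5000) == 0.15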
def save_params(self):
    """ Save the params to a pickle file, preserving the current state of the learning.

    :return:
    """
    joblib.dump([i.get_value() for i in self.params], 'data/network.pkl')
def TrainRandomForestVariance(p_subject, p_save):
    print("Welcome to TrainRandomForestVariance(" + p_subject + ", " + str(p_save) + ")")
    training_data_raw = pd.read_pickle(input_data_paths[p_subject])
    training_data = training_data_raw[["variance" in x or "classification" in x
                                       for x in training_data_raw.index]]

    # Ictal vs interictal
    forest_seizure = RandomForestClassifier(n_estimators=500, n_jobs=1, max_features="sqrt",
                                            max_depth=None, min_samples_split=1)
    y_seizure = [1 * (x > 0) for x in training_data.T["classification"]]
    forest_seizure.fit(training_data[:-1].T, y_seizure)

    # IctalA vs IctalB
    forest_early = RandomForestClassifier(n_estimators=500, n_jobs=1, max_features="sqrt",
                                          max_depth=None, min_samples_split=1)
    y_early = [1 * (x == 2) for x in training_data.T["classification"]]
    forest_early.fit(training_data[:-1].T, y_early)

    # Save models
    if p_save:
        saved_files = joblib.dump(forest_seizure, "RFV_" + p_subject + "_seizure.pkl")
        for saved_file in saved_files:
            os.system("mv " + saved_file + " /Users/dryu/Documents/DataScience/Seizures/data/models")
        saved_files = joblib.dump(forest_early, "RFV_" + p_subject + "_early.pkl")
        for saved_file in saved_files:
            os.system("mv " + saved_file + " /Users/dryu/Documents/DataScience/Seizures/data/models")

    return {"seizure": forest_seizure, "early": forest_early}
def token_matrix(outdir, data_generator, map_func):
    # transform token data into a document-term matrix
    vectorizer = CountVectorizer(tokenizer=lambda x: x.split(), lowercase=False)
    X = vectorizer.fit_transform(data_generator())

    # extract row indices for the train and test files
    train_df = pd.read_csv("data/train_v2.csv")
    test_df = pd.read_csv("data/sampleSubmission_v2.csv")
    train_idx = train_df["file"].apply(map_func).values
    test_idx = test_df["file"].apply(map_func).values

    # prepare X_train & X_test
    X_train, X_test = X[train_idx], X[test_idx]

    # create the output directory if it does not exist
    if not os.path.isdir(outdir):
        try:
            os.makedirs(outdir)
        except OSError as exception:
            if exception.errno != errno.EEXIST:
                raise

    # save matrices
    with open(os.path.join(outdir, "X_train.np"), "w") as fhandle:
        save_sparse_csr(fhandle, X_train)
    with open(os.path.join(outdir, "X_test.np"), "w") as fhandle:
        save_sparse_csr(fhandle, X_test)
    joblib.dump(vectorizer.vocabulary_, os.path.join(outdir, "vocabulary.pkl"))
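# Only vectorizer.vocabulary_ is persisted above, so at prediction time a matching vectorizer
# can presumably be rebuilt by passing the saved vocabulary back to CountVectorizer. A sketch,
# assuming outdir points at the directory written by token_matrix:
import os
import joblib
from sklearn.feature_extraction.text import CountVectorizer

outdir = "features"  # hypothetical output directory
vocabulary = joblib.load(os.path.join(outdir, "vocabulary.pkl"))
# a vectorizer with a fixed vocabulary produces columns in the same order as at training time
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(), lowercase=False,
                             vocabulary=vocabulary)
X_new = vectorizer.transform(["some tokenized text"])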
def fit(self, dpacks, targets, nonfixed_pairs=None, cache=None):
    """
    Extract whatever models or other information from the data packs
    that is necessary to make the parser operational.

    Parameters
    ----------
    dpacks : data packs used to fit the attachment learner
    targets : corresponding attachment targets
    """
    cache_file = (cache.get('attach') if cache is not None else None)
    # load cached classifier, if it exists
    if cache_file is not None and fp.exists(cache_file):
        # print('\tload {}'.format(cache_file))
        self._learner_attach = joblib.load(cache_file)
        return self
    dpacks, targets = self.dzip(for_attachment, dpacks, targets)
    self._learner_attach.fit(dpacks, targets, nonfixed_pairs=nonfixed_pairs)
    # save classifier, if necessary
    if cache_file is not None:
        # print('\tsave {}'.format(cache_file))
        joblib.dump(self._learner_attach, cache_file)
    return self
def save_prediction(self, model_name, predictions, type_n):
    self._check_type_n(type_n)
    if on_cloud:
        joblib.dump(predictions, model_name + "_prediction_" + type_n, compress=5)
        cloud.bucket.put(model_name + "_prediction_" + type_n, prefix="prediction")
    else:
        joblib.dump(predictions,
                    path_join(self.prediction_dir, model_name + "_prediction_" + type_n),
                    compress=5)
def write_test_pickle(to_pickle, args):
    kwargs = {}
    compress = args.compress
    method = args.method
    joblib_version = get_joblib_version()
    py_version = '{0[0]}{0[1]}'.format(sys.version_info)
    numpy_version = ''.join(np.__version__.split('.')[:2])

    # The game here is to generate the right filename according to the options.
    body = '_compressed' if (compress and method == 'zlib') else ''
    if compress:
        if method == 'zlib':
            kwargs['compress'] = True
            extension = '.gz'
        else:
            kwargs['compress'] = (method, 3)
            extension = '.pkl.{0}'.format(method)
        if args.cache_size:
            kwargs['cache_size'] = 0
            body += '_cache_size'
    else:
        extension = '.pkl'

    pickle_filename = 'joblib_{0}{1}_pickle_py{2}_np{3}{4}'.format(
        joblib_version, body, py_version, numpy_version, extension)

    try:
        joblib.dump(to_pickle, pickle_filename, **kwargs)
    except Exception as e:
        # With old python versions (<= 3.3), we can arrive here when
        # dumping a compressed pickle with LzmaFile.
        print("Error: cannot generate file '{0}' with arguments '{1}'. "
              "Error was: {2}".format(pickle_filename, kwargs, e))
    else:
        print("File '{0}' generated successfully.".format(pickle_filename))
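# For illustration, the filename construction above can be traced by hand; the version strings
# below are made up and would normally come from the running environment:
joblib_version, body, py_version, numpy_version, extension = '0.13.2', '_compressed', '37', '116', '.gz'
pickle_filename = 'joblib_{0}{1}_pickle_py{2}_np{3}{4}'.format(
    joblib_version, body, py_version, numpy_version, extension)
assert pickle_filename == 'joblib_0.13.2_compressed_pickle_py37_np116.gz'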
def BOWtransform(corpus, mode, idx):
    data_matrix = []
    print('Transform data...')
    if mode == 'train':
        bow_transformer = BOWTransformer()
        data_matrix = bow_transformer.fit_transform(corpus)
        # save the fitted transform model
        jl.dump(bow_transformer, '{}/{}.model'.format(marcos.TRANSFORM_MODEL_DIR, idx))
    elif mode == 'test':
        bow_transformer = jl.load('{}/{}.model'.format(marcos.TRANSFORM_MODEL_DIR, idx))
        data_matrix = bow_transformer.transform(corpus)
    else:
        print("Unexpected mode in BOWtransform", file=sys.stderr)
        sys.exit()

    # turn the document-term matrix into a list
    print("The shape of dt matrix is {}\n".format(data_matrix.shape))
    if sp.sparse.isspmatrix_csr(data_matrix):
        data_matrix = data_matrix.toarray().tolist()
    else:
        # already dense, e.g. after passing through the dimension reduction pipe
        data_matrix = data_matrix.tolist()
    return data_matrix
def dimReduction(corpus, mode, idx):
    print("Dimension reduction...")
    # PCA needs a dense matrix, so densify a sparse corpus first
    if sp.sparse.isspmatrix_csr(corpus):
        corpus = corpus.toarray()
    data_matrix = []
    if mode == 'train':
        dim_reduc_pipe = marcos.DIMREDUC_PIPE
        dim_reduc_pipe.set_params(pca__n_components=1000)
        # bow_transformer = BOWTransformer()
        data_matrix = dim_reduc_pipe.fit_transform(corpus)
        # save the fitted dimension-reduction model
        jl.dump(dim_reduc_pipe, '{}/{}.model_reduc'.format(marcos.TRANSFORM_MODEL_DIR, idx))
    elif mode == 'test':
        dim_reduc_pipe = jl.load('{}/{}.model_reduc'.format(marcos.TRANSFORM_MODEL_DIR, idx))
        data_matrix = dim_reduc_pipe.transform(corpus)
    else:
        print("Unexpected mode in dimReduction", file=sys.stderr)
        sys.exit()

    # turn dt matrix to list
    print("The shape of dt matrix is {} (after dimension reduction)\n".format(data_matrix.shape))
    return data_matrix.tolist()
def train(self,descri,names): def distance_hist(point1, point2): return cv2.compareHist(np.array(point1,np.float32), np.array(point2,np.float32), cv2.cv.CV_COMP_BHATTACHARYYA) # unique, counts = np.unique(names, return_counts=True) # print dict(zip(unique, counts)) # R = zip(descri,names) # sorted_by_second = sorted(R, key=lambda tup: tup[1]) # descri = np.array(sorted_by_second)[:,0] # descri = np.array([D for D in descri]) # # D = distance_hist(descri[0],descri[0]) # Y = pdist(descri,'euclidean') # Y = squareform(Y) # Y = (Y/Y.max())*255 # # np.save("Matrix_NN_DL.npy",Y) # Size_block = 2 # Matri = np.zeros((Y.shape[0]*Size_block, Y.shape[1]*Size_block), np.float32) # for i in xrange(Y.shape[0]): # for j in xrange(Y.shape[1]): # Value = Y[i][j] # Matri[i * Size_block:(i + 1) * Size_block, j * Size_block:(j + 1) * Size_block] = Value # plt.imsave("Matriz_Distancias.jpg",Matri,cmap='hot') # plt.show() self.clf = NearestNeighbors(3) self.clf.fit(descri) self.names = names self.clases = np.unique(self.names) joblib.dump((self.clf, self.leaf_size, self.metric,self.names,self.clases), self.path, compress=3) return self.names
def run_gender():
    '''
    CAUTION!! Currently this script is set to run for age data distribution.
    '''
    c = IndexedContext()
    index_file = os.path.join(DATA, 'libsvm_files/gender/paper/train.index')
    input_file = os.path.join(DATA, 'annotation/gender/paper/gender_train.csv')
    output_file = os.path.join(DATA, 'libsvm_files/gender/paper/train.libsvm')
    c.processFile(input_file, output_file)
    joblib.dump(c.getIndexer(), index_file)
    # indexer = joblib.load(index_file)
    # c.setIndexer(indexer)
    c.freeze()

    input_file = os.path.join(DATA, 'annotation/gender/paper/gender_test.csv')
    output_file = os.path.join(DATA, 'libsvm_files/gender/paper/test.libsvm')
    c.processFile(input_file, output_file)
def verify_suff_stats(self, Dchunk, SS, lap):
    ''' Run-time checks to make sure the suff stats have expected values '''
    if self.savedir is not None:
        SSfile = os.path.join(self.savedir, 'SSdump-Lap%03d.dat' % (lap))
        if self.isLastBatch(lap):
            joblib.dump(SS, SSfile)
    if hasattr(Dchunk, 'nDocTotal') and Dchunk.nDocTotal < 4000:
        if self.hasMove('birth') and self.do_birth_at_lap(lap):
            if self.algParams['birth']['earlyLap'] > 0:
                pass
            elif lap < np.ceil(lap):
                assert SS.nDoc - Dchunk.nDocTotal > -0.001
            else:
                if abs(SS.nDoc - Dchunk.nDocTotal) > 0.01:
                    print("WARNING @ lap %.2f | SS.nDoc=%d, nDocTotal=%d"
                          % (lap, SS.nDoc, Dchunk.nDocTotal))
                assert abs(SS.nDoc - Dchunk.nDocTotal) < 0.01
        elif lap >= 1.0:
            assert abs(SS.nDoc - Dchunk.nDocTotal) < 0.01
    if hasattr(SS, 'N'):
        if not np.all(SS.N >= -1e-9):
            raise ValueError('N should be >= 0!')
        SS.N[SS.N < 0] = 0
def fit_with_params(params, X, firings, window_size, i):
    X = transform_data(X.as_matrix(), window_size)
    pid = os.getpid()
    print("fitting {}th iteration. PID: {}".format(i, pid))
    if params['e_n'] > params['e_w']:
        params['e_w'], params['e_n'] = params['e_n'], params['e_w']
    spk_aggr_func = params['spk_aggr_func']
    nrn_aggr_func = params['nrn_aggr_func']
    dist_metric = params['dist_metric']
    mgng_params = dict(params)
    del mgng_params['spk_aggr_func']
    del mgng_params['nrn_aggr_func']
    del mgng_params['dist_metric']
    try:
        estimator = mgng.MGNG(**mgng_params)
        estimator.fit(X)
        winner_units = estimator.transform(X)
        score = mgng.scorer(winner_units, window_size,
                            firings[firings.fire_idx < (len(winner_units) - window_size)],
                            spk_aggr_func, nrn_aggr_func, dist_metric)
        ret_val = score + (params, pid)
        pprint.pprint(ret_val)
        dump(winner_units, 'winner_units_{}.pickle'.format(pid), compress=3)
    except Exception as e:
        pprint.pprint(e)
        ret_val = (-np.infty, -np.infty, np.infty, params, pid)
    print("{}th iteration finished. PID: {}".format(i, pid))
    with open('hyperparam_opt_{}.log'.format(pid), 'ab') as fp:
        fp.write('{}\n'.format(pprint.pformat(ret_val)))
    return ret_val
def train(corpus_file, out_file, mode, dim_size, window, min_count, negative, epoch,
          pool_size, chunk_size):
    with bz2.BZ2File(corpus_file) as f:
        sentences = LineSentence(f)
        sg = int(mode == 'sg')
        model = Word2Vec(sentences, size=dim_size, window=window, min_count=min_count,
                         workers=pool_size, iter=epoch, negative=negative, sg=sg)

    # separate plain words from entity tokens marked with MARKER
    words = []
    entities = []
    for (w, _) in model.vocab.iteritems():
        if w.startswith(MARKER):
            entities.append(w[len(MARKER):].replace(u'_', u' '))
        else:
            words.append(w)

    vocab = Vocab(Trie(words), Trie(entities))
    word_embedding = np.zeros((len(words), dim_size), dtype=np.float32)
    entity_embedding = np.zeros((len(entities), dim_size), dtype=np.float32)

    for word in words:
        word_embedding[vocab.get_word_index(word)] = model[word]
    for entity in entities:
        entity_embedding[vocab.get_entity_index(entity)] = model[MARKER + entity.replace(u' ', u'_')]

    ret = dict(
        word_embedding=word_embedding,
        entity_embedding=entity_embedding,
        vocab=vocab,
    )
    joblib.dump(ret, out_file, compress=False)
def dumpNetwork(self, fname, nEpoch=-1):
    """ Dump the network

    Parameters
    ----------
    fname : string
        Name of the file where the network will be dumped
    nEpoch : int, optional
        Epoch number
    """
    try:
        os.mkdir("nnets")
    except Exception:
        pass
    basename = "nnets/" + fname
    for f in os.listdir("nnets/"):
        if fname in f:
            os.remove("nnets/" + f)
    all_params = self._network.getAllParams()
    if nEpoch >= 0:
        joblib.dump(all_params, basename + ".epoch={}".format(nEpoch))
    else:
        joblib.dump(all_params, basename, compress=True)
def transform(self, X, stride_size=1, save_to_file=None, memmap=False, force_rerun=False):
    """
    Expects X to be in the shape of (n, x, y, chan)
    """
    if not hasattr(self, 'centroids_'):
        raise RuntimeError("Model has not been fitted")
    if save_to_file is not None and os.path.exists(save_to_file) and not force_rerun:
        logger.info("File already exists, loading from {}".format(save_to_file))
        if memmap:
            res = joblib.load(save_to_file, mmap_mode='r+')
        else:
            res = joblib.load(save_to_file)
    else:
        all_rows = range(X.shape[0])
        chunked_rows = list(chunks(all_rows, self.n_jobs))
        logger.info("Transforming in {} jobs, chunk sizes: {}".format(
            self.n_jobs, [len(x) for x in chunked_rows]))
        res = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(chunked_extract_features)(i, X, self.rf_size, self.centroids_, self.mean_,
                                              self.p_, True, stride_size, self.pool_method)
            for i in chunked_rows
        )
        res = np.vstack(res)
        if save_to_file is not None:
            logger.info("Saving results to file {}".format(save_to_file))
            joblib.dump(res, save_to_file)
            if memmap:
                res = joblib.load(save_to_file, mmap_mode='r+')
    return res
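# A possible call pattern for the caching/memmap path above; the array shape, file name, and
# the already-fitted `model` instance are assumptions for illustration only:
import numpy as np

X = np.random.rand(100, 32, 32, 3)  # (n, x, y, chan), as the docstring expects
feats = model.transform(X, stride_size=2,
                        save_to_file="features.pkl",  # computed once, reloaded on later calls
                        memmap=True)                  # reopened as a read/write memmap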
def save_model(self, model_dir):
    """ Save the model to `model_dir`

    Parameters
    ----------
    model_dir: str, location where the model is saved
    """
    if os.path.isdir(model_dir):
        raise Exception('Folder already exists')
    else:
        os.mkdir(model_dir)
    # We clone the instance but do not clone the leaves since we will save them separately
    new_hkmnn_model = HKMNearestNeighbor(self.branching_factor, self.max_depth,
                                         self.leaf_size, self.batch_size, self.verbose)
    new_hkmnn_model.root = self._recursive_save(self.root, 0, [0] * self.max_depth, model_dir)
    # save skeleton
    file_name = os.path.join(model_dir, 'skeleton.pickle')
    joblib.dump(new_hkmnn_model, file_name, protocol=2)
def get_corpora(lang, num_train=500000, num_test=10000, distributed=False):
    full_corpus = corpora.get_corpus(lang, word_boundaries=True)
    # A list of (phoneme, precedes_boundary) tuples.
    phones_and_boundaries = extract_boundaries(full_corpus)
    # Divide into train and test.
    train, test = corpora.train_test_split(phones_and_boundaries, num_train, num_test, mode='end')
    # Separate phones from boundary markers.
    train_phones, _ = map(list, zip(*train))
    test_phones, test_bounds = map(list, zip(*test))
    joblib.dump(test_bounds, lang + '_bounds.pkl')
    # NOTE: this early return short-circuits the function; everything below is currently skipped.
    return
    # Construct targets and encode phonemes.
    train_in, train_out = prepare(train_phones, distributed)
    test_in, test_out = prepare(test_phones, distributed)
    # Remove the trailing bound to match test_out.
    del test_bounds[-1]
    assert len(train_in) == len(train_out)
    assert len(test_in) == len(test_out) == len(test_bounds)
    return (train_in, train_out), (test_in, test_out), test_bounds
def save_database(self, file_path):
    """Saves the current data to disk

    Keyword Arguments:
        file_path (str) -- Path where you wish to save the file; use an extension like .db
    """
    dump(self, file_path, True)
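# The saved database can presumably be restored with joblib's load; a sketch with a
# hypothetical file path:
from joblib import load

db = load("session.db")  # returns the instance that was passed to dump()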
def save_state(self, state_dict, itr=None):
    """
    Saves the state of an experiment.

    To be clear: this is about saving *state*, not logging diagnostics.
    All diagnostic logging is separate from this function. This function
    will save whatever is in ``state_dict``---usually just a copy of the
    environment---and the most recent parameters for the model you
    previously set up saving for with ``setup_tf_saver``.

    Call with any frequency you prefer. If you only want to maintain a
    single state and overwrite it at each call with the most recent
    version, leave ``itr=None``. If you want to keep all of the states you
    save, provide unique (increasing) values for 'itr'.

    Args:
        state_dict (dict): Dictionary containing essential elements to
            describe the current state of training.

        itr: An int, or None. Current iteration of training.
    """
    if proc_id() == 0:
        fname = 'vars.pkl' if itr is None else 'vars%d.pkl' % itr
        try:
            joblib.dump(state_dict, osp.join(self.output_dir, fname))
        except:
            self.log('Warning: could not pickle state_dict.', color='red')
        if hasattr(self, 'tf_saver_elements'):
            self._tf_simple_save(itr)
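# A hedged usage sketch of the two `itr` conventions described in the docstring; `logger`,
# `env`, `n_epochs` and `train_one_epoch` are placeholders, not part of the original code:
logger.save_state({'env': env}, itr=None)       # single snapshot, overwritten -> vars.pkl

for epoch in range(n_epochs):                   # one snapshot per epoch -> vars0.pkl, vars1.pkl, ...
    train_one_epoch()
    logger.save_state({'env': env}, itr=epoch)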
def get_mult_runs_data(design_doc, view_names, x_npy_file, y_npy_file):
    cb = Couchbase.connect(bucket=bucket_name, host=host_name)
    # placeholder row, removed again once the real rows have been appended
    x = [[0, 0, 0, 0, 0, 0, 0, 0]]
    for view_name in view_names:
        rows = cb.query(design_doc, view_name)
        count = 0
        for row in rows:
            x.append([row.value[1]['thread_alloc_count'],
                      row.value[1]['proc_count'],
                      row.value[1]['thread_alloc_size'],
                      row.value[2]['mem_free'],
                      row.value[2]['native_allocated_heap'],
                      row.value[2]['native_free_heap'],
                      row.value[2]['mem_total'],
                      row.value[2]['native_heap'],
                      row.value[3]['global_class_init'],
                      row.value[3]['classes_loaded'],
                      row.value[3]['total_methods_invoc'],
                      row.value[4]['total_tx'],
                      row.value[4]['total_rx']])
            count = count + 1
        print(view_name + ' count: ' + str(count))
    x.remove([0, 0, 0, 0, 0, 0, 0, 0])
    joblib.dump(x, x_npy_file)
def process_single_traj(fn, topology, stride, outdir, featurizers):
    traj = None

    def load():
        with timing('loading %s' % fn):
            t = md.load(fn, stride=stride, top=topology)
        print('Number of frames: %d' % t.n_frames)
        return t

    for f in featurizers:
        featurizer = f['featurizer']
        outfile = construct_outfile(fn, f['suffix'], outdir)
        if os.path.exists(outfile):
            print('Skipping %s. File exists' % outfile, file=sys.stderr)
            continue
        if traj is None:
            traj = load()
        with timing('featurizing (%s)' % featurizer.__class__.__name__):
            X = featurizer.partial_transform(traj)
        with timing('dumping to %s' % outfile):
            dump(X, outfile, compress=0)

    if traj is None:
        print(' == Completely skipped: %s ==' % fn, file=sys.stderr)
def save_info(self, combos):
    """Save information about the sowed cases. """
    # If saving a Harvester or Runner, strip out function information so
    # as just to use pickle.
    if self.harvester is not None:
        harvester_copy = copy.deepcopy(self.harvester)
        harvester_copy.runner.fn = None
        hrvstr_pkl = pickle.dumps(harvester_copy)
        runner_pkl = None
    elif self.runner is not None:
        hrvstr_pkl = None
        runner_copy = copy.deepcopy(self.runner)
        runner_copy.fn = None
        runner_pkl = pickle.dumps(runner_copy)
    else:
        hrvstr_pkl = None
        runner_pkl = None
    joblib.dump({
        'combos': combos,
        'batchsize': self.batchsize,
        'num_batches': self.num_batches,
        '_batch_remainder': self._batch_remainder,
        'harvester': hrvstr_pkl,
        'runner': runner_pkl,
    }, os.path.join(self.location, INFO_NM))
def create_stacked_features(input_mel_file_name, output_examples_file_name):
    # Load the precomputed mel features.
    mel = joblib.load(input_mel_file_name)
    examples, labels, parameters = mel_to_example(mel, 'dummy_label')
    # Using compress=1 to make sure it is stored as one file.
    joblib.dump((examples, parameters), output_examples_file_name, compress=1)
def __init__(self, *args, **kwargs):
    super(score_locality_hash, self).__init__(*args, **kwargs)
    self.f_params = os.path.join(
        kwargs["output_data_directory"],
        "locality_hash_params.pkl")
    params = self.load_params(**kwargs)

    # Build the hash function lookup
    dim = self.M.syn0.shape[1]
    n_bits = int(kwargs['locality_n_bits'])
    alpha = float(kwargs['locality_alpha'])
    R = RBP_hasher(dim, n_bits, alpha)

    # We assume that all locality hashes will be the same, so save these params to disk
    for key in ['dim', 'projection_count']:
        if key not in params:
            continue
        print("Checking if locality_hash({}) {}=={}".format(key, R.params[key], params[key]))
        if R.params[key] != params[key]:
            msg = ("\nLocality-hash config value of {} does not match from {} to {}."
                   "\nDelete {} to continue.")
            raise ValueError(msg.format(key, R.params[key], params[key], self.f_params))

    if 'normals' in params:
        print("Loading locality hash from {}".format(self.f_params))
        R.load(params)
    else:
        joblib.dump(R.params, self.f_params)

    self.RBP_hash = R
    self.WORD_HASH = {}
    for w, v in zip(self.M.index2word, self.M.syn0):
        self.WORD_HASH[w] = self.RBP_hash(v)
def train(path): name = os.path.splitext(os.path.basename(path))[0] print('Processing: ', name) features = pd.read_csv(path, index_col=None) selected_features_names = [name for name, desc in selected_features] features = features[selected_features_names] split_idx = 1200 features = features.drop(['sound.files'], axis=1) noise_only_df, df = features.iloc[:split_idx], features.iloc[split_idx:] y = df.pop('petrel') X = df.values y_noise = noise_only_df.pop('petrel') X_noise = noise_only_df.values X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25, random_state=42, stratify=y) hyperparams = { 'n_estimators': [100, 300, 500, 1000], 'learning_rate': [0.1], 'gamma': [0.0, 0.5], 'max_depth': [2, 3, 4], 'min_child_weight': [1, 2], 'subsample': [1.0, 0.8], 'reg_alpha': [0.0, 0.1], 'reg_lambda': [1, 2, 3] } # # hyperparams = { # 'n_estimators': [100], # 'learning_rate': [0.1], # 'gamma': [0.0], # 'max_depth': [2], # 'min_child_weight': [1], # 'subsample': [1.0], # 'reg_alpha': [0.0], # 'reg_lambda': [1] # } clf = model_selection.GridSearchCV(estimator=xg.XGBClassifier(objective='binary:logistic', n_jobs=-1), param_grid=hyperparams, cv=4) fit_params = clf.fit(X_train, y_train) estimator = fit_params.best_estimator_ joblib.dump(estimator, name + '_model.pkl') test_pred = estimator.predict(X_test) metrics = calculate_metrics(test_pred, y_test) noise_pred = estimator.predict(X_noise) noise_detection_accuracy = accuracy_score(y_noise, noise_pred) experiment = Experiment(api_key="4PdGdUZmGf6P8QsMa5F2zB4Ui", project_name="storm petrels", workspace="tracewsl") experiment.set_name(name) experiment.log_parameter('name', name) experiment.log_multiple_params(fit_params.best_params_) experiment.log_multiple_metrics(metrics) experiment.log_metric('Noise detection accuracy', noise_detection_accuracy) experiment.log_figure('Confusion matrix', get_confusion_matrix_figure(test_pred, y_test)) experiment.log_figure('Feature importnace', get_feature_importance_figure(estimator, list(df.columns.values)))
def motionEstTSS(curI, nextI, blockSize, stepSize, shiftSize):
    """
    Computes motion vectors using the 3-step search method

    Input:
        curI: The image for which we want to find motion vectors
        nextI: The reference image
        blockSize: size of the matching block
        stepSize: initial search step size
        shiftSize: shift between neighbouring blocks
    Output:
        velX, velY: the motion vectors for each direction
    """
    # check that the two images have the same size
    if nextI.shape != curI.shape:
        print("Two images do not have the same size")
        return [], []

    # file paths for temporary files used by the parallel computation
    folder = tempfile.mkdtemp()
    curI_path = os.path.join(folder, 'curI')
    nextI_path = os.path.join(folder, 'nextI')
    velX_path = os.path.join(folder, 'velX')
    velY_path = os.path.join(folder, 'velY')

    # get pre-defined sizes
    height, width = curI.shape
    block_r = blockSize // 2
    velSize = ((height + 1 - 2 * block_r) // shiftSize,
               (width + 1 - 2 * block_r) // shiftSize)

    # get the number of system cores
    num_cores = multiprocessing.cpu_count()

    # Pre-allocate writeable shared memory maps as containers for the
    # resulting motion vectors of the parallel computation
    velX = np.memmap(velX_path, dtype=np.int32, shape=velSize, mode='w+')
    velY = np.memmap(velY_path, dtype=np.int32, shape=velSize, mode='w+')

    # Dump the input images to disk to free the memory
    dump(curI, curI_path)
    dump(nextI, nextI_path)

    # Release the reference on the original in-memory arrays and replace them
    # by references to the memmap arrays so that the garbage collector can
    # release the memory before forking. gc.collect() is internally called
    # in Parallel just before forking.
    curI = load(curI_path, mmap_mode='r')
    nextI = load(nextI_path, mmap_mode='r')

    # Fork the worker processes to perform motion vector computation concurrently
    Parallel(n_jobs=num_cores)(delayed(estTSS)(curI, nextI, velX, velY, i, j, block_r,
                                               stepSize, shiftSize, height, width)
                               for i in range(velSize[0])
                               for j in range(velSize[1]))

    # try:
    #     shutil.rmtree(folder)
    # except:
    #     print("Failed to delete: " + folder)

    return velX, velY
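# The dump / load-as-memmap / Parallel pattern used above can be shown in isolation. A minimal
# sketch with a made-up array size and worker function:
import os
import tempfile
import numpy as np
from joblib import Parallel, delayed, dump, load

def row_sum(arr, out, i):
    # each worker reads the shared read-only input and writes one slot of the shared output
    out[i] = arr[i].sum()

folder = tempfile.mkdtemp()
data = np.random.rand(8, 1000)

# dump the input once, then reopen it as a read-only memmap shared by all workers
data_path = os.path.join(folder, 'data')
dump(data, data_path)
data = load(data_path, mmap_mode='r')

# pre-allocate a writeable memmap for the results
out = np.memmap(os.path.join(folder, 'out'), dtype=np.float64, shape=(8,), mode='w+')

Parallel(n_jobs=2)(delayed(row_sum)(data, out, i) for i in range(8))
print(np.asarray(out))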
def save_inverted_index(self, inverted_index, sub_folder, base_path=None):
    if not os.path.exists(os.path.join(self.cache_path_, sub_folder)):
        os.makedirs(os.path.join(self.cache_path_, sub_folder))
    if base_path is None:
        base_path = self.cache_path_
    joblib.dump(inverted_index,
                os.path.join(base_path, sub_folder, 'inverted_index.joblib'),
                compress=3)
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

## Applying Linear Regression Model
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)

## Saving the model
from joblib import dump, load
dump(model, 'Boston.joblib')

## Using the model
from joblib import dump, load
import numpy as np
model = load('Boston.joblib')
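# To predict with the reloaded model, a new observation has to go through the same scaler that
# was fitted on the training data. A sketch that reuses one raw row purely for illustration and
# assumes `sc` and `X` from the script above are still in scope (in a fresh process the scaler
# would need to be saved and reloaded as well):
new_sample = X[:1]
new_sample_scaled = sc.transform(new_sample)
print(model.predict(new_sample_scaled))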
mod = mt.generate_symbolic_model(T, V, ttheta, [0, 0, tau1])

# state-space model, partially linearized
mod.calc_coll_part_lin_state_eq(simplify=True)
x_dot = mod.ff + mod.gg * qddot1

# adapt the state definition and save the state-space model
replacements = {
    'Matrix': 'sp.Matrix',
    'sin': 'sp.sin',
    'cos': 'sp.cos',
    'q1': 'x1',
    'qdot1': 'x2',
    'qddot1': 'u1',
    'p1': 'x3',
    'pdot1': 'x4',
    'p2': 'x5',
    'pdot2': 'x6'
}


def str_replace_all(string, replacements):
    for (key, val) in replacements.items():
        string = string.replace(key, val)
    return string


x_dot = sp.Matrix([x_dot[2], x_dot[5], x_dot[0], x_dot[3], x_dot[1], x_dot[4]])
x_dot_str = str_replace_all(str(x_dot), replacements)
dump({'x_dot_str': x_dot_str}, 'examples/double_pend_cart_pl.str')
# X_train = train_images.reshape(train_images.shape[0], train_images.shape[1]*train_images.shape[2])/255
# Since the shape of the samples is already known, the values can be written directly
X_train = train_images.reshape(60000, 28*28)/255
y_train = train_labels
X_test = test_images.reshape(10000, 28*28)/255
y_test = test_labels

# To speed up training, only about 10% of the samples are used for this demo
X_train_lite = X_train[0:5999, :]
y_train_lite = y_train[0:5999]
X_test_lite = X_test[0:999, :]
y_test_lite = y_test[0:999]

# TODO: 3. Train the MLP neural network and output prediction results
start = time.time()
print('Training the model, please wait...', end='')
mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=[100, 100],
                    activation='relu', alpha=1e-5, random_state=62)
mlp.fit(X_train_lite, y_train_lite)

# Save the MLP neural network model
ModelPath = os.path.join(os.getcwd(), 'Models', 'Ch08MNIST_lbfgs.pkl')
joblib.dump(mlp, ModelPath)

print('done in {:.2f}s.'.format(time.time() - start))
print('Training set score: {:.4f}, test set score: {:.4f}'.format(
    mlp.score(X_train, y_train), mlp.score(X_test, y_test)))
os.chdir("C:/Users/Rahul/Desktop/edwisor") import importing x = importing.trn_term_doc label_cols = importing.label_cols train_data=importing.train_data def pr(y_i, y): p = x[y==y_i].sum(0) return (p+1) / ((y==y_i).sum()+1) def get_mdl(y): y = y.values r = np.log(pr(1,y) / pr(0,y)) m = LogisticRegression(C=4, dual=True) x_nb = x.multiply(r) return m.fit(x_nb, y), r dict1={} for i, j in enumerate(label_cols): print('fit', j) m,r = get_mdl(train_data[j]) dict1.update({j:[m,r]}) joblib.dump(dict1,'diction.pkl')
# hyperparameters
n_quantiles = [10]
output_distribution = ['normal']
penalty = ['l1']
C = np.logspace(-4, 4, 20)

# parameter grid
param_grid = {
    'qt__n_quantiles': n_quantiles,
    'qt__output_distribution': output_distribution,
    'clf__penalty': penalty,
    'clf__solver': ['saga'],
    'clf__C': C,
    'clf__max_iter': [1000]
}

clf_grid = GridSearchCV(pipeline, param_grid=param_grid, cv=gkf, scoring=scoring,
                        refit=False, verbose=2, n_jobs=-1)
search = clf_grid.fit(X, y)

dump(search,
     'models/logreg_gridsearch_pipeline_ALL_ach-at-hex_' + args.window_size + '_' +
     args.n_significant + '_' + args.n_classes + '_.joblib')
def save(save_path):
    ps = sess.run(params)
    joblib.dump(ps, save_path)
y_test = data_test["v"].copy().values X_val = test.drop("v",axis=1).values y_val = test["v"].copy().values scaler_x = MinMaxScaler() scaler_y = MinMaxScaler() scaler_x.fit(X_train) X_train = scaler_x.transform(X_train) X_test = scaler_x.transform(X_test) scaler_y.fit(y_train.reshape(-1,1)) y_train = scaler_y.transform(y_train.reshape(-1,1)) y_test = scaler_y.transform(y_test.reshape(-1,1)) from ANFIS import EVOLUTIONARY_ANFIS E_Anfis = EVOLUTIONARY_ANFIS(functions=3,generations=500,offsprings=10, mutationRate=0.2,learningRate=0.2,chance=0.7,ruleComb="simple") bestParam, bestModel = E_Anfis.fit(X_train,y_train,optimize_test_data=False) bestParam, bestModel = E_Anfis.fit(X_train,y_train,X_test,y_test,optimize_test_data=True) import joblib joblib.dump(bestParam,'bestParam.joblib') joblib.dump(bestModel,'bestModel.joblib')
train_x, train_y = pkl.load(f)

param_grid = {
    'max_depth': [6, 10, 15, 20],
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
    'gamma': [0, 0.25, 0.5, 1.0],
    'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
    'n_estimators': [100]
}

gsearch1 = RandomizedSearchCV(estimator=xgboost.XGBRegressor(),
                              param_distributions=param_grid,
                              verbose=3,
                              scoring='neg_mean_squared_error',
                              cv=3, n_iter=100, random_state=42, n_jobs=-1)
gsearch1.fit(train_x, train_y)

print('best params')
print(gsearch1.best_params_)
print('best score')
print(gsearch1.best_score_)

# save the best estimator found by the random search
dump(gsearch1.best_estimator_, 'xgboost.model')
#                  Predicted
#                  Negative  Positive
# Actual Negative     TN        FP
#        Positive     FN        TP
# print(confusion_matrix(predict_label, result_predict, labels=['useless', 'useful']))
# print(classification_report(predict_label, result_predict))

# estimator = clf.estimators_[3]
# dot_data = tree.export_graphviz(
#     estimator,
#     class_names=["useful", "useless"],
#     feature_names=["num_commits_open", "lines_modified_open", "files_modified_open",
#                    "commits_on_files_touched", "branch_hotness"],
#     filled=True,
#     rounded=True,
#     out_file=None
# )
# graph = pdp.graph_from_dot_data(dot_data)
# graph.write_png("bootstrap_tree.png")

# Average number of times each data point was predicted correctly
ave_list = [n / loop_count for n in useful_match]
predict_data['hyouka_1'] = np.array(ave_list)
predict_data['label'] = df_predict['useful']
print(predict_data)
joblib.dump(predict_data, f'scripts/result/{project}.pkl')
max(hmm['val_grapheme_root_categorical_accuracy'])
max(hmm['val_vowel_diacritic_categorical_accuracy'])
max(hmm['val_consonant_diacritic_categorical_accuracy'])

train = pd.concat([
    pd.read_parquet('data/train_image_data_0.parquet'),
    pd.read_parquet('data/train_image_data_1.parquet'),
    pd.read_parquet('data/train_image_data_2.parquet'),
    pd.read_parquet('data/train_image_data_3.parquet')
]).set_index('image_id', drop=True)
train.head()

import cv2
import joblib

original_images = {}
for i, row in train.iterrows():
    if i in original_images.keys():
        continue
    image = 255 - row.values
    image = image.reshape(137, 236)
    image = image.astype(np.uint8)
    original_images[i] = image

len(original_images)
joblib.dump(original_images, 'data/original_images')
print('hej')
def Model(): finalDataSet = pd.read_csv("finalDataSet.csv") finalDataSet.set_index("time", inplace=True) # print(df.tail()) ... foreCastColumn = "close" # creating label foreCastOut = int(12) # prediction for next 12 hrs finalDataSet["label"] = finalDataSet[foreCastColumn].shift(-foreCastOut) ... X = np.array(finalDataSet.drop(["label"], axis=1)) y = np.array(finalDataSet["label"]) # normalize data X = preprocessing.scale(X) XforeCastOut = X[-foreCastOut:] X = X[:-foreCastOut] y = y[:-foreCastOut] ... # Split the data into train and test data set tscv = TimeSeriesSplit(n_splits=5) for train_index, test_index in tscv.split(X, y): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] ... # regression model Model = LassoLars(alpha=0.01).fit(X_train, y_train) # EN = ElasticNet(alpha = 0.0001, l1_ratio = 0.5, random_state = 0).fit(X_train, y_train) ... # cross validated accucary on train set scores = cross_val_score(Model, X_train, y_train, cv=tscv) print("Training Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) print("Intercept:", Model.intercept_) print("Slope:", Model.coef_[0]) ... # prediction on training trainPredict = Model.predict(X_train) r_squared = r2_score(y_train, trainPredict) mae = np.mean(abs(trainPredict - y_train)) rmse = np.sqrt(np.mean((trainPredict - y_train)**2)) rae = np.mean(abs(trainPredict - y_train)) / np.mean( abs(y_train - np.mean(y_train))) rse = np.mean((trainPredict - y_train)**2) / np.mean( (y_train - np.mean(y_train))**2) sumOfDf = DataFrame(index=[ "R-squared", "Mean Absolute Error", "Root Mean Squared Error", "Relative Absolute Error", "Relative Squared Error", ]) sumOfDf["Training metrics"] = [r_squared, mae, rmse, rae, rse] # prediction of test testPredict = Model.predict(X_test) r_squared = r2_score(y_test, testPredict) mae = np.mean(abs(testPredict - y_test)) rmse = np.sqrt(np.mean((testPredict - y_test)**2)) rae = np.mean(abs(testPredict - y_test)) / np.mean( abs(y_test - np.mean(y_test))) rse = np.mean((testPredict - y_test)**2) / np.mean( (y_test - np.mean(y_test))**2) sumOfDf["Validation metrics"] = [r_squared, mae, rmse, rae, rse] sumOfDf = sumOfDf.round(decimals=3) print(sumOfDf) # accuracy check ... # Save model to file in the current working directory fileName = "LLModel.pkl" joblib.dump(Model, fileName) # Load from file LLModel = joblib.load(fileName) # forecast future 12 hrs values foreCastFutureValues = DataFrame(LLModel.predict(XforeCastOut)) ... # assigning names to columns foreCastFutureValues.rename(columns={0: "Forecast"}, inplace=True) newDataframe = finalDataSet.tail(foreCastOut) newDataframe.reset_index(inplace=True) newDataframe = newDataframe.append( DataFrame({ "time": pd.date_range( start=newDataframe.time.iloc[-1], periods=(len(newDataframe) + 1), freq="H", closed="right", ) })) newDataframe.set_index("time", inplace=True) newDataframe = newDataframe.tail(foreCastOut) foreCastFutureValues.index = newDataframe.index foreCastFutureValues.reset_index(inplace=True) return foreCastFutureValues
nb = SVC()
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(vect, nb)
# print(pipe.steps)
pipe.fit(X.cutted_text, y)

# from sklearn.model_selection import cross_val_score
# print(cross_val_score(pipe, X.cutted_text, y, cv=20, scoring='accuracy').mean())

y_pred = pipe.predict(X_test.cutted_text)

import joblib
joblib.dump(pipe, "./model.joblib")

import pickle
pickle.dump(pipe, open("./model.pickle", 'wb'))

from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred))

from sklearn.metrics import f1_score
print(f1_score(y_test, y_pred))
def save_serialized(clf, filename_with_path):
    """ save the model to a file """
    storage_dir = os.path.dirname(filename_with_path)
    if storage_dir != "":
        os.makedirs(storage_dir, exist_ok=True)
    joblib.dump(clf, filename_with_path)
def train(self, tr_x, tr_y, va_x=None, va_y=None): # 乱数固定 ModelNN().set_tf_random_seed() # 出力ディレクトリ作成 os.makedirs(self.params["out_dir"], exist_ok=True) # データのセット・スケーリング validation = va_x is not None scaler = self.params["scaler"] # StandardScaler() scaler.fit(tr_x) tr_x = scaler.transform(tr_x) # ラベルone-hot化 tr_y = to_categorical(tr_y, num_classes=self.params["nb_classes"]) # モデル構築 self.build_model((tr_x.shape[1],)) hist = None if validation: va_x = scaler.transform(va_x) va_y = to_categorical(va_y, num_classes=self.params["nb_classes"]) cb = [] cb.append( ModelCheckpoint( filepath=os.path.join( self.params["out_dir"], f"best_val_loss_{self.run_fold_name}.h5" ), monitor="val_loss", save_best_only=True, # verbose=1, verbose=0, ) ) # cb.append(ModelCheckpoint(filepath=os.path.join(self.params["out_dir"], f"best_val_acc_{self.run_fold_name}.h5"), # monitor="val_acc", # save_best_only=True, # verbose=1, # mode="max", # ) # ) cb.append( EarlyStopping( monitor="val_loss", patience=self.params["patience"], verbose=1 ) ) hist = self.model.fit( tr_x, tr_y, epochs=self.params["nb_epoch"], batch_size=self.params["batch_size"], # verbose=2, verbose=0, validation_data=(va_x, va_y), callbacks=cb, ) else: hist = self.model.fit( tr_x, tr_y, epochs=self.params["nb_epoch"], batch_size=self.params["batch_size"], # verbose=2, verbose=0, ) # スケーラー保存 self.scaler = scaler joblib.dump( self.scaler, os.path.join(self.params["out_dir"], f"{self.run_fold_name}-scaler.pkl"), ) # history plot self.plot_hist_acc_loss(hist) return hist
def save(self, checkpoint_path: pathlib.Path) -> any:
    file_name = f"{self.name}.pth"
    dump(self.model, str(checkpoint_path / file_name))
    return file_name
import pandas as pd
from joblib import dump
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

answers_df = pd.read_excel('answers_base.xlsx')
queries_df = pd.read_excel('queries_base.xlsx')
queries_df = queries_df[['Текст вопроса', 'Номер связки\n']].dropna()
queries_train, queries_test = train_test_split(queries_df, test_size=0.3, random_state=0)

documents = answers_df['Текст вопросов'].append(queries_train['Текст вопроса'], ignore_index=True)
documents_prep = documents.apply(
    lambda x: ' '.join(preprocessing.preprocessing(x)))
documents_ner = documents.apply(lambda x: ' '.join(
    preprocessing.preprocessing(preprocessing.preprocess_ner(x))))

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents_prep)
vectorizer_ner = TfidfVectorizer()
X_ner = vectorizer_ner.fit_transform(documents_ner)

dump(X, 'text_representations/tfidf.pkl')
dump(vectorizer, 'text_representations/vectorizer.pkl')
dump(X_ner, 'text_representations/tfidf_ner.pkl')
dump(vectorizer_ner, 'text_representations/vectorizer_ner.pkl')
model.fit(x, y)
print("Training finished")

if __name__ == "__main__":
    print("Starting")
    solver = sys.argv[1]
    # client = Client(processes=False, threads_per_worker=4,
    #                 n_workers=2, memory_limit='3GB')
    # print(client)
    mlp = neural_network.MLPRegressor(
        hidden_layer_sizes=(16,),
        solver=solver,
        verbose=10,
        activation='relu',
        batch_size=32,
        learning_rate_init=0.01,  # works better
        tol=1e-3,
        early_stopping=False,
        epsilon=1e-4,
        n_iter_no_change=3)
    # mlp = load('mlp.joblib')

    initial_time = time.time()
    train(mlp, 'train_data.csv')
    end_time = time.time()
    print("Time training:")
    print(end_time - initial_time)

    print("Saving model")
    dt_now = datetime.datetime.now().isoformat()
    dump(mlp, 'mlp.joblib')
    print("Finished")
y = placement_coded.status

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=1)

# predict if a student is placed or not.
from sklearn.linear_model import LogisticRegression
# from sklearn import metrics
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
# y_pred = logreg.predict(X_test)
# print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

# Save your model
import joblib
joblib.dump(logreg, 'model.pkl')
print("Model dumped!")

# Load the model that you just saved
lr = joblib.load('model.pkl')

# Saving the data columns from training
model_columns = list(X_train.columns)
joblib.dump(model_columns, 'model_columns.pkl')
print("Models columns dumped!")
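# One reason to persist model_columns is so that incoming records can be aligned to the
# training columns at prediction time. A hedged sketch; the feature names and values in the
# record below are made up:
import pandas as pd
import joblib

lr = joblib.load('model.pkl')
model_columns = joblib.load('model_columns.pkl')

record = pd.DataFrame([{'ssc_p': 67.0, 'hsc_p': 91.0}])
# align to the training columns, filling anything missing with 0
query = record.reindex(columns=model_columns, fill_value=0)
prediction = lr.predict(query)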
def pre_training_data(is_scaler=True, is_categorical=False, bin_method='bins', is_1hot_categ=True): u""" return train, label, vali_train, vali_label,X_test,y_test type: ndarray """ # only test image3 X_train, X_val, y_train, y_val = None, None, None, None X_train, X_val, y_train, y_val = load_data_200(X_train, X_val, y_train, y_val, data_path = './Images') X_train, X_val, y_train, y_val = load_data_200(X_train, X_val, y_train, y_val, data_path = './Images2') X_train, X_val, y_train, y_val = load_data_200(X_train, X_val, y_train, y_val, data_path = './Images3') X_train, X_val, y_train, y_val = load_data_200(X_train, X_val, y_train, y_val, data_path = './Images4') X_train, X_val, y_train, y_val = load_data_200(X_train, X_val, y_train, y_val, data_path = './Images6') X_train, X_val, y_train, y_val = load_data_200(X_train, X_val, y_train, y_val, data_path = './Images7_high_range') # X_train, X_val, y_train, y_val = load_data_200(X_train, X_val, y_train, y_val, data_path = './Images8_high_range') X_train, X_val, y_train, y_val = load_data_200(X_train, X_val, y_train, y_val, data_path = './Images9_200_300') X_train, X_val, y_train, y_val = load_data_200(X_train, X_val, y_train, y_val, data_path = './Images5') # X_test, X_val, y_test_origin, y_val_origin = train_test_split(X_val, y_val_origin, test_size=0.5, shuffle=True) X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.6, shuffle=True) print("train shape", str(X_train.shape)) print("test shape",str(X_test.shape)) print("val shape",str(X_val.shape)) if is_scaler: # scaler = preprocessing.MaxAbsScaler() MaxAbsScaler scaler = preprocessing.MaxAbsScaler()#StandardScaler y_train = scaler.fit_transform(y_train.reshape(-1, 1)) y_train = y_train.flatten() joblib.dump(scaler, 'MaxAbsScaler.pkl') # scaler_val = preprocessing.MaxAbsScaler() y_val = scaler.transform(y_val.reshape(-1, 1)) y_val = y_val.flatten() y_test = scaler.transform(y_test.reshape(-1, 1)) y_test = y_test.flatten() # joblib.dump(scaler_val, 'MaxAbsScaler_vali.pkl') if is_categorical: y_test = np.array([processing_y(i, default_method=bin_method) for i in y_test]) y_train = np.array([processing_y(i, default_method=bin_method) for i in y_train]) y_val = np.array([processing_y(i, default_method=bin_method) for i in y_val]) is_categorical = False print(y_val) print(y_val.shape) if len(np.unique(y_train)) >= 2 and is_1hot_categ == True: is_categorical = True y_train = keras.utils.to_categorical(y_train) y_val = keras.utils.to_categorical(y_val) y_test = keras.utils.to_categorical(y_test) # print(y_test)### np.save('X_train.npy', X_train) np.save('X_val.npy', X_val) np.save('y_train.npy', y_train) np.save('y_val.npy', y_val) np.save('X_test.npy', X_test) np.save('y_test.npy', y_test) # print(X_train.shape)### # print(X_val.shape)### # print(X_test.shape)### # print(y_test)### # (train,label),(vali_train,vali_label) = CNN_Regression.load_data() train = X_train label = y_train vali_train = X_val vali_label = y_val return train, label, vali_train, vali_label,X_test,y_test
output, h = model(inputs, h)

# calculate loss
test_loss = criterion(output.squeeze(), labels.float())
test_losses.append(test_loss.item())

# convert output probabilities to predicted class (0 or 1)
pred = torch.round(output.squeeze())  # rounds to the nearest integer

# compare predictions to true label
correct_tensor = pred.eq(labels.float().view_as(pred))
correct = np.squeeze(correct_tensor.numpy())
num_correct += np.sum(correct)

# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct / len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

print(' S A V I N G M O D E L')
joblib.dump(model, 'model.pkl')
print('M O D E L S A V E D..............')
def build_model(dbname):
    mongodb = dbname
    ranktable = mongodb + 'rank'
    client = pymongo.MongoClient('localhost', 27017)
    dataname = client[mongodb]
    global table2
    len_dict = {}
    table2 = dataname[ranktable]
    global df
    df = pd.DataFrame(data=list(table2.find()))
    df = df.drop(columns=['_id', 'date', 'usernick', 'skuId'])
    df = df.loc[lambda df: df["content"] != "此用户未填写评价内容"]
    df = df.loc[lambda df: df["score"] != 3]
    df['sentiment'] = df['score'].apply(lambda x: 1 if x > 3 else 0)

    # balance the positive and negative classes by downsampling the larger one
    df_neg = df.loc[lambda df: df["sentiment"] == 0]
    df_pos = df.loc[lambda df: df["sentiment"] == 1]
    sample_size = min(df_neg.shape[0], df_pos.shape[0])
    if sample_size == df_neg.shape[0]:
        df_pos = df_pos.sample(n=sample_size, random_state=None)
    else:
        df_neg = df_neg.sample(n=sample_size, random_state=None)
    print('df_neg', df_neg.shape)
    print('df_pos', df_pos.shape)
    df = pd.concat([df_pos, df_neg])

    X = df[['content']]
    y = df.sentiment
    X['cutted_comment'] = X.content.apply(chinese_word_cut)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    # reduce the dimensionality of the text features
    stop_words_file = "stopwords.txt"
    stopwords = get_custom_stopwords(stop_words_file)
    max_df = 0.8
    min_df = 3
    vect = CountVectorizer(max_df=max_df, min_df=min_df,
                           token_pattern=u'(?u)\\b[^\\d\\W]\\w+\\b',
                           stop_words=frozenset(stopwords))
    term_matrix = pd.DataFrame(vect.fit_transform(X_train.cutted_comment).toarray(),
                               columns=vect.get_feature_names())

    nb = MultinomialNB()
    pipe = make_pipeline(vect, nb)
    cross_score = cross_val_score(pipe, X_train.cutted_comment, y_train,
                                  cv=5, scoring='accuracy').mean()
    print(f"==== cross-validated training accuracy: {cross_score} ====")

    # fit the model
    pipe.fit(X_train.cutted_comment, y_train)
    pipe.predict(X_test.cutted_comment)
    y_pred = pipe.predict(X_test.cutted_comment)
    model_acc = metrics.accuracy_score(y_test, y_pred)
    print(f"==== model test accuracy: {model_acc} ====")

    confusion_m = metrics.confusion_matrix(y_test, y_pred)
    print("======= confusion matrix ======")
    print(confusion_m)
    print("====================")
    print("Saving the model......")
    print("====================")
    model_file_name = dbname + '_trained_model.pkl'
    joblib.dump(pipe, model_file_name)
    print("Model saved")
def run_experiment(X, y, alphas, seed, method, data_name, proc_train, proc_unlab, cv, num_c, num_gamma, verbose, n_jobs): sensitives = np.unique(X[:, -1]) X_train, y_train, X_unlab, X_test, y_test = split_data( X, y, proc_train, proc_unlab, seed) if not isinstance(alphas, dict): alphas_dict = {} for s in sensitives: alphas_dict[s] = alphas else: alphas_dict = alphas SIGNATURE = '{}_{}_'.format(data_name, method) scaler = StandardScaler() scaler.fit(X_train[:, :-1]) X_train[:, :-1] = scaler.transform(X_train[:, :-1]) X_unlab[:, :-1] = scaler.transform(X_unlab[:, :-1]) X_test[:, :-1] = scaler.transform(X_test[:, :-1]) n_train, d = X_train.shape if n_train > d: dual = False else: dual = True methods = { "LR": LogisticRegression(solver='liblinear'), "L-SVC": CalibratedClassifierCV(LinearSVC(dual=dual)), "RF": RandomForestClassifier(), "RF+": RandomForestClassifier() # "RBF-SVC": SVC(probability=True), } Cs = np.logspace(-4, 4, num_c) gammas = np.logspace(-4, 4, num_gamma) pows = np.array([1, 15 / 16, 7 / 8, 3 / 4, 1 / 2, 1 / 4, 1 / 8, 1 / 16, 0]) ds = np.unique((d**pows).astype('int')) if method[-1] == "+": randomize = True postfix = "+" method = method[:-1] else: randomize = False postfix = "" # randomize = True if method[-1] == "+" else False parameters = { "LR": { "C": Cs }, "L-SVC": { "base_estimator__C": Cs }, "RF": { "max_features": ds }, # "RF+" : {"max_features" : ds} # "RBF-SVC" : {"C" : Cs, "gamma" : gammas} } key = method BASE_MODEL_SIGNATURE = "{}_{}{}_{}".format(data_name, method, postfix, seed) BASE_MODEL_PATH = 'results/models/{}.pkl'.format(BASE_MODEL_SIGNATURE) ''' Base model does not depend on alpha. Load if it exists, and fit it if not. ''' try: clf = joblib.load(BASE_MODEL_PATH) print('Model {} loaded'.format(BASE_MODEL_SIGNATURE)) except: print('Model {} not found. Fitting ...'.format(BASE_MODEL_SIGNATURE)) clf = GridSearchCV(methods[key], parameters[key], cv=cv, refit=True, verbose=verbose, n_jobs=n_jobs) clf.fit(X_train, y_train) joblib.dump(clf, BASE_MODEL_PATH) ''' Transformation step is cheap, we do not save it. ''' transformer = TransformDPAbstantion(clf, alphas=alphas_dict, randomize=randomize) transformer.fit(X_unlab) y_pred = transformer.predict(X_test) y_pred_unf = clf.predict(X_test) # For test data fairness_test = compute_dp(y_test, X_test[:, -1]) # print_report(fairness_test, 'Test') # For base method accuracy_base = risk(y_test, y_pred_unf, X_test[:, -1]) fairness_base = compute_dp(y_pred_unf, X_test[:, -1]) # print_report(accuracy_base, 'Base') # print_report(fairness_base, 'Base') # For our method accuracy_our = risk(y_test, y_pred, X_test[:, -1]) fairness_our = compute_dp(y_pred, X_test[:, -1]) reject_our = classififcation_rate(y_pred, X_test[:, -1]) # print_report(accuracy_our, 'Our') # print_report(fairness_our, 'Our') print_report(reject_our, 'Our') results = { 'test': fairness_test, 'base': { **accuracy_base, **fairness_base }, 'our': { **accuracy_our, **fairness_our, **reject_our } } return results
import sys
sys.path.insert(0, "/usr/local/lib/python2.7/site-packages")

import pandas as pd
from sklearn.neural_network import MLPClassifier
from joblib import dump, load
import numpy as np

df = pd.read_csv('../data/ensembled_data.csv')

X = df[['Convolutional NN', 'Random Forest', 'SVM', 'Dump']]
y = np.ravel(df[['Correct Line']])

model = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(3), random_state=1)
model.fit(X, y)

dump(model, '../data/nn_ensembled.joblib')
f1 = resreg.f1_score(y_test, y_pred, error_threshold=5, relevance_true=relevance_true,
                     relevance_pred=relevance_pred, relevance_threshold=0.5, k=1e4)
mse_bins = resreg.bin_performance(y_test, y_pred, bins, metric='MSE')

# Store performance results
r2_store.append(r2)
mse_store.append(mse)
mcc_store.append(mcc)
f1_store.append(f1)
mse_bins_store.append(mse_bins)

# Performance statistics
r2_mean, r2_std = np.mean(r2_store), np.std(r2_store)
mse_mean, mse_std = np.mean(mse_store), np.std(mse_store)
f1_mean, f1_std = np.mean(f1_store), np.std(f1_store)
mcc_mean, mcc_std = np.mean(mcc_store), np.std(mcc_store)
mse_bins_store = pd.DataFrame(mse_bins_store)
mse_bins_mean, mse_bins_std = np.mean(mse_bins_store, axis=0), np.std(mse_bins_store, axis=0)

# Combine all performance data and write to excel spreadsheet
means = [r2_mean, mse_mean, f1_mean, mcc_mean] + list(mse_bins_mean)
stds = [r2_std, mse_std, f1_std, mcc_std] + list(mse_bins_std)
store = [param] + means + stds

# Save performance results as a binary file (to be read and analyzed later)
joblib.dump(store, f'hpc/joblib_files/{strategy}_{2}.pkl')
# selecting a machine learning model
# estimator = BaggingClassifier(SVC(C=3, kernel='rbf', gamma='auto', probability=True,
#                                   class_weight=class_weights), n_jobs=-1, verbose=1)  # very slow
estimator = SVC(C=10, kernel='rbf', gamma='auto', probability=True,
                class_weight=class_weights)  # this is a bit slow
# estimator = RandomForestClassifier(n_estimators=100, class_weight=class_weights, n_jobs=-1)
# estimator = DecisionTreeClassifier(max_features=10, class_weight=class_weights)

# train the model and predict the test data
estimator.fit(X_train, y_train_int)
y_pred = estimator.predict(X_test)
dump(estimator, "estimator_dump")

# print some metrics
print("#### Evaluation #### \n")
print("confusion matrix:")
print(confusion_matrix(y_test_int, y_pred))
print("classification report: ")
print(classification_report(y_test_int, y_pred))
print("balanced_accuracy_score: ")
print(balanced_accuracy_score(y_test_int, y_pred))

# create ROC curve
# see https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
"release_extension", "outs_when_up", "pitch_type_CH", "was_3_2", "pitch_type_SI", "pitch_type_FT", "pitch_type_FC", "bat_score", "post_bat_score", "pitch_type_CU", "was_3_1", "pitch_type_FS", "was_1_1", "was_0_1", "was_2_1", "was_1_2", "was_2_2", "was_0_2" ] for item in to_pop: df.pop(item) y = df.pop("description").values y_lookup, y = np.unique(y, return_inverse=True) X = df.values print(df.columns) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, stratify=y, random_state=42) gbm = gb_classifier(subsample=0.7, learning_rate=0.1, max_depth=5, n_estimators=300, verbose=1) gbm.fit(X_train, y_train) print(gbm.score(X_test, y_test)) print(gbm.feature_importances_) dump(gbm, 'reduced_gbm.py.joblib')
# -*- encoding: utf-8 -*-
"""
8.8.5 Model persistence
"""

import joblib
from sklearn.datasets import load_wine
from sklearn.svm import SVC

X, y = load_wine(return_X_y=True)
svc = SVC()
svc.fit(X, y)

joblib.dump(svc, r'..\res\svc.m')  # persist the model
svc = joblib.load(r'..\res\svc.m')  # load the model
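# If file size matters, joblib can also compress the pickle; a small variation on the snippet
# above (the compressed file path is hypothetical):
joblib.dump(svc, r'..\res\svc_compressed.m', compress=3)  # smaller file, slightly slower I/O
svc = joblib.load(r'..\res\svc_compressed.m')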
# result = model.evals_result()
# print("eval's results :", result)

# r2 = r2_score(y_predict, y_test)
# print("r2 Score : %.2f%%" % (r2 * 100.0))
# print("r2 :", r2)

y_predict = model.predict(x_test)
acc = accuracy_score(y_predict, y_test)
print("acc : ", acc)

###################################################################################
# import pickle  # pickle is provided by the Python standard library
# pickle.dump(model, open("./model/xgb_save/cancer.pickle.data", "wb"))
# print("SAVED!!!!")
# Compare pickle and joblib: both are ways to save the model.

# from joblib import dump, load
import joblib
joblib.dump(model, "./model/xgb_save/cancer.joblib.data")
print("SAVED!!!!")

# Load the model again
# model2 = pickle.load(open("./model/xgb_save/cancer.pickle.data", "rb"))
model2 = joblib.load("./model/xgb_save/cancer.joblib.data")
print("LOADED!!!!")

y_predict = model2.predict(x_test)
acc = accuracy_score(y_predict, y_test)
print("acc : ", acc)
# If the accuracy computed after saving and reloading matches the one from before,
# the model was saved and restored correctly.
def ModelFit(): global best_model #contruct hyperparameter grid param_dist = {"max_depth": [3, 10, 20, 70, None], "max_features": [2, 10, 41, 80, 'sqrt'], "min_samples_split": sp_randint(2, 11), "min_samples_leaf": sp_randint(1, 11), #"bootstrap": [True, False], "criterion": ["gini", "entropy"], "n_estimators": [100, 300, 500, 800, 1000]} pprint(param_dist) #define random forest classifier function rf = RandomForestClassifier(random_state = 120) #search across 1000 randomized combinations in the above grid estimator = RandomizedSearchCV(estimator = rf, param_distributions = param_dist, n_iter = 1000, cv = 10, verbose = 10, random_state = 12, scoring = 'roc_auc', n_jobs = -1) #fit the model grid_result = estimator.fit(X_train, y_train) #find and define best estimator based on grid search best_model = grid_result.best_estimator_ print('\nbest_model:\n', best_model) #predict y based on test data y_pred = grid_result.predict(X_test) #accuracy score print('accuracy score:', accuracy_score(y_test, y_pred)) #confusion matrix tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel() print(tn,fp,fn,tp) #classification report print('\nclassification report:\n',classification_report(y_test, y_pred)) #AUC and ROC curve y_pred_prob = grid_result.predict_proba(X_test)[:,1] auc = roc_auc_score(y_test, y_pred_prob) print('auc:', auc) false_positive, true_positive, _ = roc_curve(y_test, y_pred_prob) font = {'fontname':'Helvetica'} plt.figure() plt.plot([0, 1], [0, 1], 'k--') plt.plot(false_positive, true_positive, color='black') plt.xlabel('False positive rate', **font) plt.ylabel('True positive rate', **font) plt.savefig('feces_roc.png', dpi=300) plt.show() # Save the model as a pickle in a file joblib.dump(grid_result, 'campy_rf_feces.pkl') #determine best features feature_importances = grid_result.best_estimator_.feature_importances_ column_names=list(feces) del column_names[-0] importance = pd.DataFrame(feature_importances, index=column_names, columns=["Importance"]) sort_importance = importance.sort_values(by=['Importance'], ascending = False) sort_column_names = sort_importance.index.values.tolist() mult = 100/(sort_importance['Importance'].iloc[0]) sort_imp_mult = sort_importance * mult top_imp = sort_imp_mult['Importance'].iloc[0:15].tolist() top_column_names = sort_column_names[0:15] top_column_names = ['AvgMaxGustSpeed1.6', 'AvgAverageHumidity1.7', 'AverageHumidityTwoDayBefore', 'AvgMaxGustSpeed1.3', 'AvgMaxGustSpeed1.5', 'AvgMinTemperature1.7', 'AvgMaxWindSpeed1.7', 'AvgMinHumidity1.4', 'AvgMaxHumidity1.3', 'AvgPrecipitation1.4', 'MaxGustSpeedOneDayBefore', 'AvgMaxGustSpeedS1.2', 'AvgMaxWindSpeed1.4', 'AvgAverageHumidity1.3', 'MaxGustSpeedTwoDayBefore'] plt.rcParams.update(plt.rcParamsDefault) y_ticks = np.arange(0, len(top_column_names)) fig, ax = plt.subplots() ax.barh(y_ticks, top_imp, color = "dimgray") ax.set_yticklabels(top_column_names, **font) ax.set_yticks(y_ticks) plt.xlabel('Relative Importance', **font) fig.tight_layout() plt.gca().invert_yaxis() plt.savefig('feces_var.png', dpi=300) plt.show() return
def given_saved_some_step(multiply_by, name, path):
    some_step1 = MultiplyByN(multiply_by=multiply_by)
    some_step1.name = name
    dump(some_step1, path)