def gen_top_k_group_by_model(group_by_field, click_weight=utils.click_weight, year='all'):
    """
    Generate and dump the group by model with top k hotel clusters
    :param group_by_field: group by field to generate the group with hotel cluster relevance scores
    :param click_weight: the weight for the clicks
    :param year: Year filter on training data
    :return: the top k group by model with respect to group_by_field
    """
    dump_path = utils.model_path + \
        '_'.join(['top', str(utils.k), 'cw', str(utils.click_weight),
                  'group', group_by_field, 'year', year]) + '.pkl'
    if os.path.exists(dump_path):
        print 'file: ' + dump_path + ' exists!'
        return
    source = train
    if year == '2013':
        source = train_2013
    elif year == '2014':
        source = train_2014
    agg = source.groupby([group_by_field, 'hotel_cluster'])['is_booking'].agg(['sum', 'count'])
    agg['count'] -= agg['sum']
    agg = agg.rename(columns={'sum': 'bookings', 'count': 'clicks'})
    agg['relevance'] = agg['bookings'] + click_weight * agg['clicks']  # the weighted sum of bookings count and clicks count
    agg.reset_index(inplace=True)
    top_clusters = agg.groupby([group_by_field]).apply(top_k_relevence)
    top_clusters = pd.DataFrame(top_clusters).rename(columns={0: 'hotel_cluster'})
    joblib.dump(top_clusters, dump_path)
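# A minimal sketch (not part of the original source) of how a model dumped by
# gen_top_k_group_by_model could be read back. It assumes `utils` and `joblib`
# are the same objects used by the training code above, and rebuilds the same
# dump_path from the same components.
def load_top_k_group_by_model(group_by_field, year='all'):
    dump_path = utils.model_path + \
        '_'.join(['top', str(utils.k), 'cw', str(utils.click_weight),
                  'group', group_by_field, 'year', year]) + '.pkl'
    return joblib.load(dump_path)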
def training_stage3(dftrain,dfvalid,cat1,i): fname = ddir + 'joblib/stage3_'+str(cat1)+ext df = dftrain[dftrain.Categorie1 == cat1].reset_index(drop=True) dfv = dfvalid[dfvalid.Categorie1 == cat1].reset_index(drop=True) labels = np.unique(df.Categorie3) if len(labels)==1: joblib.dump((labels,None,None),fname) scv = -1 sct = -1 print 'training',cat1,'\t\t(',i,') : N=',len(df),'K=',len(labels) print 'training',cat1,'\t\t(',i,') : training=',sct,'validation=',scv return (sct,scv) vec,X = vectorizer_stage3(df.txt) Y = df['Categorie3'].values cla = LogisticRegression(C=best_regularisation.get(cat1,100)) cla.fit(X,Y) labels = np.unique(df.Categorie3) sct = cla.score(X[:min(10000,len(df))],Y[:min(10000,len(df))]) if len(dfv)==0: scv = -1 else: Xv = vec.transform(dfv.txt) Yv = dfv['Categorie3'].values scv = cla.score(Xv,Yv) print 'training',cat1,'\t\t(',i,') : N=',len(df),'K=',len(labels) print 'training',cat1,'\t\t(',i,') : training=',sct,'validation=',scv joblib.dump((labels,vec,cla),fname) del vec,cla return (sct,scv)
def predict_test(self, clf, tag):
    np.random.seed(1919)
    if os.path.isdir('../model/' + tag) == False:
        os.mkdir('../model/' + tag)
        print "Dir made : " + str(datetime.datetime.now())
    print "Fit Started : " + str(datetime.datetime.now())
    clf.fit(self.X, self.y)
    print "Dump Started : " + str(datetime.datetime.now())
    joblib.dump(clf, '../model/' + tag + '/' + tag + '.pkl')
    print "Prediction Started : " + str(datetime.datetime.now())
    output_arr = clf.predict_proba(self.x_test)
    f = open("../data/output_" + str(tag), "w")
    f.write("id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9\n")
    i = 1
    for row in output_arr:
        row = map(str, row)
        f.write(str(i) + "," + str(",".join(row)) + "\n")
        i += 1
    f.close()
    print "ALL DONE : " + str(datetime.datetime.now())
def main():
    pos_features_path = '/home/retailyze/Downloads/INRIAPerson/checkb/cropped/svm/featuresPos160_60.npy'
    neg_features_path = '/home/retailyze/Downloads/INRIAPerson/checkb/cropped/svm/featuresNeg160_60.npy'
    saving_loc = '/home/retailyze/Downloads/INRIAPerson/checkb/cropped/svm/'
    pos_features = np.load(pos_features_path)[:, 0::3]
    neg_features = np.load(neg_features_path)[:, 0::3]
    train, val = prepare_features(pos_features, neg_features, True, saving_loc)
    del pos_features
    del neg_features
    clf = svm.SVC(kernel='rbf')
    logging.info('starts training')
    clf.fit(train[:, 1:], train[:, 0])
    del train
    logging.info('starts predicting')
    predicted = clf.predict(val[:, 1:])
    conf_mat = confusion_matrix(predicted, val[:, 0])
    acc = accuracy_score(val[:, 0], predicted)
    del val
    del predicted
    logging.info('Confusion matrix: %s' % conf_mat)
    logging.info('Accuracy: %s' % acc)
    logging.info('saving model')
    joblib.dump(clf, join(saving_loc, 'svm_rbf_scaled.pkl'))
def transform_data():
    from solaris.run import load_data
    from sklearn.externals import joblib

    data = load_data('data/data.pkl')
    kringing = PertubatedKriging()
    #kringing = PertubatedSpline()
    data['description'] = '%r: %r' % (kringing, kringing.est)
    print data['description']
    print('_' * 80)
    print(kringing)
    print
    for key in ['train', 'test']:
        print('_' * 80)
        print('transforming %s' % key)
        print
        X = data['X_%s' % key]
        X = kringing.fit_transform(X)
        data['X_%s' % key] = X
    print
    print('dumping data')
    joblib.dump(data, 'data/interp10_data.pkl')
    IPython.embed()
def train(self): with gzip.open(constants.TRAIN_EXPANDED, 'r') as source: reader = csv.reader(source) next(reader, None) n_sample = 0 labels = [] features = [] for feature_vector in reader: s_features = feature_vector[2:6] + feature_vector[7:] s_label = int(feature_vector[1]) features.append(s_features) labels.append(s_label) # print 'features', s_features # print 'labels', s_label # print 'norm features', normalized_features n_sample += 1 if n_sample % 500000 == 0: self.clf.partial_fit(features, labels) features = [] labels = [] print 'Processing sample [%s]' % n_sample print 'Finished training' print 'Estimated parameters [%s]' % self.clf.get_params() # saving model into file joblib.dump(self.clf, constants.MODEL_FILENAME, compress=9)
def tuning_xgbst(X, y): clf = xgb.XGBClassifier(n_estimators=10000, scale_pos_weight=1.0, #1400.0/13458.0, max_depth=6, objective='binary:logistic', learning_rate=0.02, gamma=0.1, min_child_weight=3, max_delta_step=0, subsample=0.7, colsample_bytree=0.4, colsample_bylevel=1.0, reg_alpha=0, reg_lambda=3000, seed=0, nthread=-1) print clf.get_params() # skf = StratifiedShuffleSplit(y, n_iter=5, test_size=0.25, random_state=0) skf = StratifiedKFold(y, n_folds=3, random_state=0) fold = 1 for train_index, val_index in skf: print 'fold ', fold X_train, X_val = X[train_index], X[val_index] y_train, y_val = y[train_index], y[val_index] # eval_metric use the parameters in XGBoost doc clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], eval_metric='auc', early_stopping_rounds=1000, verbose=False) print "best_score", clf.best_score joblib.dump(clf, './models/xgbst/CV_' + str(fold) + '.model') fold += 1
def train_svm(feedback, classes):
    print "Building n-grams"
    # convert the strings to bag-of-words form, using bi-grams
    X_train_counts = count_vect.fit_transform(feedback)
    # weight the words from the bag-of-words form
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    '''
    The vocabulary used here on the training set needs to be saved for
    classification: the set of words seen during training will differ from the
    set of words seen at classification time, but the mapping must stay the
    same. In other words, if the word "animal" is associated with the integer 3
    during training, it has to be associated with 3 again during classification.
    '''
    pickle.dump(count_vect.vocabulary_, open("feature.pkl", "wb"))  # Saving vocab
    print "Saving words features"
    c = svm.SVC(kernel='rbf', gamma=0.001, C=100)
    print "training SVM"
    c.fit(X_train_tfidf, classes)  # Training the SVM
    print "Training completed..."
    joblib.dump(c, 'filename.pkl', compress=9)  # Saving the support vectors
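# A minimal sketch (assumed, not from the original source) of the matching
# classification side: rebuild the CountVectorizer with the saved vocabulary so
# word indices line up with training, then load the dumped SVM. The ngram
# settings must match the training vectorizer; (1, 2) is assumed here, and the
# tf-idf transformer was not persisted above, so it is refit on the new counts
# as an approximation.
import pickle
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def classify_feedback(texts):
    vocabulary = pickle.load(open("feature.pkl", "rb"))
    vect = CountVectorizer(ngram_range=(1, 2), vocabulary=vocabulary)
    counts = vect.transform(texts)
    tfidf = TfidfTransformer().fit_transform(counts)
    c = joblib.load('filename.pkl')
    return c.predict(tfidf)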
def extract_features(): des_type = 'HOG' # If feature directories don't exist, create them if not os.path.isdir(pos_feat_ph): os.makedirs(pos_feat_ph) # If feature directories don't exist, create them if not os.path.isdir(neg_feat_ph): os.makedirs(neg_feat_ph) print "Calculating the descriptors for the positive samples and saving them" for im_path in glob.glob(os.path.join(pos_im_path, "*")): #print im_path im = imread(im_path, as_grey=True) if des_type == "HOG": fd = hog(im, orientations, pixels_per_cell, cells_per_block, visualize, normalize) fd_name = os.path.split(im_path)[1].split(".")[0] + ".feat" fd_path = os.path.join(pos_feat_ph, fd_name) joblib.dump(fd, fd_path) print "Positive features saved in {}".format(pos_feat_ph) print "Calculating the descriptors for the negative samples and saving them" for im_path in glob.glob(os.path.join(neg_im_path, "*")): im = imread(im_path, as_grey=True) if des_type == "HOG": fd = hog(im, orientations, pixels_per_cell, cells_per_block, visualize, normalize) fd_name = os.path.split(im_path)[1].split(".")[0] + ".feat" fd_path = os.path.join(neg_feat_ph, fd_name) joblib.dump(fd, fd_path) print "Negative features saved in {}".format(neg_feat_ph) print "Completed calculating features from training images"
def train(self, seg_corpus, dep_corpus, path=None):
    assert seg_corpus.keys() == dep_corpus.keys()
    features, labels = self.extract_features_from_corpus(
        dep_corpus, seg_corpus=seg_corpus)
    self._train(features, labels)
    if path is not None:
        joblib.dump(self.pipeline, path, compress=1, cache_size=1e9)
def check_covertype(datasets_folder):
    print("Checking availability of the covertype dataset")
    archive_path = os.path.join(datasets_folder, 'covtype.data.gz')
    covtype_dir = os.path.join(datasets_folder, "covertype")
    samples_path = os.path.join(covtype_dir, "samples.pkl")
    targets_path = os.path.join(covtype_dir, "targets.pkl")
    if not os.path.exists(covtype_dir):
        os.makedirs(covtype_dir)
    if not os.path.exists(archive_path):
        print("Downloading dataset from %s (10.7MB)" % COVERTYPE_URL)
        open(archive_path, 'wb').write(urlopen(COVERTYPE_URL).read())
    else:
        print("Found archive: " + archive_path)
    if not os.path.exists(samples_path) or not os.path.exists(targets_path):
        print("Parsing the data and splitting input and labels...")
        f = open(archive_path, 'rb')
        Xy = np.genfromtxt(gzip.GzipFile(fileobj=f), delimiter=',')
        X = Xy[:, :-1]
        y = Xy[:, -1].astype(np.int32)
        joblib.dump(X, samples_path)
        joblib.dump(y, targets_path)
    print("=> Success!")
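# A short assumed follow-up (not part of the original source): arrays dumped by
# check_covertype can be memory-mapped on load, so repeated experiments do not
# re-parse the gzip archive or copy the full array into memory.
def load_covertype(datasets_folder):
    covtype_dir = os.path.join(datasets_folder, "covertype")
    X = joblib.load(os.path.join(covtype_dir, "samples.pkl"), mmap_mode='r')
    y = joblib.load(os.path.join(covtype_dir, "targets.pkl"), mmap_mode='r')
    return X, y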
def fetch_vgg_architecture(caffemodel_parsed=None, caffemodel_protobuffer=None): """Fetch a pickled version of the caffe model, represented as list of dictionaries.""" default_filename = os.path.join(VGG_PATH, 'vgg.pickle') if caffemodel_parsed is not None: if os.path.exists(caffemodel_parsed): return joblib.load(caffemodel_parsed) else: if os.path.exists(default_filename): import warnings warnings.warn('Did not find %s, but found %s. Loading it.' % (caffemodel_parsed, default_filename)) return joblib.load(default_filename) else: if os.path.exists(default_filename): return joblib.load(default_filename) # We didn't find the file: let's create it by parsing the protobuffer protobuf_file = fetch_vgg_protobuffer_file(caffemodel_protobuffer) model = _parse_caffe_model(protobuf_file) if caffemodel_parsed is not None: joblib.dump(model, caffemodel_parsed) else: joblib.dump(model, default_filename) return model
def gbm_fit(params, cv_folds): gbm = GradientBoostingRegressor(**params) gbm.fit(x_train, y_train) # Check accuracy of model # No need for validation data because of cross validation # Training data is split up into cv_folds folds: # Model trained on (cv_folds - 1) of the folds; last fold is saved as validation set cv_scores_mse = cross_validation.cross_val_score(gbm, x_train, y_train, cv=cv_folds, scoring='mean_squared_error') print '\nModel Report' print ('MSE Score: Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g' % (np.mean(cv_scores_mse), np.std(cv_scores_mse), np.min(cv_scores_mse), np.max(cv_scores_mse))) feat_imp = pd.Series(gbm.feature_importances_, features).sort_values(ascending=False) feat_imp.plot(kind='bar', title='Feature Importances') plt.ylabel('Feature Importance Score') plt.show() # Check actual performance on test data final_predictions = gbm.predict(x_test) test['health_score_in_week'] = final_predictions test.to_csv(output_file, columns=['user_id', 'date', 'steps', 'total_sleep', 'resting_hr', 'step_week_slope', 'sleep_week_slope', 'hr_week_slope', 'curr_health_score', 'health_score_in_week']) # Save the model to file 'health_prediction.pkl' joblib.dump(gbm, 'health_prediction.pkl', compress=1)
def train(trainingData, pklFile): # ========================================================================= # # =============== STEP 1. DEFINE OUTPUT LEARNT MODEL FILE ================= # # ========================================================================= # if (pklFile == ''): os.system('rm -rf learntModel & mkdir learntModel') pklFile = 'learntModel/learntModel.pkl' # ========================================================================= # # ================= STEP 2. PREPARE AND FORMATTING DATA =================== # # ========================================================================= # NUMBER_OF_FEATURES = len(trainingData[0]) - 1 NUMBER_OF_TRAINING_POINTS = len(trainingData) x = trainingData[:, range(0, NUMBER_OF_FEATURES)] y = trainingData[:, NUMBER_OF_FEATURES] # ========================================================================= # # ============== STEP 3. DECLARE PRIMITIVES BEFORE THE PARTY ============== # # ========================================================================= # minSquareError = np.inf targetAlpha = None alphas = np.logspace(-10, -2, 500) # ========================================================================= # # ===== STEP 4. PERFORM FITTING WITH THE BEST ALPHA AND SAVE THE MODEL ==== # # ========================================================================= # clf = LogisticRegressionCV(Cs=alphas) clf.fit(x, y) joblib.dump(clf, pklFile) return {"intercept": clf.intercept_, "coef":clf.coef_, "alpha":clf.C_, "accuracy":clf.score(x,y)}
def trainFixed(): ''' train a machine learner based on data from some fixed parameter point. save to fixed.pkl ''' print "Entering train fixed" trainAndTarget = np.loadtxt('traindata.dat') traindata = trainAndTarget[:,0:2] targetdata = trainAndTarget[:,2] massPoints = np.unique(traindata[:,1]) chunk = len(traindata)/len(massPoints)/2 shift = len(traindata)/2 #plot for fixed mu=0 training print "training fixed" clf = svm.NuSVR() reducedtrain = np.concatenate((traindata[4*chunk : 5*chunk,0], traindata[4*chunk+shift : 5*chunk+shift , 0])) reducedtarget = np.concatenate((targetdata[4*chunk : 5*chunk], targetdata[4*chunk+shift : 5*chunk+shift])) clf.fit(reducedtrain.reshape((len(reducedtrain),1)), reducedtarget) joblib.dump(clf, 'fixed.pkl')
def single_run(X, y, estimator, train, test, estimator_idx, split_idx, output_dir=None): X_train = X[train] y_train = y[train] X_test = X[test] y_test = y[test] if output_dir is not None: debug_folder = join(output_dir, "split_{}_est_{}".format(split_idx, estimator_idx)) if not os.path.exists(debug_folder): os.makedirs(debug_folder) estimator.set_params(debug_folder=debug_folder) estimator.fit(X_train, y_train, probe_list=[(X_test, y_test)]) # estimator.fit(X_train, y_train) else: estimator.fit(X_train, y_train) y_hat = estimator.predict(X_test) score = np.sqrt(mean_squared_error(y_hat, y_test)) print('RMSE %s: %.3f' % (estimator, score)) if output_dir is not None: with open(join(debug_folder, 'score'), 'w+') as f: f.write('score : %.4f' % score) dump(estimator, join(debug_folder, 'estimator'), compress=9) return score
def primaryMetLookup(): fhin = open('CosmicMutantExport_v60_190712.csv', 'rU') fhin.readline() data1 = fhin.readlines() fhin.close() sampleOrigin = {} for line in data1[1:]: flds= line.split(',') if flds[4] not in sampleOrigin: # in case the psite does not exist sampleOrigin[flds[4]] = Set([flds[23]]) else: originSet = sampleOrigin[flds[4]] originSet.add(flds[23]) sampleOrigin[flds[4]] = originSet keys = sampleOrigin.keys() samples = 0 amb = 0 for k in keys: origin = sampleOrigin[k] if len(origin) > 1: amb+=1 else: samples+=1 print 'ambiguous samples = ', amb print 'fine samples = ', samples print 'total number of ', str(samples+amb) joblib.dump(sampleOrigin, 'samples_origin.pkl')
def train_pipeline(kind, cut, vectorizer, model_trainer, do_cut=False, do_vectorizer=False, record_num=None): print('reading...') alltext, accu_label, law_label, time_label = data.read_trainData("./data/data_train.json", record_num) if do_cut: print('cutting...') train_text = cut.cut(alltext) joblib.dump(train_text, './data/{}_cut_train.txt'.format(cut.name)) print('cleaning...') cleaner = Cleaner() cleaned_train_text = cleaner.clean(train_text) joblib.dump(cleaned_train_text, './data/{}_cut_train_cleaned.txt'.format(cut.name)) else: print('load existing cut file {}...'.format('./data/{}_cut_train_cleaned.txt'.format(cut.name))) cleaned_train_text = joblib.load('./data/{}_cut_train_cleaned.txt'.format(cut.name)) vectorizer_name = '{}_{}'.format(cut.name, vectorizer.name) if do_vectorizer: print('{} training...'.format(vectorizer_name)) vectorizer = vectorizer.train(cleaned_train_text) joblib.dump(vectorizer, './model/{}/predictor/model/{}_vectorizer.model'.format(model_trainer.name, vectorizer_name)) print('{} vectorizing...'.format(vectorizer)) vec = vectorizer.transform(cleaned_train_text) joblib.dump(vec, './data/vec_{}.txt'.format(vectorizer_name)) else: print('load existing vec file {}...'.format('./data/vec_{}.txt'.format(vectorizer_name))) vec = joblib.load('./data/vec_{}.txt'.format(vectorizer_name)) print('{} training...'.format(kind)) model = model_trainer.train(vec, accu_label) joblib.dump(model, './model/{}/predictor/model/{}_{}.model'.format(model_trainer.name, vectorizer_name, kind))
def main(): """ Generates features and fits classifier. """ featureIndexes = processData(os.path.join(dataFolder, "avito_train.tsv"), itemsLimit=300000) trainFeatures,trainTargets, trainItemIds=processData(os.path.join(dataFolder,"avito_train.tsv"), featureIndexes, itemsLimit=300000) testFeatures, testItemIds=processData(os.path.join(dataFolder,"avito_test.tsv"), featureIndexes) joblib.dump((trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds), os.path.join(dataFolder,"train_data.pkl")) trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(os.path.join(dataFolder,"train_data.pkl")) logging.info("Feature preparation done, fitting model...") clf = SGDClassifier( loss="log", penalty="l2", alpha=1e-4, class_weight="auto") clf.fit(trainFeatures,trainTargets) logging.info("Predicting...") predicted_scores = clf.predict_proba(testFeatures).T[1] logging.info("Write results...") output_file = "avito_starter_solution.csv" logging.info("Writing submission to %s" % output_file) f = open(os.path.join(dataFolder,output_file), "w") f.write("id\n") for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse = True): f.write("%d\n" % (item_id)) f.close() logging.info("Done.")
def compute(filename):
    fileArray = filename.split("/")
    operator = fileArray[-1].split(".")[0]
    print 'SVM received operator = ' + operator
    ip = open(filename)
    i = 0
    A = []
    B = []
    for line in ip:
        elm = line.rstrip("\n").split(" ")
        # all columns except the last are exponentiated feature values; the last column is the target
        temp = [np.exp(float(elm[col])) for col in range(len(elm) - 1)]
        # A.append([temp[0], temp[1]])
        A.append(temp)
        B.append(np.float(elm[len(elm) - 1]))
    clf = svm.SVR()
    clf.fit(A, B)
    # f.close()
    modelURI = "Models/" + operator + "/"
    if not os.path.exists(modelURI):
        os.makedirs(modelURI)
    modelURI += 'm.pkl'
    joblib.dump(clf, modelURI)
    print 'SUCCESS, ' + modelURI + ' written to disk.'
def normalize_one(name):
    out_name = path(name).splitext()[0] + '.dat'
    a = sio.loadmat(name)
    desc = a['desc']
    frames = a['frames']
    normalize_sift(desc, inplace=True)
    dump(dict(frames=frames, desc=desc), out_name, compress=3)
def trainClassifier(clf, dir,model_file='adaptive', data_file='train', seed=1234, ): ''' Train classifier ''' print 'Training classifier' data = np.loadtxt('{0}/train_{1}.dat'.format(dir,data_file)) traindata = data[:,:-1] targetdata = data[:,-1] pdb.set_trace() if model_g == 'mlp': train_mlp((traindata, targetdata), save_file='{0}/{1}_F0_F1.pkl'.format(dir,model_file)) else: rng = np.random.RandomState(seed) indices = rng.permutation(traindata.shape[0]) traindata = traindata[indices] targetdata = targetdata[indices] scores = cross_validation.cross_val_score(clf, traindata, targetdata) print "Accuracy: {0} (+/- {1})".format(scores.mean(), scores.std() * 2) clf.fit(traindata,targetdata) #clf.plot_importance_matrix(vars_names) joblib.dump(clf, '{0}/{1}_F0_F1.pkl'.format(dir,model_file))
def setTestInputforNN(self, collection={}, sel_words=[]): list_of_strings = [] list_of_salary = [] count = 0 sel_words_set = set(sel_words) sel_words_list = list(sel_words_set) for document in collection: count += 1 title = document.getTitle() description = document.getDescription() salary = (int)(document.getSalaryNorm()) words = re.split(" ", title) + re.split(" ", description) # words = [x for x in words if x in sel_words] wordsUnique = set(words) wordsUnique = wordsUnique & sel_words_set words = [x for x in words if x in wordsUnique] documentString = " ".join(words) list_of_strings.append(documentString) list_of_salary.append(salary) if not (count % 15000): break vectorizer = CountVectorizer(vocabulary=sel_words, min_df=1) self.inp = vectorizer.fit_transform(list_of_strings) from sklearn.externals import joblib joblib.dump(self.inp.tocsr(), "test_dataset_in.joblib") self.inp_size = len(list_of_strings) output = np.array(list_of_salary) self.target = output.reshape(len(list_of_strings), 1) joblib.dump(self.target, "test_dataset_out.joblib") return [self.inp, self.target]
def save_classifier(cl, fn, use_joblib=True, **kwargs):
    """Save a classifier to disk.

    Parameters
    ----------
    cl : classifier object
        Pickleable object or a classify.VigraRandomForest object.
    fn : string
        Writeable path/filename.
    use_joblib : bool, optional
        Whether to prefer joblib persistence to pickle.
    kwargs : keyword arguments
        Keyword arguments to be passed on to either `pck.dump` or
        `joblib.dump`.

    Returns
    -------
    None

    Notes
    -----
    For joblib persistence, `compress=3` is the default.
    """
    if isinstance(cl, VigraRandomForest):
        cl.save_to_disk(fn)
    elif use_joblib and sklearn_available:
        if not kwargs.has_key('compress'):
            kwargs['compress'] = 3
        joblib.dump(cl, fn, **kwargs)
    else:
        with open(fn, 'w') as f:
            pck.dump(cl, f, protocol=kwargs.get('protocol', -1))
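# A hypothetical counterpart (not from the original source) showing how an
# artifact written by save_classifier might be read back. It assumes the same
# `joblib`, `pck`, and `sklearn_available` names are in scope; VigraRandomForest
# objects would need their own loader and are not handled here.
def load_classifier(fn, use_joblib=True):
    if use_joblib and sklearn_available:
        return joblib.load(fn)
    with open(fn) as f:
        return pck.load(f)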
def rf_fit():
    train_inp, valid_inp, train_target, valid_target = prepare_input()

    rf = RandomForestClassifier(random_state=31, n_jobs=-1, verbose=1,
                                n_estimators=100, min_samples_split=5)
    start = time.time()
    rf.fit(train_inp, train_target)
    end = time.time()
    print "fitting took {:0.4} seconds".format(end - start)

    training_output = rf.predict_proba(train_inp)
    validation_output = rf.predict_proba(valid_inp)
    training_error = log_loss(train_target, training_output)
    validation_error = log_loss(valid_target, validation_output)
    print "Train error: {:02.4f}".format(training_error)
    print "Validation error: {:02.4f}".format(validation_error)

    joblib.dump(rf, rf_filename)
    return rf
def trainModel():
    # data preprocessing
    data_train = joblib.load('data/data_train.pkl')
    label_train = joblib.load('data/label_train.pkl')
    print data_train.shape
    clf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.1,
                  degree=0.1, gamma=1.0, kernel='rbf', max_iter=-1,
                  probability=False, random_state=None, shrinking=True,
                  tol=0.001, verbose=True)
    #clf.set_params(kernel='rbf')
    print clf
    print data_train.shape
    print label_train.shape
    print 'begin training....'
    clf.fit(data_train, label_train)
    print 'finish training....'
    print clf
    joblib.dump(clf, 'model/svm.pkl')
    return None
def perform_cluster_analysis(dataset): filename = 'elbow_plot.dat' if os.path.exists(cpath + filename): data = joblib.load(cpath + filename) K = data[0] meandistortions = data[1] else: X = dataset print 'X Shape: ', X.shape #K = range(1, 50, 5) K = [1, 2, 5, 10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000] #K = [1, 2, 5, 10, 50, 100] meandistortions = [] cluster_centers = [] for k in K: print k kmeans = KMeans(n_clusters=k, n_jobs=3) kmeans.fit(X) #import ipdb; ipdb.set_trace() # debugging code #meandistortions.append(sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1))/X.shape[0]) meandistortions.append(kmeans.inertia_) cluster_centers.append(kmeans.cluster_centers_) #print 'k: ', k, ' Cluster Centers: ', kmeans.cluster_centers_ data = [K, meandistortions] joblib.dump(data, cpath + filename, compress=8) plot_name = "elbow_plot.png" title = 'Selecting k with the Elbow Method' xlabel = 'Number of Clusters (k)' ylabel = 'Average Distortion' xyplot(K, meandistortions, 0, 0, 0, 0, title, xlabel, ylabel, staticpath + plot_name, line=1, y_log=0)
def xgb_fit(): train_inp,valid_inp,train_target,valid_target = prepare_input() dtrain = xgb.DMatrix(train_inp,label=train_target) dvalid = xgb.DMatrix(valid_inp) param = {'max_depth':10, 'eta':0.02, 'silent':1, 'objective':'binary:logistic' } param['nthread'] = 4 param['eval_metric'] = 'auc' param['subsample'] = 0.7 param['colsample_bytree']= 0.7 param['min_child_weight'] = 0 param['booster'] = "gblinear" watchlist = [(dtrain,'train')] num_round = 300 early_stopping_rounds=10 bst = xgb.train(param, dtrain, num_round, watchlist,early_stopping_rounds=early_stopping_rounds) joblib.dump(bst,bst_filename) train_pred = bst.predict(xgb.DMatrix(train_inp)) valid_pred = bst.predict(xgb.DMatrix(valid_inp))
def train_classifier():
    pos_feat_path = positive_features_path
    neg_feat_path = negative_features_path
    model_path = classifier_model_path

    feature_vectors = []
    labels = []
    for feat_path in glob.glob(os.path.join(pos_feat_path, "*.feat")):
        fd = joblib.load(feat_path)
        print len(fd)
        if len(fd):
            fd = fd.astype(numpy.object)
            feature_vectors.append(fd)
            labels.append(1)
    for feat_path in glob.glob(os.path.join(neg_feat_path, "*.feat")):
        fd = joblib.load(feat_path)
        print len(fd)
        if len(fd):
            fd = fd.astype(numpy.object)
            feature_vectors.append(fd)
            labels.append(0)

    classifier = LinearSVC()
    print "Training classifier"
    classifier.fit(feature_vectors, labels)
    print "Classifier successfully trained"
    if not os.path.isdir(os.path.split(model_path)[0]):
        os.makedirs(os.path.split(model_path)[0])
    joblib.dump(classifier, model_path)
X_eval = [x[0] for x in samples_eval] Y_train = [x[1] for x in samples_train] Y_eval = [x[1] for x in samples_eval] clf = svm.SVC(C=1.0, cache_size=200, class_weight="balanced", coef0=0.0, decision_function_shape='ovo', degree=3, gamma='auto', kernel='rbf', max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001, verbose=False) clf.fit(X_train, Y_train) predicted = clf.predict_proba(X_eval) score = clf.score(X_eval, Y_eval) decided = clf.decision_function(X_eval) expected = Y_eval joblib.dump(clf, 'model.pkl', protocol=0) print("Samples: " + str(len(X_train))) print(score)
data = {"train": {"X": X_train, "y": y_train},
        "test": {"X": X_test, "y": y_test}}

# list of numbers from 0.0 to 1.0 with a 0.05 interval
alphas = np.arange(0.0, 1.0, 0.05)

for alpha in alphas:
    # Use Ridge algorithm to create a regression model
    reg = Ridge(alpha=alpha)
    reg.fit(data["train"]["X"], data["train"]["y"])

    preds = reg.predict(data["test"]["X"])
    mse = mean_squared_error(preds, data["test"]["y"])
    run.log('alpha', alpha)
    run.log('mse', mse)

    model_file_name = 'ridge_{0:.2f}.pkl'.format(alpha)
    # save model in the outputs folder so it automatically get uploaded
    with open(model_file_name, "wb") as file:
        joblib.dump(value=reg, filename=os.path.join('./outputs/', model_file_name))

    print('alpha is {0:.2f}, and mse is {1:0.2f}'.format(alpha, mse))
test_size=0.35, random_state=0) print "Starting the training process..." #Start the training process clf.fit(X_train, Y_train) #If SHOW_CONFUSION_MATRIX is true, prints the confusion matrix if SHOW_CONFUSION_MATRIX: print "Confusion Matrix:" Y_predicted = clf.predict(X_test) print confusion_matrix(Y_test, Y_predicted) print "\nBest estimator parameters: " print clf.best_estimator_ #Calculates the score of the best estimator found. score = clf.score(X_test, Y_test) print "\nSCORE: {score}\n".format(score=score) print "Saving the model...", #Saves the model to the "model.pkl" file joblib.dump(clf, 'model.pkl') #Saves the classes to the "classes.pkl" file joblib.dump(classes, 'classes.pkl') print "DONE"
def get_saved_columns(file='enc_columns.txt'): with open(file, 'r') as f: return eval(f.readline()) from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler X = data_enc.iloc[:].values sc = MinMaxScaler() X = sc.fit_transform(X.astype(np.float64)) #Save scaler from sklearn.externals import joblib joblib.dump(sc, "scaler.save") X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0) import keras from keras.models import Sequential from keras.layers import Dense, Dropout from keras import optimizers classifier = Sequential() classifier.add( Dense(units=135, kernel_initializer='glorot_normal',
from sklearn import tree

# Bumpy: 0
# Smooth: 1
features = [[140, 1], [130, 1], [150, 0], [170, 0]]
# Apple: 0
# Orange: 1
labels = [0, 0, 1, 1]

classifier = tree.DecisionTreeClassifier()
classifier = classifier.fit(features, labels)

from sklearn.externals import joblib
joblib.dump(classifier, 'myModel.pkl')
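# A minimal follow-up sketch (assumed, not from the original source): reload the
# dumped tree and classify a new fruit described as [weight, texture].
loaded = joblib.load('myModel.pkl')
print(loaded.predict([[160, 0]]))  # a heavy, bumpy fruit should come out as [1] (orange)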
def train_and_generate_model(): #global scaler K.clear_session() data_len = len(exchange_rates) train_len = int(len(exchange_rates) / TRAINDATA_DIV) print("data size: " + str(data_len)) print("train len: " + str(train_len)) tr_input_mat = [] tr_angle_mat = [] for i in range(1000, train_len, OUTPUT_LEN): tr_input_mat.append([ exchange_rates[i], (exchange_rates[i] - exchange_rates[i - 1]) / exchange_rates[i - 1], get_rsi(exchange_rates, i), get_ma(exchange_rates, i), get_ma_kairi(exchange_rates, i), get_bb_1(exchange_rates, i), get_bb_2(exchange_rates, i), get_ema(exchange_rates, i), get_ema_rsi(exchange_rates, i), get_cci(exchange_rates, i), get_mo(exchange_rates, i), get_lw(exchange_rates, i), get_ss(exchange_rates, i), get_dmi(exchange_rates, i), get_vorarity(exchange_rates, i), get_macd(exchange_rates, i), judge_chart_type(exchange_rates[i - CHART_TYPE_JDG_LEN:i]) ]) tr_input_mat.append([ reverse_exchange_rates[i], (reverse_exchange_rates[i] - reverse_exchange_rates[i - 1]) / reverse_exchange_rates[i - 1], get_rsi(reverse_exchange_rates, i), get_ma(reverse_exchange_rates, i), get_ma_kairi(reverse_exchange_rates, i), get_bb_1(reverse_exchange_rates, i), get_bb_2(reverse_exchange_rates, i), get_ema(reverse_exchange_rates, i), get_ema_rsi(reverse_exchange_rates, i), get_cci(reverse_exchange_rates, i), get_mo(reverse_exchange_rates, i), get_lw(reverse_exchange_rates, i), get_ss(reverse_exchange_rates, i), get_dmi(reverse_exchange_rates, i), get_vorarity(reverse_exchange_rates, i), get_macd(reverse_exchange_rates, i), judge_chart_type(reverse_exchange_rates[i - CHART_TYPE_JDG_LEN:i]) ]) tmp = (exchange_rates[i + OUTPUT_LEN] - exchange_rates[i]) / float(OUTPUT_LEN) if tmp >= 0: tr_angle_mat.append(1) else: tr_angle_mat.append(0) tmp = (reverse_exchange_rates[i + OUTPUT_LEN] - reverse_exchange_rates[i]) / float(OUTPUT_LEN) if tmp >= 0: tr_angle_mat.append(1) else: tr_angle_mat.append(0) X = np.array(tr_input_mat, dtype=np.float32) Y = np.array(tr_angle_mat, dtype=np.float32) X, scaler = preprocess_data(X) Y, encoder = preprocess_labels(Y) joblib.dump(scaler, "./sklearn.scaler.dump") np.random.seed(1337) # for reproducibility nb_classes = Y.shape[1] print(nb_classes, 'classes') dims = X.shape[1] print(dims, 'dims') neuro_num = 50 # setup deep NN model = Sequential() model.add(Dense(neuro_num, input_shape=(dims, ), activation="relu")) #model.add(Dense(neuro_num, activation="relu")) #model.add(BatchNormalization((neuro_num,))) model.add(BatchNormalization()) model.add(Dropout(0.5)) model.add(Dense(int(neuro_num / 2), activation="relu")) #model.add(BatchNormalization((neuro_num/2,))) model.add(BatchNormalization()) model.add(Dropout(0.5)) model.add(Dense(nb_classes, activation="sigmoid")) model.summary() model.compile(loss='binary_crossentropy', optimizer="adam") # # TPU tpu_grpc_url = "grpc://" + os.environ["COLAB_TPU_ADDR"] tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( tpu_grpc_url) strategy = keras_support.TPUDistributionStrategy(tpu_cluster_resolver) tpu_model = tf.contrib.tpu.keras_to_tpu_model(model, strategy=strategy) print("Training model...") start = time.time() tpu_model.fit(X, Y, batch_size=1024, epochs=1000, verbose=2, validation_split=0.15) process_time = time.time() - start print("excecution time of training: " + str(process_time)) dump_fd = open("./keras.model.json", "w") model_json_str = model.to_json() dump_fd.write(model_json_str) model.save_weights("./keras.weight") dump_fd.close()
def save(self, save_name):
    joblib.dump(self, save_name, compress=6)
#print(data.columns) new_data = data[["pclass", "sex"]] print(new_data.head()) new_output = data[["survived"]] print(new_output.head()) new_data['pclass'].replace('3rd', 3, inplace=True) new_data['pclass'].replace('2nd', 2, inplace=True) new_data['pclass'].replace('1st', 1, inplace=True) #new_data['sex'].replace('female',0,inplace=True) #new_data['sex'].replace('male',1,inplace=True) new_data['sex'] = np.where(new_data['sex'] == 'female', 0, 1) X_tr, X_te, y_tr, y_te = train_test_split(new_data, new_output, test_size=0.33, random_state=42) print(new_data.head()) print('After train_test_split') print(X_tr.shape) print(X_te.shape) print(y_tr.shape) print(y_te.shape) rf = RandomForestClassifier(n_estimators=100) rf.fit(X_tr, y_tr) acc = rf.score(X_te, y_te) print(acc * 100) joblib.dump(rf, 'rf1', compress=9)
def batchtrain(self, njob = 1, phase = None, shared_memory = 'no', verbose='on'): t0 = time() nnodes = getattr(self, 'nnodes') dlen = getattr(self, 'dlen') dim = getattr(self, 'dim') mapsize = getattr(self, 'mapsize') ############################################# # seting the parameters initmethod = getattr(self,'initmethod') mn = np.min(mapsize) if mn == 1: mpd = float(nnodes*10)/float(dlen) else: mpd = float(nnodes)/float(dlen) ms = max(mapsize[0],mapsize[1]) if mn == 1: ms = ms/2. #Based on somtoolbox, Matlab #case 'train', sTrain.trainlen = ceil(50*mpd); #case 'rough', sTrain.trainlen = ceil(10*mpd); #case 'finetune', sTrain.trainlen = ceil(40*mpd); if phase == 'rough': #training length trainlen = int(np.ceil(30*mpd)) #radius for updating if initmethod == 'random': radiusin = max(1, np.ceil(ms/3.)) radiusfin = max(1, radiusin/6.) # radiusin = max(1, np.ceil(ms/1.)) # radiusfin = max(1, radiusin/2.) elif initmethod == 'pca': radiusin = max(1, np.ceil(ms/8.)) radiusfin = max(1, radiusin/4.) elif phase == 'finetune': #train lening length #radius for updating if initmethod == 'random': trainlen = int(np.ceil(50*mpd)) radiusin = max(1, ms/12.) #from radius fin in rough training radiusfin = max(1, radiusin/25.) # radiusin = max(1, ms/2.) #from radius fin in rough training # radiusfin = max(1, radiusin/2.) elif initmethod == 'pca': trainlen = int(np.ceil(40*mpd)) radiusin = max(1, np.ceil(ms/8.)/4) radiusfin = 1#max(1, ms/128) radius = np.linspace(radiusin, radiusfin, trainlen) ################################################## UD2 = getattr(self, 'UD2') New_Codebook_V = np.empty((nnodes, dim)) New_Codebook_V = getattr(self, 'codebook') #print 'data is in shared memory?', shared_memory if shared_memory == 'yes': data = getattr(self, 'data') Data_folder = tempfile.mkdtemp() data_name = os.path.join(Data_folder, 'data') dump(data, data_name) data = load(data_name, mmap_mode='r') else: data = getattr(self, 'data') #X2 is part of euclidean distance (x-y)^2 = x^2 +y^2 - 2xy that we use for each data row in bmu finding. #Since it is a fixed value we can skip it during bmu finding for each data point, but later we need it calculate quantification error X2 = np.einsum('ij,ij->i', data, data) if verbose=='on': print '%s training...' %phase print 'radius_ini: %f , radius_final: %f, trainlen: %d' %(radiusin, radiusfin, trainlen) neigh_func = getattr(self,'neigh') for i in range(trainlen): if neigh_func == 'Guassian': #in case of Guassian neighborhood H = np.exp(-1.0*UD2/(2.0*radius[i]**2)).reshape(nnodes, nnodes) if neigh_func == 'Bubble': # in case of Bubble function # print radius[i], UD2.shape # print UD2 H = l(radius[i],np.sqrt(UD2.flatten())).reshape(nnodes, nnodes) + .000000000001 # print H t1 = time() bmu = None bmu = self.para_bmu_find(data, New_Codebook_V, njb = njob) if verbose=='on': print #updating the codebook t2 = time() New_Codebook_V = self.update_codebook_voronoi(data, bmu, H, radius) #print 'updating nodes: ', round (time()- t2, 3) if verbose=='on': print "epoch: %d ---> elapsed time: %f, quantization error: %f " %(i+1, round(time() - t1, 3),np.mean(np.sqrt(bmu[1] + X2))) setattr(self, 'codebook', New_Codebook_V) bmu[1] = np.sqrt(bmu[1] + X2) setattr(self, 'bmu', bmu)
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.externals import joblib

# Read in data
data = pd.read_csv('clean_data.csv')
texts = data['text'].astype(str)
y = data['is_offensive']

# Vectorize the text
vectorizer = CountVectorizer(stop_words='english', min_df=0.0001)
X = vectorizer.fit_transform(texts)

# Train the model
model = LinearSVC(class_weight="balanced", dual=False, tol=1e-2, max_iter=1e5)
cclf = CalibratedClassifierCV(base_estimator=model)
cclf.fit(X, y)

# Save the model
joblib.dump(vectorizer, 'vectorizer.joblib')
joblib.dump(cclf, 'model.joblib')
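# A minimal assumed prediction-side sketch (not in the original source): load
# the two dumped artifacts and score new strings with the calibrated classifier.
vectorizer = joblib.load('vectorizer.joblib')
cclf = joblib.load('model.joblib')

def predict_offensive(texts):
    X_new = vectorizer.transform(texts)
    return cclf.predict_proba(X_new)[:, 1]  # probability of the positive class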
# plt.ylabel('Loss')
# plt.legend(loc='best')
# plt.show()

#############################
from sklearn import svm
from sklearn import datasets

clf = svm.SVC()
iris = datasets.load_iris()
x, y = iris.data, iris.target
clf.fit(x, y)

# method 1 : pickle
# import pickle
#
# with open('save/clf.pickle','wb') as f:
#     pickle.dump(clf,f)
#
# with open('save/clf.pickle','rb') as f:
#     clf2 = pickle.load(f)
# print (clf2.predict(x[0:1]))

# method 2 : joblib
from sklearn.externals import joblib
# Save
joblib.dump(clf, 'save/clf.pkl')
# restore
clf3 = joblib.load('save/clf.pkl')
print(clf3.predict(x[0:1]))
result_value = round(predition_row[result_index], 2) if not results[real_result].get(result_value): results[real_result][result_value] = 0 results[real_result][result_value] += 1 for result in results.keys(): keys = sorted(results[result].keys(), reverse=True) current_sum = 0 for result_value in keys: value = results[result][result_value] current_sum += value results_file.write("%s\t%s\t%s\t%s%%\n" % (result, result_value, value, round(1.0 * current_sum / total[result], 2))) joblib.dump(clf, model_output_path) # Write used features feature_importance = clf.feature_importances_ std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0) indices = np.argsort(feature_importance)[::-1] base_name = model_output_path.split(".pkl")[0] with open(base_name + ".features", "w") as _file: for f in range(train[features].shape[1]): _file.write("%s\n" % features[indices[f]]) first_model_number = None found_final = False with open(base_name + ".recommended_features", "w") as _file:
end = nh3_co_end
for i in range(len(start)):
    tmp_data = Db_seelct(start[i], end[i])
    data = np.vstack((data, tmp_data))
    for i in range(len(tmp_data)):
        y_train.append(class_bin[6])

x_train = reader.get_Ratio(data)
y_train = np.array(y_train)
# x_train = minmax_scale(x_train,model_dir,train=True)
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
joblib.dump(scaler, model_dir + "scaler.pkl")
modelTrain(x_train, y_train, save_name)

#-----------------------------------------------------------------------------------------------------------
y_valid = []
normal_start = [65600]
normal_end = [65700]
h2s_start = [63868]
h2s_end = [63968]
nh3_start = [64610]
nh3_end = [64710]
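# An assumed validation-side sketch (not part of the original): the fitted
# StandardScaler is reloaded so validation features get the same scaling as the
# training features. `valid_data` is a hypothetical array built from the
# validation ranges above.
scaler = joblib.load(model_dir + "scaler.pkl")
x_valid = scaler.transform(reader.get_Ratio(valid_data))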
train_clf(clf, X_train, y_train)
print("F1 score for training set is: {:.4f}".format(
    pred_clf(clf, X_train, y_train)))
print("F1 score for testing set is: {:.4f}\n".format(
    pred_clf(clf, X_test, y_test)))

from sklearn.metrics import accuracy_score

params = {
    'max_depth': 9,
    'subsample': 0.5,
    'learning_rate': 0.01,
    'min_samples_leaf': 1,
    'random_state': 0
}
gbc = GradientBoostingClassifier(n_estimators=290, **params)
clf_ = gbc.fit(X_train, y_train)
y_pred = clf_.predict(X_test)
print('Accuracy is {}'.format(accuracy_score(y_test, y_pred)))

train_predict(gbc, X_train, y_train, X_test, y_test)

# pickling my model
from sklearn.externals import joblib
joblib.dump(gbc, 'model1.pkl')
print("Model dumped!")

model_columns = list(X.columns)
joblib.dump(model_columns, 'model_columns.pkl')
print("Model columns dumped")
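# An assumed serving-side sketch (not part of the original): load the model and
# the saved column list, then reindex incoming data so its columns match the
# training layout before predicting. `predict_records` is a hypothetical helper.
import pandas as pd
from sklearn.externals import joblib

gbc = joblib.load('model1.pkl')
model_columns = joblib.load('model_columns.pkl')

def predict_records(records):
    query = pd.DataFrame(records).reindex(columns=model_columns, fill_value=0)
    return gbc.predict(query)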
#%% print(X_train[0].shape) X = X_train[0] for i in range(1,len(X_train)): X = np.vstack((X,X_train[i])) print(X.shape) #%% from sklearn.cluster import MiniBatchKMeans,KMeans cluster = MiniBatchKMeans(n_clusters=500, batch_size=50) # cluster = KMeans(n_clusters=500,random_state= 54, max_iter= 5000,n_jobs = 2) cluster.fit(X) from sklearn.externals import joblib joblib.dump(cluster, 'kmeans_model3.m') #%% km_model = joblib.load('kmeans_model3.m') #%% from sklearn import preprocessing def get_image_presentation(dense_sampling_images, centroids_model): image_presentation = np.zeros((len(dense_sampling_images), len(centroids_model.cluster_centers_))) histograms = [] count = 0 for image in dense_sampling_images: hist = centroids_model.predict(image) histograms.append(hist) for label in hist: image_presentation[count][label] += 1
def run(self):
    self.output().makedirs()
    df = pd.read_csv(self.input().path, usecols=[self.id_column, self.feature_name])
    joblib.dump(df, self.output().path, compress=1)
def main(): if not (args.use_w1_w2_embeddings or args.use_paraphrase_vectors): raise ValueError( 'At least one of "use_w1_w2_embeddings" or "use_paraphrase_vectors" should be set.' ) # Load the datasets logger.info('Loading the datasets from {}'.format(args.dataset_prefix)) train_set = DatasetReader(args.dataset_prefix + '/train.tsv') val_set = DatasetReader(args.dataset_prefix + '/val.tsv', label2index=train_set.label2index) test_set = DatasetReader(args.dataset_prefix + '/test.tsv', label2index=train_set.label2index) # Generate the feature vectors using the paraphrasing model logger.info('Generating feature vectors...') train_features, val_features, test_features = [], [], [] if args.use_paraphrase_vectors: logger.info('Reading word embeddings from {}...'.format( args.word_embeddings_for_model)) wv, model_words = load_binary_embeddings( args.word_embeddings_for_model) logger.info('Loading paraphrasing model from {}...'.format( args.paraphrase_model_dir)) model = Model.load_model(args.language_model_dir, wv) model_words = ['[w1]', '[w2]', '[par]'] + model_words modelw2index = {w: i for i, w in enumerate(model_words)} UNK = modelw2index['unk'] if args.use_w1_w2_embeddings: logger.info('Reading word embeddings from {}...'.format( args.word_embeddings_for_dist)) wv, words = load_binary_embeddings(args.word_embeddings_for_dist) w2index = {w: i for i, w in enumerate(words)} UNK = w2index['unk'] train_features.append( np.vstack([ np.concatenate( [wv[w2index.get(w1, UNK), :], wv[w2index.get(w2, UNK), :]]) for (w1, w2) in train_set.noun_compounds ])) val_features.append( np.vstack([ np.concatenate( [wv[w2index.get(w1, UNK), :], wv[w2index.get(w2, UNK), :]]) for (w1, w2) in val_set.noun_compounds ])) test_features.append( np.vstack([ np.concatenate( [wv[w2index.get(w1, UNK), :], wv[w2index.get(w2, UNK), :]]) for (w1, w2) in test_set.noun_compounds ])) # Tune the hyper-parameters using the validation set logger.info('Classifying...') reg_values = [0.5, 1, 2, 5, 10] penalties = ['l2'] k_values = [10, 15, 25, 50] if args.use_paraphrase_vectors else [0] classifiers = ['logistic', 'svm'] f1_results = [] descriptions = [] models = [] all_test_instances = [] for k in k_values: curr_train_features, curr_val_features, curr_test_features = train_features, val_features, test_features if args.use_paraphrase_vectors: curr_train_features += [ predict_paraphrases(model, train_set.noun_compounds, model_words, modelw2index, UNK, k) ] curr_val_features += [ predict_paraphrases(model, val_set.noun_compounds, model_words, modelw2index, UNK, k) ] curr_test_features += [ predict_paraphrases(model, test_set.noun_compounds, model_words, modelw2index, UNK, k) ] train_instances = [ np.concatenate(list(f)) for f in zip(*curr_train_features) ] val_instances = [ np.concatenate(list(f)) for f in zip(*curr_val_features) ] test_instances = [ np.concatenate(list(f)) for f in zip(*curr_test_features) ] for cls in classifiers: for reg_c in reg_values: for penalty in penalties: descriptions.append( 'K: {}, Classifier: {}, Penalty: {}, C: {:.2f}'.format( k, cls, penalty, reg_c)) # Create the classifier if cls == 'logistic': classifier = LogisticRegression( penalty=penalty, C=reg_c, multi_class='multinomial', n_jobs=20, solver='sag') else: classifier = LinearSVC(penalty=penalty, dual=False, C=reg_c) logger.info( 'Training with classifier: {}, penalty: {}, c: {:.2f}...' 
.format(cls, penalty, reg_c)) classifier.fit(train_instances, train_set.labels) val_pred = classifier.predict(val_instances) p, r, f1, _ = evaluate(val_set.labels, val_pred, val_set.index2label, do_full_reoprt=False) logger.info( 'K: {}, Classifier: {}, penalty: {}, c: {:.2f}, precision: {:.3f}, recall: {:.3f}, F1: {:.3f}' .format(k, cls, penalty, reg_c, p, r, f1)) f1_results.append(f1) models.append(classifier) all_test_instances.append(test_instances) best_index = np.argmax(f1_results) description = descriptions[best_index] classifier = models[best_index] logger.info('Best hyper-parameters: {}'.format(description)) # Save the best model to a file logger.info('Copying the best model...') joblib.dump(classifier, '{}/best.pkl'.format(args.model_dir)) # Evaluate on the test set logger.info('Evaluation:') test_instances = all_test_instances[best_index] test_pred = classifier.predict(test_instances) precision, recall, f1, support = evaluate(test_set.labels, test_pred, test_set.index2label, do_full_reoprt=True) logger.info('Precision: {:.3f}, Recall: {:.3f}, F1: {:.3f}'.format( precision, recall, f1)) # Write the predictions to a file output_predictions(args.model_dir + '/predictions.tsv', test_set.index2label, test_pred, test_set.noun_compounds, test_set.labels)
for gene in lncRNA_names], dtype=float) #to keep order!!!!!!! y proteins = np.concatenate((hsapiens_info, atha_info), axis=0) y = [] for num in range(0, proteins.shape[0]): y.append(0) for num in range(0, lncRNA_info.shape[0]): y.append(1) y = np.asarray(y) X = np.concatenate((proteins, lncRNA_info), axis=0) print(y.shape) print(X.shape) X_normalized = preprocessing.normalize(X, norm='l2') clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.02, subsample=0.6, max_depth=10, random_state=339) clf.fit(X_normalized, y) # uncomment to save classifier as pickle joblib.dump(clf, 'model5.pkl')
from sklearn.linear_model import LinearRegression

regression = LinearRegression()
regression.fit(x_tr, y_tr)
y_pred = regression.predict(x_te)

plt.scatter(x_tr, y_tr, color='red')
plt.plot(x_tr, regression.predict(x_tr), color='blue')
plt.title('Seasonal Trend vs Price : Training Set')
plt.xlabel('Start of Season Price')
plt.ylabel('End of Season (Discounted) Price')
plt.savefig('templates/retailvsdiscounted.jpg')
plt.show()

plt.scatter(x_te, y_te, color='red')
plt.plot(x_tr, regression.predict(x_tr), color='blue')
plt.title('Seasonal Trend vs Price : Test Set')
plt.xlabel('Start of Season Price')
plt.ylabel('End of Season (Discounted) Price')
plt.savefig('templates/pricevdiscountedtest.jpg')
plt.show()

def getdiscount(price):
    print("The discounted value for given item : ", regression.predict(price))

getdiscount(5000)

from sklearn.externals import joblib
joblib.dump(regression, 'linmodel.pkl')
def predict(config_file=None, data_dirs=None, annotation_file=None, file_format=None, model_meta_file=None, output_dir=None, segment=None, predict_proba=False, convert_to=None, return_predictions=True): """high-level function for prediction of syllable labels. Accepts either a config file or a set of parameters and uses them to predict labels for syllable segments in audio files, based on features extracted from those segments. Parameters ---------- config_file : string filename of YAML file that configures label prediction data_dirs : list of str, directories that contain audio files from which features should be extracted. hvc.extract attempts to create an annotation.csv file based on the audio file types in the directories. annotation_file : str filename of an annotation.csv file file_format : str format of audio files. One of the following: {'cbin','wav'} model_meta_file : str filename of .meta file for classifier to use. output_dir : str Name of parent directory in which to create output. If parent directory does not exist, it is created. Default is current working directory. segment : bool if True, segment song. If annotation file is passed as an argument, then segments from that file are used. If data_dirs is passed as an argument, and segment is False, then the FeatureExtractor will look for annotation files, and will raise an error if none are found. Default when data_dirs is passed as an argument is True, i.e. it is assumed the user has not already segmented the song and wants to do this in an automated way, then apply classifiers to the segments. predict_proba : bool If True, estimate probabilities that labels are correct. Default is False. convert_to: str If True, convert predictions to annotation files. Default is False. return_predictions : bool If True, return feature file with predicted labels. Default is True. Returns ------- predictions : dict feature file returned as a Python dictionary, with the additional (key, value) pair of 'pred_labels', a Numpy array containing the labels predicted by the classifier. Only returned if return_predictions = True. 
""" if config_file and (file_format or model_meta_file or output_dir or segment or convert_to): raise ValueError('Cannot specify config_file and other parameters ' 'when calling hvc.predict, ' 'please specify either config_file or all other ' 'parameters ') if config_file and data_dirs: raise ValueError('Please specify either config_file or data_dirs, ' 'not clear which to use when both are specified') if config_file and annotation_file: raise ValueError('Please specify either config_file or annotation_file, ' 'not clear which to use when both are specified') home_dir = os.getcwd() if config_file: predict_config = hvc.parseconfig.parse_config(config_file, 'predict') print('parsed predict config') for todo in predict_config['todo_list']: # get absolute path before changing directories # in case user specified output as a relative dir output_dir = os.path.abspath(todo['output_dir']) output_dir = os.path.join(output_dir, 'predict_output_' + hvc.utils.timestamp()) if not os.path.isdir(output_dir): os.mkdir(output_dir) extract_params = { 'output_dir': output_dir, 'data_dirs': todo['data_dirs'], 'labels_to_use': 'all', 'file_format': todo['file_format'] } model_meta_file = joblib.load(todo['model_meta_file']) feature_file_for_model = model_meta_file['feature_file'] print('loading feature file') feature_file = joblib.load(feature_file_for_model) feature_extractor = feature_file['feature_extractor'] print('extracting features') feature_extractor.extract(**extract_params, segment=True, make_output_subdir=False) os.chdir(output_dir) ftr_files = glob('features_created*') model_filename = model_meta_file['model_filename'] model_name = model_meta_file['model_name'] if model_name in valid_models['sklearn']: clf = joblib.load(model_filename) scaler = model_meta_file['scaler'] elif model_name in valid_models['keras']: if 'keras.models' not in locals(): import keras.models clf = keras.models.load_model(model_filename) spect_scaler = model_meta_file['spect_scaler'] for ftr_file in ftr_files: print("predicting labels for features in file: {}" .format(ftr_file)) ftr_file_dict = joblib.load(ftr_file) if model_name in valid_models['sklearn']: features = ftr_file_dict['features'] if np.any(np.isnan(features)): # if any rows have nan values for features features_has_nan = True # Initialize predictions vector, to later assign nan values # to predictions for those rows pred_labels_nan = np.full((features.shape[0],), 'nan') # has to be same dtype as predictions # Need to remove rows with nans before normalization + classification features_not_nan_rows = np.where(~np.isnan(features).any(axis=1))[0] features = features[features_not_nan_rows, :] else: features_has_nan = False features_scaled = scaler.transform(features) pred_labels = clf.predict(features_scaled) if features_has_nan: # index pred_labels into pred_labels_nan # so that all nan rows will have 'nan' as prediction pred_labels_nan[features_not_nan_rows] = pred_labels # now make pred_labels point to ndarray with 'nan' predictions included pred_labels = pred_labels_nan elif model_name in valid_models['keras']: neuralnet_inputs_dict = ftr_file_dict['neuralnet_inputs'] inputs_key = model_meta_file['feature_list'][0] neuralnet_inputs = neuralnet_inputs_dict[inputs_key] neuralnet_inputs_scaled = spect_scaler.transform(neuralnet_inputs) neuralnet_inputs_scaled = neuralnet_inputs_scaled[:, :, :, np.newaxis] pred_labels = clf.predict(neuralnet_inputs_scaled) label_binarizer = model_meta_file['label_binarizer'] pred_labels = 
                pred_labels = label_binarizer.inverse_transform(pred_labels)

            ftr_file_dict['pred_labels'] = pred_labels

            if 'predict_proba' in todo:
                if todo['predict_proba']:
                    pred_probs = clf.predict_proba(features_scaled)
                    ftr_file_dict['pred_probs'] = pred_probs

            joblib.dump(ftr_file_dict, ftr_file)

            if 'convert' in todo:
                songfiles = ftr_file_dict['songfiles']
                songfile_IDs = ftr_file_dict['songfile_IDs']
                if todo['convert'] == 'notmat':
                    all_sampfreqs = ftr_file_dict['all_sampfreqs']
                    print('converting to .not.mat files')
                    for curr_song_id, songfile_name in enumerate(songfiles):
                        these = np.asarray(songfile_IDs) == curr_song_id
                        segment_params = ftr_file_dict['segment_params']
                        annotation.make_notmat(filename=songfile_name,
                                               labels=ftr_file_dict['pred_labels'][these],
                                               onsets_Hz=ftr_file_dict['onsets_Hz'][these],
                                               offsets_Hz=ftr_file_dict['offsets_Hz'][these],
                                               samp_freq=all_sampfreqs[curr_song_id],
                                               threshold=segment_params['threshold'],
                                               min_syl_dur=segment_params['min_syl_dur'],
                                               min_silent_dur=segment_params['min_silent_dur'],
                                               clf_file=model_filename,
                                               alternate_path=output_dir)

        os.chdir(home_dir)

    elif data_dirs or annotation_file:
        if data_dirs and annotation_file:
            raise ValueError('hvc.predict received values for both data_dirs and '
                             'annotation_file arguments, unclear which to use. '
                             'Please only specify one or the other.')

        if model_meta_file is None:
            raise ValueError('model_meta_file is required as an argument when hvc.predict '
                             'is called with data_dirs or annotation_file.')

        if convert_to is not None:
            if convert_to not in valid_convert_types:
                raise ValueError('file format to convert predictions to, {}, is not a '
                                 'valid format'.format(convert_to))

        if segment is None:
            # default to True when data_dirs is passed without an annotation_file
            if data_dirs and (annotation_file is None):
                segment = True
            else:
                segment = False

        model_meta_file = joblib.load(model_meta_file)
        model_filename = model_meta_file['model_filename']
        model_name = model_meta_file['model_name']
        if predict_proba:
            if model_name not in valid_models['sklearn']:
                raise ValueError('predict_proba argument set to True, but model in {} is {}, '
                                 'which is not a valid scikit-learn model and does not have '
                                 'a predict probability function built in'
                                 .format(model_filename, model_name))

        if output_dir is None:
            output_dir = os.getcwd()
        output_dir = os.path.abspath(output_dir)
        output_dir = os.path.join(output_dir, 'predict_output_' + hvc.utils.timestamp())
        if not os.path.isdir(output_dir):
            os.mkdir(output_dir)

        feature_file_for_model = model_meta_file['feature_file']
        print('loading feature file')
        feature_file = joblib.load(feature_file_for_model)

        extract_params = {
            'output_dir': output_dir,
            'labels_to_use': 'all',
            'file_format': file_format,
            'segment': segment
        }
        if annotation_file:
            extract_params['annotation_file'] = annotation_file
        elif data_dirs:
            extract_params['data_dirs'] = data_dirs

        feature_extractor = feature_file['feature_extractor']
        print('extracting features')
        feature_extractor.extract(**extract_params, make_output_subdir=False)

        os.chdir(output_dir)
        ftr_files = glob('features_created*')

        if model_name in valid_models['sklearn']:
            clf = joblib.load(model_filename)
            scaler = model_meta_file['scaler']
        elif model_name in valid_models['keras']:
            if 'keras.models' not in locals():
                import keras.models
            clf = keras.models.load_model(model_filename)
            spect_scaler = model_meta_file['spect_scaler']

        for ftr_file in ftr_files:
            print("predicting labels for features in file: {}".format(ftr_file))
            ftr_file_dict = joblib.load(ftr_file)

            if model_name in valid_models['sklearn']:
                features = ftr_file_dict['features']
                if np.any(np.isnan(features)):
                    # if any rows have nan values for features
                    features_has_nan = True
                    # Initialize predictions vector, to later assign nan values
                    # to predictions for those rows
                    pred_labels_nan = np.full((features.shape[0],), 'nan')  # has to be same dtype as predictions
                    # Need to remove rows with nans before normalization + classification
                    features_not_nan_rows = np.where(~np.isnan(features).any(axis=1))[0]
                    features = features[features_not_nan_rows, :]
                else:
                    features_has_nan = False
                features_scaled = scaler.transform(features)
                pred_labels = clf.predict(features_scaled)
                if features_has_nan:
                    # index pred_labels into pred_labels_nan
                    # so that all nan rows will have 'nan' as prediction
                    pred_labels_nan[features_not_nan_rows] = pred_labels
                    # now make pred_labels point to ndarray with 'nan' predictions included
                    pred_labels = pred_labels_nan

            elif model_name in valid_models['keras']:
                neuralnet_inputs_dict = ftr_file_dict['neuralnet_inputs']
                inputs_key = model_meta_file['feature_list'][0]
                neuralnet_inputs = neuralnet_inputs_dict[inputs_key]
                neuralnet_inputs_scaled = spect_scaler.transform(neuralnet_inputs)
                neuralnet_inputs_scaled = neuralnet_inputs_scaled[:, :, :, np.newaxis]
                pred_labels = clf.predict(neuralnet_inputs_scaled)
                label_binarizer = model_meta_file['label_binarizer']
                pred_labels = label_binarizer.inverse_transform(pred_labels)

            ftr_file_dict['pred_labels'] = pred_labels

            if predict_proba:
                pred_probs = clf.predict_proba(features_scaled)
                ftr_file_dict['pred_probs'] = pred_probs

            joblib.dump(ftr_file_dict, ftr_file)

            if convert_to:
                songfiles = ftr_file_dict['songfiles']
                songfile_IDs = ftr_file_dict['songfile_IDs']
                if convert_to == 'notmat':
                    all_sampfreqs = ftr_file_dict['all_sampfreqs']
                    print('converting to .not.mat files')
                    for curr_song_id, songfile_name in enumerate(songfiles):
                        these = np.asarray(songfile_IDs) == curr_song_id
                        segment_params = ftr_file_dict['segment_params']
                        annotation.make_notmat(filename=songfile_name,
                                               labels=ftr_file_dict['pred_labels'][these],
                                               onsets_Hz=ftr_file_dict['onsets_Hz'][these],
                                               offsets_Hz=ftr_file_dict['offsets_Hz'][these],
                                               samp_freq=all_sampfreqs[curr_song_id],
                                               threshold=segment_params['threshold'],
                                               min_syl_dur=segment_params['min_syl_dur'],
                                               min_silent_dur=segment_params['min_silent_dur'],
                                               clf_file=model_filename,
                                               alternate_path=output_dir)

        if return_predictions:
            predict_dict = {}
            for ftr_file in ftr_files:
                ftrs = joblib.load(ftr_file)
                if predict_dict == {}:
                    predict_dict['labels'] = ftrs['labels']
                    predict_dict['pred_labels'] = ftrs['pred_labels']
                    predict_dict['songfile_IDs'] = ftrs['songfile_IDs']
                    predict_dict['onsets_Hz'] = ftrs['onsets_Hz']
                    predict_dict['offsets_Hz'] = ftrs['offsets_Hz']
                    predict_dict['songfiles'] = ftrs['songfiles']
                    predict_dict['feature_list'] = ftrs['feature_list']
                    predict_dict['labels_to_use'] = ftrs['labels_to_use']
                    if 'features' in ftrs:
                        predict_dict['features'] = ftrs['features']
                        predict_dict['features_arr_column_IDs'] = ftrs['features_arr_column_IDs']
                    if 'feature_group_ID_dict' in ftrs:
                        predict_dict['feature_group_ID_dict'] = ftrs['feature_group_ID_dict']
                        predict_dict['feature_list_group_ID'] = ftrs['feature_list_group_ID']
                    if 'pred_probs' in ftrs:
                        predict_dict['pred_probs'] = ftrs['pred_probs']
                    if 'neuralnet_inputs' in ftrs:
                        predict_dict['neuralnet_inputs'] = ftrs['neuralnet_inputs']
                else:
                    # if we already loaded one feature file and predict_dict is not empty,
                    # then concatenate (np.concatenate takes a tuple of arrays)
                    predict_dict['labels'] = np.concatenate((predict_dict['labels'],
                                                             ftrs['labels']))
                    predict_dict['pred_labels'] = np.concatenate((predict_dict['pred_labels'],
                                                                  ftrs['pred_labels']))
                    predict_dict['songfile_IDs'] = np.concatenate((predict_dict['songfile_IDs'],
                                                                   ftrs['songfile_IDs']))
                    predict_dict['onsets_Hz'] = np.concatenate((predict_dict['onsets_Hz'],
                                                                ftrs['onsets_Hz']))
                    predict_dict['offsets_Hz'] = np.concatenate((predict_dict['offsets_Hz'],
                                                                 ftrs['offsets_Hz']))
                    if 'features' in predict_dict:
                        predict_dict['features'] = np.concatenate((predict_dict['features'],
                                                                   ftrs['features']))
                    if 'neuralnet_inputs' in predict_dict:
                        for key in ftrs['neuralnet_inputs']:
                            predict_dict['neuralnet_inputs'][key] = \
                                np.concatenate((predict_dict['neuralnet_inputs'][key],
                                                ftrs['neuralnet_inputs'][key]))
                    if 'pred_probs' in predict_dict:
                        predict_dict['pred_probs'] = np.concatenate((predict_dict['pred_probs'],
                                                                     ftrs['pred_probs']))

            os.chdir(home_dir)
            return predict_dict
        else:
            os.chdir(home_dir)
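# Inspection sketch (not part of the original): each feature file re-dumped above
# now carries 'pred_labels' (and optionally 'pred_probs'), so it can be examined
# directly. The glob pattern is the one the loop above uses; the concrete file
# names on disk are unknown here, and the joblib import should match whichever
# joblib the surrounding module uses.
import joblib
from glob import glob

for ftr_file in glob('features_created*'):
    ftrs = joblib.load(ftr_file)
    print(ftr_file, ftrs['pred_labels'][:10])
    if 'pred_probs' in ftrs:
        print('max probability per syllable:', ftrs['pred_probs'].max(axis=1)[:10])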
    verbose=2,
)

# train the model
model.fit(X_train, Y_train)

# test the model
from sklearn.metrics import accuracy_score, log_loss

Y_pred = model.predict(X_train)
Y_prob = model.predict_proba(X_train)
print('The accuracy obtained for train data is {:.4f} and the cross entropy is {:.4f}'
      .format(accuracy_score(Y_train, Y_pred), log_loss(Y_train, Y_prob)))

Y_pred = model.predict(X_test)
Y_prob = model.predict_proba(X_test)
print('The accuracy obtained for test data is {:.4f} and the cross entropy is {:.4f}'
      .format(accuracy_score(Y_test, Y_pred), log_loss(Y_test, Y_prob)))

# metrics of the model
from sklearn.metrics import classification_report, confusion_matrix

names = ['case' + str(s) for s in range(ncases)]
print(classification_report(Y_test, Y_pred, target_names=names))
print(confusion_matrix(Y_test, Y_pred))

# save the model and the scaler
from sklearn.externals import joblib

joblib.dump(model, 'analysis/models/neuralnetwork.pkl')
joblib.dump(scaler, 'analysis/models/neuralnetwork_scaler.pkl')
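# Reload sketch (not part of the original): since both the network and its scaler
# are persisted above, inference on new data just reverses the two dumps.
# `new_data` is hypothetical and assumed to have the same feature columns the
# scaler and model were fit on.
from sklearn.externals import joblib

model = joblib.load('analysis/models/neuralnetwork.pkl')
scaler = joblib.load('analysis/models/neuralnetwork_scaler.pkl')

new_data_scaled = scaler.transform(new_data)
new_pred = model.predict(new_data_scaled)
new_prob = model.predict_proba(new_data_scaled)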
def _train_and_analyze_predictors(self):
    log.info('Training and analyzing predictors...')
    problem = SupervisedLearningPipeline.CLASSIFICATION
    meta_report = None
    fm_io = FeatureMatrixIO()

    # Build paths for output.
    pipeline_file_name = inspect.getfile(inspect.currentframe())
    data_dir = SupervisedLearningPipeline._fetch_data_dir_path(self, pipeline_file_name)

    # Test BifurcatedSupervisedClassifier and SupervisedClassifier.
    algorithms_to_test = list()
    algorithms_to_test.extend(SupervisedClassifier.SUPPORTED_ALGORITHMS)
    # for algorithm in SupervisedClassifier.SUPPORTED_ALGORITHMS:
    #     algorithms_to_test.append('bifurcated-%s' % algorithm)
    log.debug('algorithms_to_test: %s' % algorithms_to_test)

    # Train and analyze algorithms.
    for algorithm in algorithms_to_test:
        log.info('Training and analyzing %s...' % algorithm)
        # If report_dir does not exist, make it.
        report_dir = '/'.join([data_dir, algorithm])
        if not os.path.exists(report_dir):
            os.makedirs(report_dir)

        # Define hyperparams.
        hyperparams = {}
        hyperparams['algorithm'] = algorithm
        hyperparams['hyperparam_strategy'] = SupervisedClassifier.EXHAUSTIVE_SEARCH
        hyperparams['max_iter'] = 1024
        hyperparams['random_state'] = self._random_state

        # If bifurcated algorithm, define bifurcator.
        if 'bifurcated' in algorithm:
            # bifurcator = LAB.pre == 0
            hyperparams['bifurcator'] = '%s.pre' % self._var
            hyperparams['bifurcation_strategy'] = BifurcatedSupervisedClassifier.EQUAL
            hyperparams['bifurcation_value'] = 0
            hyperparams['bifurcated'] = True

        # Train classifier.
        predictor_path = self._build_model_dump_path(algorithm)
        if os.path.exists(predictor_path) and 'bifurcated' not in algorithm:
            log.debug('Loading model from disk...')
            # TODO(sbala): Fix joblib.load so that it works for bifurcated
            # supervised classifiers.
            self._predictor = joblib.load(predictor_path)
            self._features = self._X_train.columns
            status = SupervisedClassifier.TRAINED
        else:
            status = SupervisedLearningPipeline._train_predictor(self, problem, [0, 1], hyperparams)

        # If training failed, write an error report.
        y_train_counts = self._y_train[self._y_train.columns[0]].value_counts()
        y_test_counts = self._y_test[self._y_test.columns[0]].value_counts()
        if status == SupervisedClassifier.INSUFFICIENT_SAMPLES:
            # Skip all analysis and reporting.
            # This will be true for all algorithms, so just return.
            # Build error report.
            algorithm_report = DataFrame(
                {
                    'lab_panel': [self._var],
                    'algorithm': [algorithm],
                    'error': [status],
                    'y_train.value_counts()': [y_train_counts.to_dict()],
                    'y_test.value_counts()': [y_test_counts.to_dict()]
                },
                columns=[
                    'lab_panel', 'algorithm', 'error',
                    'y_train.value_counts()', 'y_test.value_counts()'
                ])
            header = ['LabNormalityPredictionPipeline("%s", 10000)' % self._var]
            # Write error report.
            fm_io.write_data_frame_to_file(
                algorithm_report,
                '/'.join([report_dir, '%s-normality-prediction-report.tab' % self._var]),
                header)
        # If successfully trained, append to a meta report.
        elif status == SupervisedClassifier.TRAINED:
            pipeline_prefix = '%s-normality-prediction-%s' % (self._var, algorithm)
            SupervisedLearningPipeline._analyze_predictor(self, report_dir, pipeline_prefix)
            SupervisedLearningPipeline._analyze_predictor_traindata(self, report_dir)
            # continue  # Do not generate stats results here...
            if meta_report is None:
                meta_report = fm_io.read_file_to_data_frame(
                    '/'.join([report_dir, '%s-report.tab' % pipeline_prefix]))
            else:
                algorithm_report = fm_io.read_file_to_data_frame(
                    '/'.join([report_dir, '%s-report.tab' % pipeline_prefix]))
                log.debug('algorithm_report: %s' % algorithm_report)
                meta_report = meta_report.append(algorithm_report)
            # Write predictor to disk.
            predictor = SupervisedLearningPipeline.predictor(self)
            predictor_path = self._build_model_dump_path(algorithm)
            joblib.dump(predictor, predictor_path)

    # After building per-algorithm reports, write to meta report.
    # Note that if there were insufficient samples to build any of the
    # algorithms, then meta_report will still be None.
    if meta_report is not None:
        header = ['LabNormalityPredictionPipeline("%s", 10000)' % self._var]
        fm_io.write_data_frame_to_file(
            meta_report,
            '/'.join([data_dir, '%s-normality-prediction-report.tab' % self._var]),
            header)
def main():
    train_images, val_images = [], []
    train_folders = train_data
    val_folders = test_data if TESTING else val_data

    if not TRAINING:
        if len(sys.argv) != 2:
            print("ERROR! Usage: python3 train.py *load_directory (if validating or testing)*")
            return
        load_dir = './log/train/' + sys.argv[1]
        svm = joblib.load(load_dir + '/svm.pkl')
        kmeans = joblib.load(load_dir + '/kmeans.pkl')
        SURF_params = get_surf_params(load_dir + '/params.txt')

    if TRAINING:
        print("Getting train data...")
        train_labels_init = get_data(train_images, train_folders, class_labels_raw)
    else:
        print("Getting val/test data...")
        val_labels_init = get_data(val_images, val_folders, class_labels_raw)

    print("Getting SURF (train)...")
    if TRAINING:
        train_kps, train_descriptors = get_surf_features(
            train_images,
            hessian_threshold=hessian_threshold,
            upright=upright,
            extended=extended)
    else:
        print("Getting SURF (val/test)...")
        # print("hessian_threshold?: {}".format(int(SURF_params["hessian_threshold"])))
        # print("upright?: {}".format(str2bool(SURF_params["upright"])))
        # print("extended?: {}".format(str2bool(SURF_params["extended"])))
        val_kps, val_descriptors = get_surf_features(
            val_images,
            hessian_threshold=int(SURF_params["hessian_threshold"]),
            upright=str2bool(SURF_params["upright"]),
            extended=str2bool(SURF_params["extended"]))

    print("Rearranging data....")
    if TRAINING:
        train_descriptors_np_init = [np.array(curr_des) for curr_des in train_descriptors]
    else:
        val_descriptors_np_init = [np.array(curr_des) for curr_des in val_descriptors]

    train_descriptors_np, val_descriptors_np = [], []
    train_labels, val_labels = [], []
    if TRAINING:
        print("Clearing out invalid train")
        for i in range(len(train_descriptors_np_init)):
            if len(train_descriptors_np_init[i].shape) == 2:
                train_descriptors_np.append(train_descriptors_np_init[i])
                train_labels.append(train_labels_init[i])
    else:
        print("Clearing out invalid val/test")
        for i in range(len(val_descriptors_np_init)):
            if len(val_descriptors_np_init[i].shape) == 2:
                val_descriptors_np.append(val_descriptors_np_init[i])
                val_labels.append(val_labels_init[i])

    if TRAINING:
        print("Stacking train")
        train_descriptor_matrix = np.vstack(train_descriptors_np)
        train_image_index = np.ones((train_descriptor_matrix.shape[0],), dtype=int) * -1
    else:
        print("Stacking val/test")
        val_descriptor_matrix = np.vstack(val_descriptors_np)
        val_image_index = np.ones((val_descriptor_matrix.shape[0],), dtype=int) * -1

    if TRAINING:
        print("Indexer train")
        curr_pos = 0
        diff = 0
        for i in range(len(train_images)):
            if train_descriptors[i] is None:
                diff += 1
                continue
            next_pos = curr_pos + len(train_descriptors[i])
            train_image_index[curr_pos:next_pos] = i - diff
            curr_pos = next_pos
    else:
        print("Indexer val/test")
        curr_pos = 0
        diff = 0
        for i in range(len(val_images)):
            if val_descriptors[i] is None:
                diff += 1
                continue
            next_pos = curr_pos + len(val_descriptors[i])
            val_image_index[curr_pos:next_pos] = i - diff
            curr_pos = next_pos

    if TRAINING:
        print("Running k-means")
        kmeans = MiniBatchKMeans(
            n_clusters=vocab_size,
            precompute_distances=precompute_distances,
            verbose=kmeans_verbose,
            # n_jobs=n_jobs,  # num CPUs, -1 -> all CPUs
            # algorithm=algorithm
        ).fit(train_descriptor_matrix)
        train_centroid_labels = kmeans.labels_
        centroids = kmeans.cluster_centers_
    else:
        print("Getting centroid labels (val/test)")
        val_centroid_labels = kmeans.predict(val_descriptor_matrix)

    print("Building image histograms")
    if TRAINING:
        train_features = np.zeros((len(train_labels), vocab_size), dtype=int)
        for i in range(len(train_centroid_labels)):
            img_ind = train_image_index[i]
            train_features[img_ind, train_centroid_labels[i]] += 1
    else:
        val_features = np.zeros((len(val_labels), int(SURF_params['vocab_size'])), dtype=int)
        for i in range(len(val_centroid_labels)):
            img_ind = val_image_index[i]
            val_features[img_ind, val_centroid_labels[i]] += 1

    now = datetime.datetime.now(dateutil.tz.tzlocal())
    timestamp = now.strftime('%Y_%m_%d_%H_%M')

    if TRAINING:
        SVM_params = [(0.01, "linear"), (0.1, "linear"), (0.5, "linear"), (1.0, "linear"),
                      (0.01, "rbf"), (0.1, "rbf"), (0.5, "rbf"), (1.0, "rbf")]
        for param_set in SVM_params:
            C, kernel = param_set
            print("Training SVM")
            svm = SVC(C=C, kernel=kernel, verbose=svm_verbose, shrinking=shrinking)
            svm.fit(train_features, train_labels)

            pred_train = svm.predict(train_features)
            train_diffs = pred_train - train_labels
            train_error = len(np.nonzero(train_diffs)[0]) / (train_diffs.shape[0])
            print("Training error is: {}".format(train_error))

            print("Saving confusion matrix (train)...")
            confMat_train = confusion_matrix(train_labels, pred_train)
            print(confMat_train)
            plt.matshow(confMat_train)
            plt.show()

            print_params(C, kernel)
            print("Logging data to:")
            cwd = os.getcwd()
            directory = cwd + "/log/train/" + "reg_" + str(param_set[0]) + "_kernel_" + param_set[1] + timestamp
            print(directory)
            svm_filename = directory + '/svm.pkl'
            kmeans_filename = directory + '/kmeans.pkl'
            params_filename = directory + '/params.txt'
            train_confmat_file = directory + '/confmat_train.npy'
            os.makedirs(directory)
            joblib.dump(svm, svm_filename)
            joblib.dump(kmeans, kmeans_filename)
            np.save(train_confmat_file, confMat_train)
            save_params(params_filename, C, kernel)
    else:
        pred_val = svm.predict(val_features)
        val_diffs = pred_val - val_labels
        val_error = len(np.nonzero(val_diffs)[0]) / (val_diffs.shape[0])
        print("Val error is: {}".format(val_error))

        print("Saving confusion matrix (val)...")
        confMat_val = confusion_matrix(val_labels, pred_val)
        print(confMat_val)
        plt.matshow(confMat_val)
        plt.show()

        cwd = os.getcwd()
        curr = 'val' if VALIDATING else 'test'
        directory = cwd + '/log/' + curr + '/' + timestamp
        os.makedirs(directory)
        val_confmat_file = directory + '/confmat.npy'
        np.save(val_confmat_file, confMat_val)
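# The validation path above calls get_surf_params() and str2bool() without showing
# their definitions. The helpers below are minimal sketches of what they might look
# like, assuming params.txt stores one "key value" pair per line; the file layout
# and function bodies are assumptions, not taken from the original code.
def str2bool(s):
    # Interpret the usual truthy spellings; anything else is False.
    return str(s).strip().lower() in ('true', '1', 'yes')


def get_surf_params(params_path):
    # Parse "key value" lines into a dict of strings, e.g.
    # {'hessian_threshold': '400', 'upright': 'True', 'vocab_size': '200'}
    params = {}
    with open(params_path) as f:
        for line in f:
            parts = line.split(None, 1)
            if len(parts) == 2:
                params[parts[0]] = parts[1].strip()
    return params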
from keras import backend as K
K.set_image_data_format('channels_last')

(X_train, y_train), (X_test, y_test) = cifar10.load_data()

## Convert images to grayscale
# CIFAR-10 images are 3-channel RGB, so use the RGB-to-gray conversion; a list
# comprehension (rather than a lazy map object) keeps this working under Python 3.
import cv2
X_train = [cv2.cvtColor(x, cv2.COLOR_RGB2GRAY) for x in X_train]
X_test = [cv2.cvtColor(x, cv2.COLOR_RGB2GRAY) for x in X_test]

## Fit
from sklearn import svm
import numpy

clf = svm.SVC(gamma=0.001, C=100.)
X_train = numpy.array(X_train)
X_train = X_train.reshape((X_train.shape[0], -1), order='F')
y_train = numpy.array(y_train).flatten()
clf.fit(X_train, y_train)

## Save classifier to disk
from sklearn.externals import joblib
joblib.dump(clf, 'clasifiers/svn-classifier.pkl')
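## Evaluation sketch (not in the original): X_test/y_test are loaded above but never
## used. This assumes the same grayscale + Fortran-order flatten applied to X_train,
## and simply reloads the classifier that was just dumped.
from sklearn.externals import joblib
import numpy

clf = joblib.load('clasifiers/svn-classifier.pkl')
X_test_flat = numpy.array(X_test).reshape((len(X_test), -1), order='F')
y_test_flat = numpy.array(y_test).flatten()
print('test accuracy:', clf.score(X_test_flat, y_test_flat))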
plt.figure()
plt.plot(range(len(y_pred[120:250])), y_pred[120:250], 'b', label="price_predict")
plt.plot(range(len(y_pred[120:250])), y_test[120:250], 'r', label="price_test")
plt.legend(loc="upper right")
plt.show()

# Random forest
try:
    clf = joblib.load('random.h5')  # load the cached model
except:
    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    joblib.dump(clf, "random.h5")  # save it for next time
y_pred2 = clf.predict(X_test)

plt.figure()
plt.plot(range(len(y_pred2[120:250])), y_pred2[120:250], 'b', label="price_predict")
plt.plot(range(len(y_pred2[120:250])), y_test[120:250], 'r', label="price_test")
plt.legend(loc="upper right")
plt.show()

# Naive Bayes
try:
    mnb = joblib.load('mnb.pkl')
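# The load-or-train caching pattern above repeats once per model. A small helper
# can factor it out; the function name and signature below are illustrative, not
# from the original script.
import os
from sklearn.externals import joblib


def load_or_train(path, make_model, X, y):
    """Return a fitted model from `path` if it exists, else fit one and cache it."""
    if os.path.exists(path):
        return joblib.load(path)
    model = make_model()
    model.fit(X, y)
    joblib.dump(model, path)
    return model

# e.g. clf = load_or_train('random.h5', RandomForestClassifier, X_train, y_train)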
def MLR_model(Training_features, Training_targets, Testing_features, Testing_targets,
              stock_limit, Price_table):
    max_profit_to_loss = -10000000000000000
    min_error = 1
    for L1 in [40, 80]:
        for L2 in [20]:
            for L3 in [10]:
                for alpha in [0.001]:
                    MLPR = MLPRegressor(activation='relu',
                                        hidden_layer_sizes=(L1, L2, L3),
                                        solver='lbfgs',
                                        alpha=alpha,
                                        batch_size=50,
                                        learning_rate='adaptive',
                                        max_iter=2000000,
                                        early_stopping=True)
                    MLPR.fit(Training_features, Training_targets)
                    Train_predict = MLPR.predict(Training_features)

                    predict_price = []
                    for q in range(len(Testing_features)):
                        predict_price.append(MLPR.predict(Testing_features[q].reshape(1, -1)))

                    error = np.mean(
                        abs(np.array(Train_predict) - np.array(Training_targets))
                        / np.array(Training_targets))

                    # Calculate profit
                    stock = 0
                    loss = 0
                    brought = 0
                    profit = 0
                    # For plotting
                    t = list(range(len(Testing_targets)))
                    BP = []
                    B = []
                    SP = []
                    S = []
                    for i in range(6, len(predict_price)):
                        if i % 1 == 0:
                            current_price = Price_table[i]
                            if stock <= predict_price[i] and stock > 0:
                                # Sell the stock, if held, as soon as a down trend is hit
                                if (current_price - stock) >= 0:
                                    profit += (current_price - stock)
                                else:
                                    loss += abs(current_price - stock)
                                stock = 0
                                brought = 0
                                SP.append(t[i])
                                S.append(current_price)
                            elif current_price <= predict_price[i] and stock == 0:
                                # Buy stock at the bottom of the down trend
                                stock = current_price
                                brought = 1
                                BP.append(t[i])
                                B.append(current_price)

                    if loss == 0:
                        loss = 0.0001
                    if profit / loss >= max_profit_to_loss:
                        model = MLPR
                        max_profit = profit
                        max_profit_to_loss = profit / loss
                        prices = predict_price
                        Buy_points = BP
                        Buys = B
                        Sell_points = SP
                        Sells = S
                        # Save optimal model
                        joblib.dump(model, 'MLPR.pkl')

    plot_data = [[t, Price_table, 'm'], [np.array(t), prices, 'b'],
                 [Buy_points, Buys, 'ro'], [Sell_points, Sells, 'go']]
    metric = max_profit_to_loss
    if metric >= 10000:
        metric /= 15000
    return [max_profit, metric, plot_data]
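# Reload sketch (not in the original): the best regressor found by the grid search
# above is saved to 'MLPR.pkl', so it can be reused for new price predictions.
# `new_features` is hypothetical and assumed to have the same columns as
# Testing_features.
from sklearn.externals import joblib

best_model = joblib.load('MLPR.pkl')
predicted_prices = best_model.predict(new_features)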
def get_weather(df, how='learning', freq=None):
    """
    Match a timeseries with weather data.

    df : [DataFrame]
    how : if 'learning', match with historical weather data;
          if 'forecast', match with forecast data (freq must be given for this option)
    freq : Timedelta string, e.g. "1H"
    """
    df = df.reset_index()

    # Check params
    if how not in ['learning', 'forecast']:
        logger.error('Bad option for get_weather. You must choose between learning or forecast')
        return df
    if how == 'forecast' and freq is None:
        logger.error("For forecast option, we must specify freq. Ex freq='1H'")

    # Process for learning matching
    if how == 'learning':
        lyon_meteo = pd.read_csv('data/lyon_weather.csv', parse_dates=['date'])
        lyon_meteo.rename(columns={'date': 'ts'}, inplace=True)

        # Have to label-encode weather_desc
        LE = LabelEncoder()
        lyon_meteo['weather_desc'] = LE.fit_transform(lyon_meteo['weather_desc'])
        # Dump LabelEncoder
        joblib.dump(LE, 'model/Label_Encoder_Weather.pkl')

        # Resample data to 10-minute intervals
        clean_lyon_meteo = lyon_meteo.resample("10T", on="ts").mean().bfill().reset_index()
        df = df.merge(clean_lyon_meteo[['ts', 'temp', 'humidity', 'weather_desc', 'cloudiness']],
                      on='ts', how='left')
        df = df.sort_index()
        df = df.set_index('ts')
        return df

    # Process for forecast matching
    if how == 'forecast':
        lyon_forecast = pd.read_csv('data/lyon_forecast.csv', parse_dates=['forecast_at', 'ts'])
        lyon_forecast['delta'] = lyon_forecast['ts'] - lyon_forecast['forecast_at']

        # Filter on delta with freq
        lyon_forecast = lyon_forecast[lyon_forecast['delta'] == freq]
        lyon_forecast.drop_duplicates(subset=['ts', 'delta'], keep='first', inplace=True)

        # Label-encode weather_desc with the encoder fitted during learning
        LE = joblib.load('model/Label_Encoder_Weather.pkl')
        lyon_forecast['weather_desc'] = LE.transform(lyon_forecast['weather_desc'])

        # Merging
        # We take the last forecast (on freq) using a backward merge
        df = df.sort_values('ts')
        df_index_save = df.index  # save the index; the merge will destroy it
        df = pd.merge_asof(left=df,
                           right=lyon_forecast[['ts', 'temp', 'humidity', 'weather_desc', 'cloudiness']],
                           on='ts',
                           direction='backward')
        df.index = df_index_save
        # Re-sort to the original order (so the y_test order is not lost)
        df = df.sort_index()
        df = df.set_index('ts')
        return df
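# Toy illustration (made-up data, not from the original) of the backward merge_asof
# used in the forecast branch: each observation picks up the most recent forecast
# at or before its timestamp.
import pandas as pd

obs = pd.DataFrame({'ts': pd.to_datetime(['2017-07-01 10:05', '2017-07-01 11:20'])})
fc = pd.DataFrame({'ts': pd.to_datetime(['2017-07-01 10:00', '2017-07-01 11:00']),
                   'temp': [24.0, 26.5]})
print(pd.merge_asof(obs.sort_values('ts'), fc, on='ts', direction='backward'))
#                    ts  temp
# 0 2017-07-01 10:05:00  24.0
# 1 2017-07-01 11:20:00  26.5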
# -*- coding:utf-8 -*-
from sklearn.cluster import KMeans
from retrieval import load_feat_db
from sklearn.externals import joblib
from myconfig import DATASET_BASE, N_CLUSTERS
import os

if __name__ == '__main__':
    feats, color_feats, labels = load_feat_db()
    model = KMeans(n_clusters=N_CLUSTERS, random_state=0, n_jobs=-1).fit(feats)
    model_path = os.path.join(DATASET_BASE, r'models', r'kmeans.m')
    joblib.dump(model, model_path)
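# Reload sketch (not in the original): assign cluster IDs to new feature vectors
# with the saved model. `new_feats` is hypothetical and must have the same
# dimensionality as `feats`; `model_path` is the path built above.
from sklearn.externals import joblib

kmeans = joblib.load(model_path)
cluster_ids = kmeans.predict(new_feats)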
def save(self):
    self.active = True
    joblib.dump(self.learn, self.name_file)
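# A matching load method is the natural counterpart to save(); this is a sketch,
# assuming the same attribute names (`learn`, `name_file`, `active`) used above.
def load(self):
    self.learn = joblib.load(self.name_file)
    self.active = True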
    long_status = 0
    start_bar = start_bar + 1

print('trades in profit: ' + str(round(operations_in_profit)) +
      '  gain $ ' + str(round(operations_in_profit * tp_factor)))
print('trades in loss: ' + str(round(operations_in_loss)) +
      '  loss $ ' + str(round(operations_in_loss)))

df_equity = pd.DataFrame(columns=["EquityLine"])
df_equity['EquityLine'] = equity_curve
EquityLine = df_equity['EquityLine']
CloseLine = EquityLine.plot()

from sklearn.externals import joblib
joblib.dump(modelSVC, 'modelEURUSD0-35kbars.pkl')

# Distribution of candle body sizes at the swing-low candles
bodyGraph = df_Lfail['Shadow_n'].plot.hist(title='body size', bins=50, alpha=0.5,
                                           color='blue', normed=True)
# bodyGraph2 = df_serie['Body'].plot.hist(title='body size', bins=300, alpha=0.2,
#                                         color='red', normed=True)

# Prepare the validation df like the backtest df, i.e. with a date index, OHLC,
# next/past columns and body; drop the Volume column, which is not needed
df_validazione = df_validazione.drop(['Volume'], axis=1)
df_validazione = df_validazione.reset_index(drop=True)

## Create the OHLC "next" and "past" columns so each row holds the full three-bar pattern
# Collect the next bar's data, so that when minima and maxima are extracted we have
# what is needed to look at correlations and three-bar patterns
df_validazione['Open_n'] = df_validazione["Open"].shift(-1)
df_validazione['High_n'] = df_validazione["High"].shift(-1)
def train_model(feats_csv):
    df = pd.read_csv(feats_csv).iloc[:, 1:]
    y = np.ravel(df.iloc[:, -1:])
    X = np.array(df.iloc[:, :-1])

    ########## 15 best features selected with the ANOVA F-value score function ##########
    selector = SelectKBest(f_classif, k=15).fit(X, y)
    X_new = selector.transform(X)
    selected_features = selector.get_support(indices=True)

    ########## kNN with Manhattan distance ##########
    # Preprocessing: data scaling
    min_max_scaler = MinMaxScaler()
    X_new = min_max_scaler.fit_transform(X_new)

    model = KNeighborsClassifier(n_neighbors=1, algorithm='brute',
                                 metric='manhattan', weights='uniform')
    model.fit(X_new, y)

    newdir = '../kNN_clfr'
    os.mkdir(newdir)
    joblib.dump(model, os.path.join(newdir, 'kNN.pkl'))
    return
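# train_model() persists only the fitted kNN model, but scoring new data also needs
# the fitted SelectKBest and MinMaxScaler. A sketch of the extra dumps one might add
# at the end of train_model(), plus the corresponding reload, is shown below; the
# file names and the `X_raw` matrix are assumptions, not part of the original code.
#
#     joblib.dump(selector, os.path.join(newdir, 'selector.pkl'))
#     joblib.dump(min_max_scaler, os.path.join(newdir, 'scaler.pkl'))

# Reload sketch for a new raw feature matrix X_raw (hypothetical):
selector = joblib.load('../kNN_clfr/selector.pkl')
scaler = joblib.load('../kNN_clfr/scaler.pkl')
model = joblib.load('../kNN_clfr/kNN.pkl')
preds = model.predict(scaler.transform(selector.transform(X_raw)))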