def train_svm(x_train, y_train, x_valid, y_valid, loss, penalty, alpha, n_iter):
    full_svm_classifier = SGDClassifier(loss=loss, penalty=penalty, alpha=alpha,
                                        verbose=True, class_weight='balanced',
                                        n_iter=n_iter, learning_rate="optimal")
    full_svm_classifier.fit(x_train, y_train)
    cm = confusion_matrix(y_valid, full_svm_classifier.predict(x_valid))
    accuracy_negative = cm[0, 0] / np.sum(cm[0, :])
    accuracy_positive = cm[1, 1] / np.sum(cm[1, :])
    precision = cm[1, 1] / (cm[1, 1] + cm[0, 1])
    recall = cm[1, 1] / (cm[1, 1] + cm[1, 0])
    f1_score = 2 * precision * recall / (precision + recall)
    print(accuracy_positive, accuracy_negative, precision, recall, f1_score)
    return full_svm_classifier
def test_underflow_or_overlow():
    with np.errstate(all="raise"):
        # Generate some weird data with hugely unscaled features
        rng = np.random.RandomState(0)
        n_samples = 100
        n_features = 10
        X = rng.normal(size=(n_samples, n_features))
        X[:, :2] *= 1e300
        assert_true(np.isfinite(X).all())

        # Use MinMaxScaler to scale the data without introducing a numerical
        # instability (computing the standard deviation naively is not possible
        # on this data)
        X_scaled = MinMaxScaler().fit_transform(X)
        assert_true(np.isfinite(X_scaled).all())

        # Define a ground truth on the scaled data
        ground_truth = rng.normal(size=n_features)
        y = (np.dot(X_scaled, ground_truth) > 0.0).astype(np.int32)
        assert_array_equal(np.unique(y), [0, 1])

        model = SGDClassifier(alpha=0.1, loss="squared_hinge", n_iter=500)

        # smoke test: model is stable on scaled data
        model.fit(X_scaled, y)
        assert_true(np.isfinite(model.coef_).all())

        # model is numerically unstable on unscaled data
        msg_regxp = (r"Floating-point under-/overflow occurred at epoch #.*"
                     " Scaling input data with StandardScaler or MinMaxScaler"
                     " might help.")
        assert_raises_regexp(ValueError, msg_regxp, model.fit, X, y)
def algo(a):
    global data
    global week
    target = data['target']
    columns = ["id", "cpu", "creator", "dbs", "dtype", "era", "nblk", "nevt",
               "nfiles", "nlumis", "nrel", "nsites", "nusers", "parent",
               "primds", "proc_evts", "procds", "rnaccess", "rnusers",
               "rtotcpu", "size", "tier", "totcpu", "wct", "naccess"]
    data = data[columns]
    week['target'] = 0
    week['target'] = week.apply(convert, axis=1)
    week['target'] = week['target'].astype(int)
    test1 = week
    week = week[columns]
    if a == 'rf':
        # RANDOM FOREST CLASSIFIER
        rf = RandomForestClassifier(n_estimators=100)
        rf = rf.fit(data, target)
        predictions = rf.predict(week)
        cal_score("RANDOM FOREST", rf, predictions, test1['target'])
    if a == "sgd":
        # SGD CLASSIFIER
        clf = SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1,
                            eta0=0.0, fit_intercept=True, l1_ratio=0.15,
                            learning_rate='optimal', loss='hinge', n_iter=5,
                            n_jobs=1, penalty='l2', power_t=0.5,
                            random_state=None, shuffle=True, verbose=0,
                            warm_start=False)
        clf.fit(data, target)
        predictions = clf.predict(week)
        cal_score("SGD Regression", clf, predictions, test1['target'])
    if a == "nb":
        clf = GaussianNB()
        clf.fit(data, target)
        predictions = clf.predict(week)
        cal_score("NAIVE BAYES", clf, predictions, test1['target'])
def SGDC_SVM_Classifier(X_train, X_cv, X_test, Y_train, Y_cv, Y_test, Actual_DS):
    print("***************Starting SVM***************")
    t0 = time()
    clf = SGDClassifier(loss='log', penalty='l2', alpha=1e-5, n_iter=100)
    clf.fit(X_train, Y_train)
    preds = clf.predict(X_cv)
    score = clf.score(X_cv, Y_cv)
    print("{0:.2f}%".format(100 * score))
    Summary = pd.crosstab(label_enc.inverse_transform(Y_cv),
                          label_enc.inverse_transform(preds),
                          rownames=['actual'], colnames=['preds'])
    Summary['pct'] = (Summary.divide(Summary.sum(axis=1), axis=1)).max(axis=1) * 100
    print(Summary)

    # Check with log loss function
    epsilon = 1e-15
    #ll_output = log_loss_func(Y_cv, preds, epsilon)
    preds2 = clf.predict_proba(X_cv)
    ll_output2 = log_loss(Y_cv, preds2, eps=1e-15, normalize=True)
    print(ll_output2)
    print("done in %0.3fs" % (time() - t0))
    preds3 = clf.predict_proba(X_test)
    #preds4 = clf.predict_proba((Actual_DS.ix[:,'feat_1':]))
    preds4 = clf.predict_proba(Actual_DS)
    print("***************Ending SVM***************")
    return pd.DataFrame(preds2), pd.DataFrame(preds3), pd.DataFrame(preds4)
def plot_sgd_separator():
    # we create 50 separable points
    X, Y = make_blobs(n_samples=50, centers=2, random_state=0, cluster_std=0.60)

    # fit the model
    clf = SGDClassifier(loss="hinge", alpha=0.01, n_iter=200, fit_intercept=True)
    clf.fit(X, Y)

    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)
    X1, X2 = np.meshgrid(xx, yy)
    Z = np.empty(X1.shape)
    for (i, j), val in np.ndenumerate(X1):
        x1 = val
        x2 = X2[i, j]
        p = clf.decision_function([[x1, x2]])  # decision_function expects a 2-D array of samples
        Z[i, j] = p[0]
    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'
    ax = plt.axes()
    ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)
    ax.axis('tight')
def validate():
    """
    Runs a 10-fold cross validation on the classifier, reporting accuracy.
    """
    trainDf = pd.read_csv("../NewData/train.csv")
    X = np.matrix(pd.DataFrame(trainDf, index=None,
                               columns=["invited", "user_reco", "evt_p_reco",
                                        "evt_c_reco", "user_pop", "frnd_infl",
                                        "evt_pop"]))
    y = np.array(trainDf.interested)
    nrows = len(trainDf)
    kfold = KFold(nrows, 10)
    avgAccuracy = 0
    run = 0
    for train, test in kfold:
        Xtrain, Xtest, ytrain, ytest = X[train], X[test], y[train], y[test]
        clf = SGDClassifier(loss="log", penalty="l2")
        clf.fit(Xtrain, ytrain)
        accuracy = 0
        ntest = len(ytest)
        for i in range(0, ntest):
            yt = clf.predict(Xtest[i, :])
            if yt == ytest[i]:
                accuracy += 1
        # float division: avoids Python 2 integer truncation to 0
        accuracy = float(accuracy) / ntest
        print "accuracy (run %d): %f" % (run, accuracy)
        avgAccuracy += accuracy
        run += 1
    print "Average accuracy", (avgAccuracy / run)
def main():
    """ Generates features and fits classifier. """
    featureIndexes = processData(os.path.join(dataFolder, "avito_train.tsv"), itemsLimit=300000)
    trainFeatures, trainTargets, trainItemIds = processData(
        os.path.join(dataFolder, "avito_train.tsv"), featureIndexes, itemsLimit=300000)
    testFeatures, testItemIds = processData(
        os.path.join(dataFolder, "avito_test.tsv"), featureIndexes)
    joblib.dump((trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds),
                os.path.join(dataFolder, "train_data.pkl"))
    trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(
        os.path.join(dataFolder, "train_data.pkl"))
    logging.info("Feature preparation done, fitting model...")
    clf = SGDClassifier(loss="log", penalty="l2", alpha=1e-4, class_weight="auto")
    clf.fit(trainFeatures, trainTargets)
    logging.info("Predicting...")
    predicted_scores = clf.predict_proba(testFeatures).T[1]
    logging.info("Write results...")
    output_file = "avito_starter_solution.csv"
    logging.info("Writing submission to %s" % output_file)
    f = open(os.path.join(dataFolder, output_file), "w")
    f.write("id\n")
    for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse=True):
        f.write("%d\n" % (item_id))
    f.close()
    logging.info("Done.")
def test_create_model(self):
    print("labeled sentence worked?")
    x_train = labelizeReviews(self.xTrain, 'TRAIN')
    x_test = labelizeReviews(self.xTest, 'TEST')
    model_dm = gensim.models.Doc2Vec(min_count=1, window=5, size=self.size,
                                     sample=1e-3, negative=5, workers=3)
    model_dbow = gensim.models.Doc2Vec(min_count=1, window=6, size=self.size,
                                       sample=1e-3, negative=5, dm=0, workers=3)
    sentences = x_train
    model_dm.build_vocab(sentences)
    model_dbow.build_vocab(sentences)
    # npArray = np.array(x_train)
    for epoch in range(10):
        print("Starting epoch:", str(epoch))
        # perm = np.random.permutation(npArray.shape[0])
        model_dm.train(random.sample(sentences, len(sentences)))
        model_dbow.train(random.sample(sentences, len(sentences)))
        # model_dm.train(x_train)
    train_vecs = getVecs(model_dm, x_train, self.size)
    train_vecs_dbow = getVecs(model_dbow, x_train, self.size)
    train_vecs_total = np.hstack((train_vecs, train_vecs_dbow))
    sentences = x_test
    for epoch in range(10):
        print("Starting epoch:", str(epoch))
        # perm = np.random.permutation(npArray.shape[0])
        model_dm.train(random.sample(sentences, len(sentences)))
        model_dbow.train(random.sample(sentences, len(sentences)))
    # build the test vectors from the test reviews
    test_vecs = getVecs(model_dm, x_test, self.size)
    test_vecs_dbow = getVecs(model_dbow, x_test, self.size)
    test_vecs_total = np.hstack((test_vecs, test_vecs_dbow))
    lr = SGDClassifier(loss='log', penalty='l1')
    lr.fit(train_vecs_total, self.labelsTrain[:self.samples])
    print('Test Accuracy: %.2f' % lr.score(test_vecs_total, self.labelsTest[:self.samples]))
def classify(dummy_train, dummy_test, feature_pkl, output_file):
    # Train classifier, iterating over subsets
    # Load Features
    print 'Loading features...'
    featureIndex, trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(feature_pkl)
    trainTargets = np.array(trainTargets)
    testItemIds = np.array(testItemIds)
    predicted_ids = []
    predicted_scores = []
    # SGD Logistic Regression per sample
    clf = SGDClassifier(alpha=3.16227766017e-08, class_weight='auto', epsilon=0.1,
                        eta0=0.0, fit_intercept=True, l1_ratio=0.15,
                        learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
                        penalty='elasticnet', power_t=0.5, random_state=None,
                        shuffle=False, verbose=0, warm_start=False)
    for col in range(np.shape(dummy_train)[1]):
        # Get nonzero dummy indices as array
        idx_train = dummy_train[:, col].astype('bool').T.toarray()[0]
        print 'Training subset {} of {}...'.format(col, np.shape(dummy_train)[1])
        sub_train = normalize(trainFeatures.tocsr()[idx_train, :], norm='l2', axis=0)
        clf.fit(sub_train, trainTargets[idx_train])
        # Use probabilities instead of binary class prediction in order to generate a ranking
        idx_test = dummy_test[:, col].astype('bool').T.toarray()[0]
        sub_test = normalize(testFeatures.tocsr()[idx_test, :], norm='l2', axis=0)
        predicted_scores += clf.predict_proba(sub_test).T[1].tolist()
        predicted_ids += testItemIds[idx_test].tolist()
    with open(os.path.splitext(feature_pkl)[0] + '_' + output_file, 'w') as out_fid:
        out_fid.write("id\n")
        for pred_score, item_id in sorted(zip(predicted_scores, predicted_ids), reverse=True):
            # only writes item_id per output spec, but may want to look at predicted_scores
            out_fid.write("%d\n" % (item_id))
def do_classify():
    corpus = MyCorpus()
    # tfidf_model = TfidfModel(corpus)
    corpus_idf = tfidf_model[corpus]
    # corpus_lsi = lsi_model[corpus_idf]
    num_terms = len(corpus.dictionary)
    # num_terms = 400
    corpus_sparse = matutils.corpus2csc(corpus_idf, num_terms).transpose(copy=False)
    # print corpus_sparse.shape
    # corpus_dense = matutils.corpus2dense(corpus_idf, len(corpus.dictionary))
    # print corpus_dense.shape
    penalty = "l2"
    clf = SGDClassifier(loss="hinge", penalty=penalty, alpha=0.0001, n_iter=50,
                        fit_intercept=True)
    # clf = LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)
    y = np.array(corpus.cls_y)
    # print y.shape
    clf.fit(corpus_sparse, y)
    filename = os.path.join(HERE, "sgdc_clf.pkl")
    _ = joblib.dump(clf, filename, compress=9)
    print "train completely"

    X_test = []
    X_label = []
    for obj in SogouCorpus.objects.filter(id__in=corpus.test_y):
        X_test.append(obj.tokens)
        X_label.append(cls_ids[obj.classify])
        # result = classifier.predict(obj.tokens)
    test_corpus = [dictionary.doc2bow(s.split(",")) for s in X_test]
    test_corpus = tfidf_model[test_corpus]
    test_corpus = matutils.corpus2csc(test_corpus, num_terms).transpose(copy=False)
    pred = clf.predict(test_corpus)
    score = metrics.f1_score(X_label, pred)
    print ("f1-score: %0.3f" % score)
def classify_reviews():
    import featurizer
    import gen_training_data
    import numpy as np
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import SGDClassifier

    data = gen_training_data.gen_data()
    stemmed_data = featurizer.stem(data)
    tfidf = featurizer.tfidf(data)
    clf = MultinomialNB().fit(tfidf['train_tfidf'], data['training_labels'])
    predicted = clf.predict(tfidf['test_tfidf'])
    num_wrong = 0
    for expected, guessed in zip(data['testing_labels'], predicted):
        if expected - guessed != 0:
            num_wrong += 1
    print("num_wrong: %d" % num_wrong)

    sgd_clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5,
                            random_state=42)
    _ = sgd_clf.fit(tfidf['train_tfidf'], data['training_labels'])
    sgd_pred = sgd_clf.predict(tfidf['test_tfidf'])
    print(np.mean(sgd_pred == data['testing_labels']))

    stem_tfidf = featurizer.tfidf(stemmed_data)
    _ = sgd_clf.fit(stem_tfidf['train_tfidf'], data['training_labels'])
    sgd_stem_prd = sgd_clf.predict(stem_tfidf['test_tfidf'])
    print(np.mean(sgd_stem_prd == data['testing_labels']))
def SGD(x, y):
    # Using Stochastic Gradient Descent from sklearn
    from sklearn.linear_model import SGDClassifier
    clf = SGDClassifier()
    clf.fit(x, y)
    return clf.predict(x)
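# Note that the SGD() helper above predicts on the same data it was fit on,
# which only measures training fit. A minimal sketch of a held-out evaluation
# instead, assuming toy data made up here purely for illustration:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier

X_toy, y_toy = make_classification(n_samples=200, n_features=10, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_toy, y_toy, test_size=0.25, random_state=0)
clf_toy = SGDClassifier(loss="hinge", penalty="l2", random_state=0)
clf_toy.fit(X_tr, y_tr)
print(clf_toy.score(X_te, y_te))  # held-out accuracy, not training accuracy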
def buildModel(size):
    with open('Sentiment Analysis Dataset.csv', 'rb') as csvfile:
        pos_tweets = []
        neg_tweets = []
        spamreader = csv.reader(csvfile, delimiter=',')
        for row in spamreader:
            if row[1] == '1':
                if not (len(pos_tweets) > size):
                    pos_tweets.append(_cleanTweet(row[3]))
            else:
                if not (len(neg_tweets) > size):
                    neg_tweets.append(_cleanTweet(row[3]))
    y = np.concatenate((np.ones(len(pos_tweets[0:size])),
                        np.zeros(len(neg_tweets[0:size]))))
    x_train, x_test, y_train, y_test = train_test_split(
        np.concatenate((pos_tweets[0:size], neg_tweets[0:size])), y, test_size=0.2)
    x_train = _cleanText(x_train)
    x_test = _cleanText(x_test)
    n_dim = 100
    # Initialize model and build vocab
    imdb_w2v = Word2Vec(size=n_dim, min_count=10)
    imdb_w2v.build_vocab(x_train)
    imdb_w2v.train(x_train)
    train_vecs = np.concatenate([buildWordVector(z, n_dim, imdb_w2v) for z in x_train])
    train_vecs = scale(train_vecs)
    # Train word2vec on test tweets
    imdb_w2v.train(x_test)
    # Build test tweet vectors then scale
    test_vecs = np.concatenate([buildWordVector(z, n_dim, imdb_w2v) for z in x_test])
    test_vecs = scale(test_vecs)
    lr = SGDClassifier(loss='log', penalty='l1')
    lr.fit(train_vecs, y_train)
    imdb_w2v.save("imdb_w2v")
    f = open("Accuracy.txt", "w")
    f.write(str(lr.score(test_vecs, y_test)) + " " + str(size * 2))
    f.close()
def main(date):
    """
    Runs linear regression (classification) between the herbicide resistance
    classes based on all wavelengths. The weights associated with each
    wavelength are then plotted, allowing the user to see the contribution to
    classification by each wavelength.

    :param date: (string) Data collection date YYYY_MMDD
    :return: (None)
    """
    # Load the training data from disk
    X, y = FileIO.loadTrainingData(date)
    X = np.nan_to_num(X)

    # Train the classifier on the loaded data
    clf = SGDClassifier()
    clf.fit(X, y)

    # Plot the feature weights to visualize feature contributions
    featureWeights = np.fabs(clf.coef_)
    for i in xrange(3):
        plt.plot(WAVELENGTHS, featureWeights[i])
        plt.title("Linear Classifier Weights for " +
                  RESISTANCE_STRINGS[INDEX_TO_LABEL[i]] + " vs Others")
        plt.xlabel("Wavelength (nm)")
        plt.ylabel("Absolute Weight")
        plt.show()
class twoclass(SGDClassifier):
    # THE HACK IS NOW GETTING EVEN MORE EVIL
    def __init__(self):
        self.clazz = SGDClassifier(loss='log')

    def fit(self, X, y, crossval=False):
        if crossval:
            print "layers crossvalscore:", sklearn.model_selection.cross_val_score(
                SGDClassifier(loss='log'), X, y).mean()
        self.clazz.fit(X, y)
        self.intercept_ = self.clazz.intercept_
        self.classes_ = self.clazz.classes_
        return self

    # eden cant annotate two classes if the esti is not a sgdregressor
    # -> this hack is made!
    '''
    details:
    decission function returns a one d array.
    eden only accepts these if the estimater is instance of sgdregressor.
    so i make a two d array from my 1 d array.
    if i hack something like this in the future maybe the intercept array
    needs to be provided.. (see the annotator code)
    '''
    # default guy:
    #def decision_function(self, vector):
    #    answer = super(self.__class__, self).decision_function(vector)
    #    return np.vstack((answer, (answer-1))).T

    def decision_function(self, vector):
        return self.clazz.predict_proba(vector)
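# A small standalone sketch of the shape trick described in the docstring above
# (the toy margins here are made up for illustration): a 1-D decision_function
# output is stacked into two columns so downstream code expecting a 2-D array
# accepts it, as in the commented-out "default guy" version.
import numpy as np

answer = np.array([0.7, -0.2, 1.3])        # pretend 1-D margins from decision_function
two_d = np.vstack((answer, answer - 1)).T  # shape (3, 2)
print(two_d.shape)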
def runSGDPipeline(entries, langs):
    t0 = time()
    sgd_pipeline = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 1), max_features=n_features)),
        ('tfidf', TfidfTransformer(use_idf=True)),
        ('clf', SGDClassifier(loss='squared_hinge', penalty='l2', alpha=0.001,
                              n_iter=5, random_state=42))])

    vect = CountVectorizer(ngram_range=(1, 1), max_features=n_features)
    X_train_counts = vect.fit_transform(entries)
    tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tfidf = tfidf.fit_transform(X_train_counts)

    clf = SGDClassifier(loss='squared_hinge', penalty='l2', alpha=0.001,
                        n_iter=5, random_state=42)
    clf.fit(X_train_tfidf, langs)

    X_new_counts = vect.transform(entries)
    X_new_tfidf = tfidf.transform(X_new_counts)
    predicted = clf.predict(X_new_tfidf.toarray())

    print(np.mean(predicted == langs))
    print(metrics.classification_report(langs, predicted, target_names=langs))
    print(metrics.confusion_matrix(langs, predicted))
    print("Took %s seconds." % (time() - t0))
    print("n_samples: %d, n_features: %d" % X_train_tfidf.shape)
    return sgd_pipeline
def sgc_test(X, y, weight):
    from sklearn.linear_model import SGDClassifier
    from sklearn import cross_validation
    from sklearn.metrics import confusion_matrix
    from sklearn.preprocessing import StandardScaler

    for i in range(0, 1):
        X_train, X_test, y_train, y_test, weight_train, weight_test = \
            cross_validation.train_test_split(X, y, weight, test_size=0.2,
                                              random_state=0)
        clf = SGDClassifier(loss="hinge", n_iter=100, n_jobs=-1, penalty="l2")
        #clf = LogisticRegression(max_iter=100)

        scaler = StandardScaler(with_mean=False)
        scaler.fit(X_train)  # Don't cheat - fit only on training data
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)  # apply same transformation to test data

        clf.fit(X_train, y_train, sample_weight=weight_train)

        y_pred = clf.predict(X_train)
        #print(confusion_matrix(y_train, y_pred))
        print(clf.score(X_train, y_train, weight_train))

        y_pred = clf.predict(X_test)
        #print(confusion_matrix(y_test, y_pred))
        print(clf.score(X_test, y_test, weight_test))
def crossvalidate(feas, labels, param):
    labels = np.array(list(labels), dtype=int)
    accs = []
    for train_ids, valid_ids in StratifiedKFold(labels, 10):
        idf = train_idf([feas[i] for i in train_ids])
        X, vocab = extract_feas(feas, idf)
        #lda = train_lda(X, vocab, num_topics)
        #X = transform_lda(X, lda)
        labels_train = labels[train_ids].copy()
        weights = balance_weights(labels_train, param['bg_weight'])
        labels_train[labels_train == 0] = 1
        model = SGDClassifier(loss='log', alpha=param['regu'] / len(labels_train),
                              fit_intercept=True, shuffle=True, n_iter=50)
        model.fit(X[train_ids], labels_train, sample_weight=weights)
        pp = model.predict_proba(X[valid_ids])
        pred_labels = np.argmax(pp, 1)
        pred_labels = model.classes_[pred_labels]
        #a = accuracy(labels[valid_ids], pred_labels, 1)
        # return all scores for "good" class
        assert model.classes_[1] == 2
        pred_scores = pp[:, 1]
        a = avg_precision(labels[valid_ids], pred_scores)
        print '%.2f' % a,
        accs.append(a)
    return np.mean(accs)
def train_vectorized(feats, Y, model_path=None, grid=False):
    # Vectorize labels
    labels = [labels_map[y] for y in Y]
    Y = np.array(labels)

    # Vectorize feature dictionary
    vec = DictVectorizer()
    X = vec.fit_transform(feats)
    norm_mat(X, axis=0, copy=False)

    # Grid Search
    if grid:
        print 'Performing Grid Search'
        clf = do_grid_search(X, Y)
    else:
        #clf = LinearSVC(C=0.1, class_weight='auto')
        #clf = LogisticRegression(C=0.1, class_weight='auto')
        clf = SGDClassifier(penalty='elasticnet', alpha=0.001, l1_ratio=0.85,
                            n_iter=1000, class_weight='auto')
        clf.fit(X, Y)

    # Save model
    if model_path:
        with open(model_path + '.dict', 'wb') as f:
            pickle.dump(vec, f)
        with open(model_path + '.model', 'wb') as f:
            pickle.dump(clf, f)

    # return model
    return vec, clf
class kernelsvm():
    def __init__(self, theta0, alpha, loss_metric):
        self.theta0 = theta0
        self.alpha = alpha
        self.loss_metric = loss_metric

    def fit(self, X, y, idx_SR):
        n_SR = len(idx_SR)
        self.feature_map_nystroem = General_Nystroem(kernel='rbf',
                                                     gamma=self.theta0,
                                                     n_components=n_SR)
        X_features = self.feature_map_nystroem.fit_transform(X, idx_SR)
        print("fitting SGD")
        self.clf = SGDClassifier(loss=self.loss_metric, alpha=self.alpha)
        self.clf.fit(X_features, y)
        print("fitting SGD finished")

    def predict(self, X):
        print("Predicting")
        X_transform = self.feature_map_nystroem.transform(X)
        return self.clf.predict(X_transform), X_transform

    def decision_function(self, X):
        # X should be the transformed input!
        return self.clf.decision_function(X)

    def err_rate(self, y_true, y_pred):
        acc = accuracy_score(y_true, y_pred)
        err_rate = 1.0 - acc
        return err_rate

    def get_params(self):
        return self.clf.get_params()
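# kernelsvm above relies on a custom General_Nystroem transformer; a rough
# sketch of the same kernel-approximation + linear-SGD pattern using
# scikit-learn's stock Nystroem instead (toy data and parameter values are
# mine, not the class's actual configuration):
from sklearn.datasets import make_classification
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import SGDClassifier

X_demo, y_demo = make_classification(n_samples=300, random_state=0)
feature_map = Nystroem(kernel='rbf', gamma=0.2, n_components=100, random_state=0)
X_feat = feature_map.fit_transform(X_demo)      # approximate RBF feature space
lin = SGDClassifier(loss='hinge', alpha=1e-4, random_state=0)
lin.fit(X_feat, y_demo)                         # linear model in the lifted space
print(lin.score(feature_map.transform(X_demo), y_demo))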
def train(docs, labels, regu=1, bg_weight=.1):
    '''
    :param docs: iterator of (title, body) pairs
    :param labels: integer labels for docs (0 is weakly-negative)
    :return: model
    '''
    num_topics = 50
    feas = map(extract_words, docs)
    labels = np.array(list(labels), dtype=int)
    idf = train_idf(feas)
    X, vocab = extract_feas(feas, idf)
    #lda = train_lda(X, vocab, num_topics)
    #X = transform_lda(X, lda)

    # set up sample weights
    weights = balance_weights(labels, bg_weight)
    labels = labels.copy()
    labels[labels == 0] = 1
    model = SGDClassifier(loss='log', alpha=regu / len(labels),
                          fit_intercept=True, n_iter=100, shuffle=True)
    model.fit(X, labels, sample_weight=weights)
    #print accuracy(labels, model.predict(X))
    return dict(idf=idf, logreg=model, lda=None)
def stochasticGD(input_file, Output, test_size):
    lvltrace.lvltrace("LVLEntree dans stochasticGD split_test")
    ncol = tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol - 1))
    X = data[:, 1:]
    y = data[:, 0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf = SGDClassifier(loss="hinge", penalty="l2")
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print "Stochastic Gradient Descent "
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output + "Stochastic_GD_metrics_test.txt"
    file = open(results, "w")
    file.write("Stochastic Gradient Descent estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n" % metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n" % metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n" % metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n" % metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n" % (y_test[n], y_pred[n], (n + 1)))
    file.close()
    title = "Stochastic Gradient Descent %f" % test_size
    save = Output + "Stochastic_GD_confusion_matrix" + "_%s.png" % test_size
    plot_confusion_matrix(y_test, y_pred, title, save)
def predict_sgd(X_train, y_train, X_test, sample_weight):
    clf = SGDClassifier(loss='log', alpha=0.01, l1_ratio=0, n_jobs=2, n_iter=50)
    clf.fit(X_train, y_train, sample_weight=sample_weight)
    predictions = clf.predict_proba(X_test)
    return predictions
def train_and_predict_m3(train, test, labels):
    ## Apply basic concatenation + stemming
    trainData, testData = stemmer_clean(train, test, stemmerEnableM3, stemmer_type='porter')
    """
    # Beautiful soup cleanup and stemming
    stemmer = PorterStemmer()
    trainData = modified_cleanup(train, stemmer, is_train=True)
    testData = modified_cleanup(test, stemmer, is_train=False)
    """
    ## TF-IDF transform with sub-linear TF and stop-word removal
    tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode',
                          analyzer='word', token_pattern=r'\w{1,}',
                          ngram_range=(1, 6), smooth_idf=1, sublinear_tf=1,
                          stop_words=ML_STOP_WORDS)
    tfv.fit(trainData)
    X = tfv.transform(trainData)
    X_test = tfv.transform(testData)

    ## Create the classifier
    clf = SGDClassifier(random_state=randomState, n_jobs=1, penalty='l2',
                        loss='huber', n_iter=50, class_weight='auto',
                        learning_rate='optimal', epsilon=1)

    ## Create a parameter grid to search for best parameters for everything in the pipeline
    param_grid = {'n_iter': [30, 50, 80, 100, 200],
                  'loss': ['huber'],
                  'epsilon': [0.3, 1],
                  'alpha': [0.0001, 0.0003, 0.001]}

    ## Predict model with best parameters optimized for quadratic_weighted_kappa
    if gridSearch:
        model = perform_grid_search(clf, param_grid, X, labels)
        pred = model.predict(X_test)
    else:
        clf.fit(X, labels)
        pred = clf.predict(X_test)
    return pred
def train_kaggle(dataset, alg="rig", data="bow"):
    train_x, train_y, test_x = dataset
    print "shape for training data is", train_x.shape
    if alg == "svm":
        clf = SGDClassifier(verbose=1, n_jobs=2, n_iter=20)
    elif alg == "svm_sq":
        clf = SGDClassifier(verbose=1, n_jobs=2, n_iter=20, loss="squared_hinge")
    elif alg == "log":
        clf = LogisticRegression(verbose=1, n_jobs=2)
    elif alg == "per":
        clf = Perceptron(verbose=1, n_jobs=2, n_iter=25)
    elif alg == "rig":
        clf = RidgeClassifier()
    elif alg == "pa":
        clf = PassiveAggressiveClassifier(n_jobs=2, n_iter=25)
    else:
        raise NotImplementedError
    print "training with %s..." % alg
    clf.fit(train_x, train_y)
    # clf.fit(validate_x, validate_y)
    predicted = clf.predict(test_x)
    save_csv(predicted, fname=alg + "_" + data)
    if alg != "nb":
        return clf.decision_function(train_x), clf.decision_function(test_x)
    else:
        return clf.predict_proba(train_x), clf.predict_proba(test_x)
class SGD(object):
    def __init__(self):
        self.sgd = SGDClassifier(loss='modified_huber', alpha=.00001,
                                 penalty='elasticnet', shuffle=True,
                                 n_jobs=-1, random_state=2014)

    def predict(self, X):
        return self.sgd.predict_proba(X)[:, 1][:, np.newaxis]

    def fit(self, X, y):
        self.sgd.fit(X, y)
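# predict() in the wrapper above returns the positive-class probability as a
# single column rather than a label vector; a quick shape check, assuming toy
# data invented here for illustration (modified_huber is one of the SGD losses
# that supports predict_proba):
import numpy as np
from sklearn.datasets import make_classification

X_chk, y_chk = make_classification(n_samples=100, random_state=2014)
wrapper = SGD()
wrapper.fit(X_chk, y_chk)
print(wrapper.predict(X_chk).shape)  # (100, 1): one probability column for class 1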
def scikit_GDS(x, y, X_test, y_test=None, prevmodel="yes", output=False):
    from sklearn.linear_model import SGDClassifier
    from sklearn.externals import joblib
    clf = SGDClassifier(loss="hinge", penalty="l2")
    if prevmodel != "yes":
        clf.fit(x, y)
        joblib.dump(clf, 'trained_GDS_model.pkl')
    else:
        clf = joblib.load('trained_GDS_model.pkl')
    if output == False:
        predictions = clf.predict(X_test)
        correctcount = 0
        totalcount = 0
        for index, each in enumerate(predictions):
            if y_test[index] == each:
                correctcount += 1
            totalcount += 1
        print str(correctcount) + " / " + str(totalcount) + " = " + \
            str(float(correctcount) / totalcount)
    else:
        predictions = clf.predict(X_test)
        return predictions
def sgd_classifier(V_train, y_train, V_val, y_val, V_test, y_test):
    t0 = time.time()
    print 'Building SGD model'
    clf = SGDClassifier(n_iter=50)
    #clf = grid_search.GridSearchCV(svm_clf, parameters)
    clf.fit(V_train, y_train)
    #print clf.best_params_
    t1 = time.time()
    print 'Building SGD model ... Done', str(int((t1 - t0) * 100) / 100.)
    print ''
    p_val = clf.predict(V_val)
    print 'Accuracy on validation set', accuracy_score(y_val, p_val)
    p_test = clf.predict(V_test)
    print 'Accuracy on testing set'
    print classification_report(y_test, p_test)
def main(feature_pkl):
    print 'Loading data...'
    featureIndex, trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(feature_pkl)
    print 'Normalizing data...'
    trainFeatures = sklearn.preprocessing.normalize(trainFeatures.tocsc(), norm='l2', axis=0)
    testFeatures = sklearn.preprocessing.normalize(testFeatures.tocsc(), norm='l2', axis=0)
    #trainSplit, testSplit = splitTuple
    # Best estimator from grid search:
    clf = SGDClassifier(alpha=3.16227766017e-08, class_weight='auto', epsilon=0.1,
                        eta0=0.0, fit_intercept=True, l1_ratio=0.15,
                        learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
                        penalty='elasticnet', power_t=0.5, random_state=None,
                        shuffle=False, verbose=0, warm_start=False)
    print 'Fitting model...'
    clf.fit(trainFeatures, trainTargets)
    # Use probabilities or decision function to generate a ranking
    predicted_scores = clf.decision_function(testFeatures)
    with open(os.path.splitext(feature_pkl)[0] + '_testRanking.csv', 'w') as f:
        f.write('id\n')
        for pred_score, item_id in sorted(zip(predicted_scores, testItemIds), reverse=True):
            f.write('%d\n' % (item_id))
    # Turn estimator params into word clouds
    features, indices = zip(*sorted(featureIndex.iteritems(), key=operator.itemgetter(1)))
    coef_tuple = zip(clf.coef_[0], indices)
    coef_sort = sorted(coef_tuple, reverse=True)
    print 'Top 20 for illicit:'
    wordle_print(coef_sort[:20], features)
    print 'Top 20 for licit:'
    wordle_print(coef_sort[-20:], features)
def plot_sgd_classifier(num_samples, clt_std):
    # generation of data
    X, y = make_blobs(n_samples=num_samples, centers=2, cluster_std=clt_std)

    # fitting of data using logistic regression
    clf = SGDClassifier(loss='log', alpha=0.01)
    clf.fit(X, y)

    # plotting of data
    x_ = np.linspace(min(X[:, 0]), max(X[:, 0]), 10)
    y_ = np.linspace(min(X[:, 1]), max(X[:, 1]), 10)
    X_, Y_ = np.meshgrid(x_, y_)
    Z = np.empty(X_.shape)
    for (i, j), val in np.ndenumerate(X_):
        x1 = val
        x2 = Y_[i, j]
        conf_score = clf.decision_function([[x1, x2]])  # expects a 2-D array of samples
        Z[i, j] = conf_score[0]
    levels = [-1.0, 0, 1.0]
    colors = 'k'
    linestyles = ['dashed', 'solid', 'dashed']
    ax = plt.axes()
    plt.xlabel('X1')
    plt.ylabel('X2')
    ax.contour(X_, Y_, Z, colors=colors, levels=levels, linestyles=linestyles,
               labels='Boundary')
    ax.scatter(X[:, 0], X[:, 1], c=y)
with open('ghost_train.csv', 'Ur') as f:
    data = list(tuple(rec) for rec in csv.reader(f, delimiter=' '))

ghostarray = np.array(data)
ghostarray = ghostarray.astype(np.float)
#print type(ghostarray[2][2])

X = ghostarray[:, [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]
Y = np.ravel(ghostarray[:, [1]])
#print X
print Y

# fit classification model
clf = SGDClassifier(loss="hinge", penalty="l1")
clf.fit(X, Y)
from sklearn.metrics import classification_report
from sklearn import metrics
import nltk

# Train the classifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC

SGDC = SGDClassifier()
LSVC = LinearSVC()

LSVC.fit(X_train, y_train_sent)
accuracy_score_lsvc = metrics.accuracy_score(LSVC.predict(X_test), y_test_sent)
print('accuracy_score_lsvc = ' + str('{:4.2f}'.format(accuracy_score_lsvc * 100)) + '%')

SGDC.fit(X_train, y_train_sent)
accuracy_score_sgdc = metrics.accuracy_score(SGDC.predict(X_test), y_test_sent)
print('accuracy_score_sgdc = ' + str('{:4.2f}'.format(accuracy_score_sgdc * 100)) + '%')

import matplotlib.pyplot as plt
import datetime

dates = y_train['date'].unique()
converted_dates = list(map(datetime.datetime.strptime, dates,
                           len(dates) * ['%Y-%m-%d']))
#y_axis = y_train['Confirmed']
#state = 'Connecticut'
#stateX = X_train.toarray()[(y_train['location']==state)]  #& (y_train['date']< '2020-04-19')]
stateX = X_train.toarray()
#ylist = classifier_linear.predict(stateX)
class LogRegIntentClassifier(IntentClassifier):
    """Intent classifier which uses a Logistic Regression underneath"""

    config_type = LogRegIntentClassifierConfig

    def __init__(self, config=None, **shared):
        """The LogReg intent classifier can be configured by passing a
        :class:`.LogRegIntentClassifierConfig`"""
        super(LogRegIntentClassifier, self).__init__(config, **shared)
        self.classifier = None
        self.intent_list = None
        self.featurizer = None

    @property
    def fitted(self):
        """Whether or not the intent classifier has already been fitted"""
        return self.intent_list is not None

    @log_elapsed_time(logger, logging.DEBUG,
                      "LogRegIntentClassifier in {elapsed_time}")
    def fit(self, dataset):
        """Fits the intent classifier with a valid Snips dataset

        Returns:
            :class:`LogRegIntentClassifier`: The same instance, trained
        """
        logger.debug("Fitting LogRegIntentClassifier...")
        dataset = validate_and_format_dataset(dataset)
        self.load_resources_if_needed(dataset[LANGUAGE])
        self.fit_builtin_entity_parser_if_needed(dataset)
        self.fit_custom_entity_parser_if_needed(dataset)
        language = dataset[LANGUAGE]
        random_state = check_random_state(self.config.random_seed)
        data_augmentation_config = self.config.data_augmentation_config
        utterances, classes, intent_list = build_training_data(
            dataset, language, data_augmentation_config, self.resources,
            random_state)
        self.intent_list = intent_list
        if len(self.intent_list) <= 1:
            return self

        self.featurizer = Featurizer(
            config=self.config.featurizer_config,
            builtin_entity_parser=self.builtin_entity_parser,
            custom_entity_parser=self.custom_entity_parser,
            resources=self.resources)
        self.featurizer.language = language

        none_class = max(classes)
        try:
            x = self.featurizer.fit_transform(
                dataset, utterances, classes, none_class)
        except _EmptyDatasetUtterancesError:
            self.featurizer = None
            return self

        alpha = get_regularization_factor(dataset)
        self.classifier = SGDClassifier(
            random_state=random_state, alpha=alpha, **LOG_REG_ARGS)
        self.classifier.fit(x, classes)
        logger.debug("%s", DifferedLoggingMessage(self.log_best_features))
        return self

    @fitted_required
    def get_intent(self, text, intents_filter=None):
        """Performs intent classification on the provided *text*

        Args:
            text (str): Input
            intents_filter (str or list of str): When defined, it will find
                the most likely intent among the list, otherwise it will use
                the whole list of intents defined in the dataset

        Returns:
            dict or None: The most likely intent along with its probability or
            *None* if no intent was found

        Raises:
            :class:`snips_nlu.exceptions.NotTrained`: When the intent
                classifier is not fitted
        """
        return self._get_intents(text, intents_filter)[0]

    @fitted_required
    def get_intents(self, text):
        """Performs intent classification on the provided *text* and returns
        the list of intents ordered by decreasing probability

        The length of the returned list is exactly the number of intents in
        the dataset + 1 for the None intent

        Raises:
            :class:`snips_nlu.exceptions.NotTrained`: when the intent
                classifier is not fitted
        """
        return self._get_intents(text, intents_filter=None)

    def _get_intents(self, text, intents_filter):
        if isinstance(intents_filter, str):
            intents_filter = {intents_filter}
        elif isinstance(intents_filter, list):
            intents_filter = set(intents_filter)

        if not text or not self.intent_list or not self.featurizer:
            results = [intent_classification_result(None, 1.0)]
            results += [intent_classification_result(i, 0.0)
                        for i in self.intent_list if i is not None]
            return results

        if len(self.intent_list) == 1:
            return [intent_classification_result(self.intent_list[0], 1.0)]

        # pylint: disable=C0103
        X = self.featurizer.transform([text_to_utterance(text)])
        # pylint: enable=C0103
        proba_vec = self._predict_proba(X)
        logger.debug(
            "%s", DifferedLoggingMessage(self.log_activation_weights, text, X))
        results = [
            intent_classification_result(i, proba)
            for i, proba in zip(self.intent_list, proba_vec[0])
            if intents_filter is None or i is None or i in intents_filter]
        return sorted(results, key=lambda res: -res[RES_PROBA])

    def _predict_proba(self, X):  # pylint: disable=C0103
        self.classifier._check_proba()  # pylint: disable=W0212
        prob = self.classifier.decision_function(X)
        prob *= -1
        np.exp(prob, prob)
        prob += 1
        np.reciprocal(prob, prob)
        if prob.ndim == 1:
            return np.vstack([1 - prob, prob]).T
        return prob

    @check_persisted_path
    def persist(self, path):
        """Persists the object at the given path"""
        path.mkdir()
        featurizer = None
        if self.featurizer is not None:
            featurizer = "featurizer"
            featurizer_path = path / featurizer
            self.featurizer.persist(featurizer_path)
        coeffs = None
        intercept = None
        t_ = None
        if self.classifier is not None:
            coeffs = self.classifier.coef_.tolist()
            intercept = self.classifier.intercept_.tolist()
            t_ = self.classifier.t_
        self_as_dict = {
            "config": self.config.to_dict(),
            "coeffs": coeffs,
            "intercept": intercept,
            "t_": t_,
            "intent_list": self.intent_list,
            "featurizer": featurizer
        }
        classifier_json = json_string(self_as_dict)
        with (path / "intent_classifier.json").open(mode="w") as f:
            f.write(classifier_json)
        self.persist_metadata(path)

    @classmethod
    def from_path(cls, path, **shared):
        """Loads a :class:`LogRegIntentClassifier` instance from a path

        The data at the given path must have been generated using
        :func:`~LogRegIntentClassifier.persist`
        """
        path = Path(path)
        model_path = path / "intent_classifier.json"
        if not model_path.exists():
            raise LoadingError("Missing intent classifier model file: %s"
                               % model_path.name)

        with model_path.open(encoding="utf8") as f:
            model_dict = json.load(f)

        # Create the classifier
        config = LogRegIntentClassifierConfig.from_dict(model_dict["config"])
        intent_classifier = cls(config=config, **shared)
        intent_classifier.intent_list = model_dict['intent_list']

        # Create the underlying SGD classifier
        sgd_classifier = None
        coeffs = model_dict['coeffs']
        intercept = model_dict['intercept']
        t_ = model_dict["t_"]
        if coeffs is not None and intercept is not None:
            sgd_classifier = SGDClassifier(**LOG_REG_ARGS)
            sgd_classifier.coef_ = np.array(coeffs)
            sgd_classifier.intercept_ = np.array(intercept)
            sgd_classifier.t_ = t_
        intent_classifier.classifier = sgd_classifier

        # Add the featurizer
        featurizer = model_dict['featurizer']
        if featurizer is not None:
            featurizer_path = path / featurizer
            intent_classifier.featurizer = Featurizer.from_path(
                featurizer_path, **shared)
        return intent_classifier

    def log_best_features(self, top_n=50):
        if not hasattr(self.featurizer, "feature_index_to_feature_name"):
            return None

        log = "Top {} features weights by intent:".format(top_n)
        index_to_feature = self.featurizer.feature_index_to_feature_name
        for intent_ix in range(self.classifier.coef_.shape[0]):
            intent_name = self.intent_list[intent_ix]
            log += "\n\n\nFor intent {}\n".format(intent_name)
            top_features_idx = np.argsort(
                np.absolute(self.classifier.coef_[intent_ix]))[::-1][:top_n]
            for feature_ix in top_features_idx:
                feature_name = index_to_feature[feature_ix]
                feature_weight = self.classifier.coef_[intent_ix, feature_ix]
                log += "\n{} -> {}".format(feature_name, feature_weight)
        return log

    def log_activation_weights(self, text, x, top_n=50):
        if not hasattr(self.featurizer, "feature_index_to_feature_name"):
            return None

        log = "\n\nTop {} feature activations for: \"{}\":\n".format(
            top_n, text)
        activations = np.multiply(
            self.classifier.coef_, np.asarray(x.todense()))
        abs_activation = np.absolute(activations).flatten().squeeze()

        if top_n > activations.size:
            top_n = activations.size

        top_n_activations_ix = np.argpartition(
            abs_activation, -top_n, axis=None)[-top_n:]
        top_n_activations_ix = np.unravel_index(
            top_n_activations_ix, activations.shape)

        index_to_feature = self.featurizer.feature_index_to_feature_name
        features_intent_and_activation = [
            (self.intent_list[i], index_to_feature[f], activations[i, f])
            for i, f in zip(*top_n_activations_ix)]
        features_intent_and_activation = sorted(
            features_intent_and_activation, key=lambda x: abs(x[2]),
            reverse=True)

        for intent, feature, activation in features_intent_and_activation:
            log += "\n\n\"{}\" -> ({}, {:.2f})".format(
                intent, feature, float(activation))
        log += "\n\n"
        return log
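# _predict_proba above converts decision_function margins to probabilities by
# computing 1 / (1 + exp(-margin)) with in-place numpy operations; a small
# standalone check (with made-up margin values) that this matches the standard
# logistic sigmoid:
import numpy as np
from scipy.special import expit

margins = np.array([-2.0, 0.0, 3.0])  # pretend decision_function output
p = margins.copy()
p *= -1
np.exp(p, p)          # e^{-margin}, written in place
p += 1                # 1 + e^{-margin}
np.reciprocal(p, p)   # 1 / (1 + e^{-margin})
assert np.allclose(p, expit(margins))  # identical to the logistic sigmoid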
train_a = OHE.transform(train_tree_index[:, index].reshape(-1, 1))
valid_a = OHE.transform(valid_tree_index[:, index].reshape(-1, 1))
if index % 50 == 0:
    print index
    print 'train_a ----> one hot--->shape', train_a.shape
data_x_train_2 = sparse.hstack((data_x_train_2, train_a))
data_x_valid_2 = sparse.hstack((data_x_valid_2, valid_a))

df_feature_map.to_csv(save_path + "feature_important_mapping_cut_corr.csv")
sparse.save_npz(save_path + "data_x_train_cut_corr.npz", data_x_train)
sparse.save_npz(save_path + "data_x_valid_cut_corr.npz", data_x_valid)

eval_list = [(data_x_train, y_train), (data_x_valid, y_valid)]
gbm_clf = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, reg_alpha=0.0,
                             reg_lambda=1, max_depth=-1, n_estimators=1000,
                             objective='binary', subsample=0.7,
                             colsample_bytree=0.7, subsample_freq=1,
                             learning_rate=0.02, min_child_weight=50,
                             random_state=20180506, n_jobs=7)
gbm_clf.fit(data_x_train, y_train, eval_set=eval_list, eval_metric='auc',
            early_stopping_rounds=100)
joblib.dump(gbm_clf, '/home/heqt/jupyter_project/model/gbm_clf_cnt_80W_corr.pkl')

SGDLR_clf = SGDClassifier(loss='log', penalty='l1', alpha=1.0, l1_ratio=0.15,
                          random_state=20150511, learning_rate='optimal',
                          n_jobs=15)
SGDLR_clf.fit(data_x_train_LR, y_train)
joblib.dump(SGDLR_clf, '/home/heqt/jupyter_project/model/feature_SGDLR_clf.pkl')
from fully_connected.utils import treshhold_labels, normalize_data, load_monolithic

if __name__ == '__main__':
    train, dev, test = load_monolithic('data_monolithic_mfcc.pkl')
    X_train, S_train, Y_train = train
    X_dev, S_dev, Y_dev = dev
    X_test, S_test, Y_test = test
    X_train, X_dev, X_test = normalize_data(X_train, X_dev, X_test)
    Y_train, Y_dev, Y_test = treshhold_labels(Y_train, Y_dev, Y_test, .25)

    # rbf_feature = RBFSampler(gamma=1, n_components=800, random_state=1)
    rbf_feature = Nystroem(gamma=1, n_components=200, random_state=1)
    print('transform features')
    X_train_features = rbf_feature.fit_transform(X_train)
    X_dev_features = rbf_feature.transform(X_dev)
    print('finish')

    clf = SGDClassifier(max_iter=400, loss='log', n_jobs=-1, random_state=1,
                        alpha=0.00000001, tol=1e-9, early_stopping=False,
                        verbose=1, n_iter_no_change=40)
    clf.fit(X_train_features, Y_train)

    print('=== Training Set Performance ===')
    print(clf.score(X_train_features, Y_train))
    print(confusion_matrix(Y_train, clf.predict(X_train_features)))
    print(roc_auc_score(Y_train, clf.predict_proba(X_train_features)[:, 1]))
    print('=== Dev Set Performance ===')
    print(clf.score(X_dev_features, Y_dev))
    print(confusion_matrix(Y_dev, clf.predict(X_dev_features)))
    print(roc_auc_score(Y_dev, clf.predict_proba(X_dev_features)[:, 1]))
# In[ ]:

linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_linear_svc

# In[ ]:

sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd

# In[ ]:

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree
print("Accuracy Score: ", '{:,.3f}'.format(float(accuracy_score(test, pred)) * 100), "%") print(" Precision: ", '{:,.3f}'.format(float(precision_score(test, pred, average='macro')) * 100), "%") print(" Recall: ", '{:,.3f}'.format(float(recall_score(test, pred, average='macro')) * 100), "%") print(" F1 score: ", '{:,.3f}'.format(float(f1_score(test, pred, average='macro')) * 100), "%") #Let's split the data into train/test sets from sklearn.model_selection import train_test_split # test set size of 20% of the data and the random seed 42 <3 X_train, X_test, y_train, y_test = train_test_split(X_beng.toarray(),y_pred, test_size=0.2, random_state=42) print("X_train size:", len(X_train)) print("X_test size:", len(X_test), "\n") from sklearn.model_selection import cross_val_score from sklearn.model_selection import cross_val_predict from sklearn.linear_model import SGDClassifier # SGD instance sgd_clf = SGDClassifier(max_iter=10000, tol=1e-3, random_state=42, n_jobs=4) # train SGD sgd_clf.fit(X_train, y_train) # cross validation predictions sgd_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3, n_jobs=4) # print out the classification report classification_report("Stochastic Gradient Descent Report (Training Set)", y_train, sgd_pred)
print y_train.value_counts(), '\n', y_test.value_counts(),

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

lr = LogisticRegression()
sgdc = SGDClassifier()

lr.fit(X_train, y_train)
lr_y_predict = lr.predict(X_test)
sgdc.fit(X_train, y_train)
sgdc_y_predict = sgdc.predict(X_test)
# print lr_y_predict,'\n',sgdc_y_predict

from sklearn.metrics import classification_report

print('Accuracy of LR Classifier:', lr.score(X_test, y_test))
print(classification_report(y_test, lr_y_predict,
                            target_names=['Benign', 'Malignant']))
print('Accuracy of SGD Classifier:', sgdc.score(X_test, y_test))
print(classification_report(y_test, sgdc_y_predict,
                            target_names=['Benign', 'Malignant']))
for i in range(iterations):
    X, y = shuffle(data, target)
    grd_lm = LogisticRegression()
    sgd = SGDClassifier(alpha=0.01, n_iter=100)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
    # It is important to train the ensemble of trees on a different subset
    # of the training data than the linear regression model to avoid
    # overfitting, in particular if the total number of leaves is
    # similar to the number of training samples
    X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train, y_train,
                                                                test_size=0.5)
    mod_sgd = sgd.fit(X=X_train, y=y_train)

    # logistic regression
    mod_lr = grd_lm.fit(X=X_train, y=y_train)

    # embed features in space with gbt
    grd = GradientBoostingClassifier(n_estimators=10)
    grd.fit(X_train, y_train)

    # Hot encoding of the resulting leaves for each sample
    grd_enc = OneHotEncoder()
    grd_enc.fit(grd.apply(X_train)[:, :, 0])

    # create new dataset consisting of old features and the hot encoded gbt result
    X_train_enr_lr = np.hstack(
        (X_train_lr, grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]).toarray()))
test_tags = y[cutoff:]
print "Training set size : " + str(training_sentences.shape[0])
print "Testing set size : " + str(test_sentences.shape[0])

# default SGD
clf = SGDClassifier(loss='log')
print "Algorithm", configuration["algorithm"]
if configuration["algorithm"] == "SGD":
    clf = SGDClassifier(loss='log')
elif configuration["algorithm"] == "SVM":
    clf = svm.SVC(decision_function_shape='ovo')
elif configuration["algorithm"] == "MLP":
    epoch = 200
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                        hidden_layer_sizes=(17, 17, 17, 17), random_state=1,
                        max_iter=epoch)

print 'Training started'
clf.fit(training_sentences, training_tags)
print 'Training completed'

print "Testing started"
score = clf.score(test_sentences, test_tags)
print "F1 Score"
print f1_score(test_tags, clf.predict(test_sentences), average='weighted')
print "Accuracy:", score
sc = StandardScaler()
# Scaling the training set
X_train = sc.fit_transform(X_train)
# Scaling the test set
X_test = sc.transform(X_test)

sgd_lr = SGDClassifier(loss="log", penalty='l1', alpha=0.0001,
                       fit_intercept=True, max_iter=100,
                       learning_rate='constant', eta0=0.01)
sgd_lr.fit(X=X_train, y=y_train)

# The purpose of the logistic regression model is to find the probability of
# a testing example belonging to a certain class label.
probability = sgd_lr.predict_proba(X=X_test)
print(probability)

# The probabilities of each test example belonging to class1, class2, and
# class3 sum to 1.
sum_up = sgd_lr.predict_proba(X=X_test).sum(axis=1)
print(sum_up)

# Identifying labels predicted by the model.
class_predicted = sgd_lr.predict(X=X_test)
print(class_predicted)
vectorizer = CountVectorizer(stop_words='english')  # setting stop-words, so words like "the" and "it" are ignored
X = vectorizer.fit_transform(news['TITLE'])  # convert TITLE samples to vectors
y = news['CATEGORY']  # label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)  # 30% split

# model (best params established through gridsearch in notebook)
sgd = SGDClassifier(n_jobs=-1, n_iter=10, alpha=1e-05, loss='hinge',
                    random_state=1234)
sgd.fit(X_train, y_train)


# custom function that inputs a news title, and outputs one of 4 specified categories
def title_to_category(title):
    categories = {
        'b': 'business',
        't': 'science and technology',
        'e': 'entertainment',
        'm': 'health'
    }
    predicted = sgd.predict(vectorizer.transform([title]))
    return categories[predicted[0]]


# testing a headline from The Onion
def SGDClasify():
    trainPath = sys.argv[1]
    testPath = sys.argv[2]
    files_list_train = pm.files(trainPath)
    files_list_test = pm.files(testPath)
    list_dict_train, words_list_train = pm.text_processing(files_list_train)
    bagOfWords_train, bernoulli_train = pm.generate_models(list_dict_train, words_list_train)
    list_dict_test, words_list_test = pm.text_processing(files_list_test)
    bagOfWords_test, bernoulli_test = pm.generate_models(list_dict_test, words_list_train)

    frame_train_BOW = pd.DataFrame(bagOfWords_train[1:])
    frame_test_BOW = pd.DataFrame(bagOfWords_test[1:])
    X_train_BOW = frame_train_BOW.iloc[:, :-1]
    y_train_BOW = frame_train_BOW.iloc[:, -1]
    X_test_BOW = frame_test_BOW.iloc[:, :-1]
    y_test_BOW = frame_test_BOW.iloc[:, -1]

    frame_train_Ber = pd.DataFrame(bernoulli_train[1:])
    frame_test_Ber = pd.DataFrame(bernoulli_test[1:])
    X_train_Ber = frame_train_Ber.iloc[:, :-1]
    y_train_Ber = frame_train_Ber.iloc[:, -1]
    X_test_Ber = frame_test_Ber.iloc[:, :-1]
    y_test_Ber = frame_test_Ber.iloc[:, -1]

    # sklearn.linear_model.SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001,
    #     l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None, shuffle=True,
    #     verbose=0, epsilon=0.1, n_jobs=None, random_state=None,
    #     learning_rate='optimal', eta0=0.0, power_t=0.5, early_stopping=False,
    #     validation_fraction=0.1, n_iter_no_change=5, class_weight=None,
    #     warm_start=False, average=False, n_iter=None)
    SGDClassifierModelBOW = SGDClassifier(max_iter=1500)
    print(SGDClassifierModelBOW)
    SGDClassifierModelBOW.fit(X_train_BOW, y_train_BOW)
    # Calculating Prediction
    y_pred_BOW = SGDClassifierModelBOW.predict(X_test_BOW)
    accuracyBOW = accuracy_score(y_test_BOW, y_pred_BOW)
    print('The Accuracy with BOW :', accuracyBOW * 100)

    SGDClassifierModelBer = SGDClassifier(max_iter=1500)
    SGDClassifierModelBer.fit(X_train_Ber, y_train_Ber)
    # Calculating Prediction
    y_pred_Ber = SGDClassifierModelBer.predict(X_test_Ber)
    accuracyBer = accuracy_score(y_test_Ber, y_pred_Ber)
    print('The Accuracy with Bernoulli Model :', accuracyBer * 100)

    # Applying Grid Search to find the best model and the best parameters
    from sklearn.model_selection import GridSearchCV
    parameters = [
        {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]},
    ]
    grid_search_BOW = GridSearchCV(estimator=SGDClassifierModelBOW,
                                   param_grid=parameters, scoring='accuracy',
                                   cv=10, n_jobs=-1)
    grid_search_BOW = grid_search_BOW.fit(X_train_BOW, y_train_BOW)
    accuracyBOW = grid_search_BOW.best_score_
    print('The Accuracy with BOW with grid search :', accuracyBOW * 100)

    grid_search_Ber = GridSearchCV(estimator=SGDClassifierModelBer,
                                   param_grid=parameters, scoring='accuracy',
                                   cv=10, n_jobs=-1)
    grid_search_Ber = grid_search_Ber.fit(X_train_Ber, y_train_Ber)
    accuracyBer = grid_search_Ber.best_score_
    print('The Accuracy with Bernoulli with grid search :', accuracyBer * 100)

    from sklearn.metrics import classification_report
    print('Classification Report for BOW')
    print(classification_report(y_test_BOW, y_pred_BOW.round()))
    print('Classification Report for Bernoulli')
    print(classification_report(y_test_Ber, y_pred_Ber.round()))
train_x_normalize, test_x_normalize, train_y, test_y = train_test_split(
    X_kbest_features, y, train_size=0.8, test_size=0.2)

# run LSA
'''lsa = helper.run_lsa(train_x_normalize)
train_x_normalize = lsa.transform(train_x_normalize)
test_x_normalize = lsa.transform(test_x_normalize)
val_kbest_features = lsa.transform(val_kbest_features)'''

clf = SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-3)
acc = model_selection.cross_val_score(clf, X_kbest_features, y, cv=5,
                                      scoring='accuracy')
print("accuracy", acc.mean())

start_time = time.time()
clf.fit(train_x_normalize, train_y)
print("--- %s runtime in seconds ---" % (time.time() - start_time))

# predict
clf_pred = clf.predict(test_x_normalize)
val_pred = clf.predict(val_kbest_features)

# write to validation file
helper.generate_prediction_csv(val_pred)

# evaluation on test set
print(metrics.classification_report(test_y, clf_pred))
print(metrics.accuracy_score(test_y, clf_pred))
# Split dataset in testing and training datasets. MNIST already comes separated
# into testing and training datasets: the first 60k images are training, and
# the last 10k are testing.
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

# Shuffle training sets. Good idea since some learning algorithms are sensitive
# to the order of training sets. Not always a good idea, for example, on time
# series data.
import numpy as np

shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

# Creating a "5-detector". Create the target vectors for the classification
# task:
y_train_5 = (y_train == 5)  # True for all 5s, False for all other digits.
y_test_5 = (y_test == 5)

# Pick a classifier to train. Will use Stochastic Gradient Descent.
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)

# Use model to detect images of 5s:
sgd_clf.predict([some_digit])  # output is True: the classifier properly
                               # predicts that the image is a 5.
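# A quick, hedged follow-up sketch: estimating the 5-detector's accuracy with
# k-fold cross-validation on the training arrays defined above (the cv and
# scoring choices here are mine, not fixed by the snippet):
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")
print(cv_scores)  # caution: accuracy looks high because ~90% of labels are False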
# Load the dataset
for line in HouseFile:
    matchObj = re.match(
        r'\d+ \w+ \w+,\w+,(\w+),\w+,(\d+),(\d+),(\d+),\w+,\w+ \w+ \d+ \d+:\d+:\d+ \w+ \d+,(\d+)',
        line, re.M | re.I)
    if matchObj:
        if not(matchObj.group(1) == 0) or (matchObj.group(1) == 0) or \
                (matchObj.group(3) == 0) or (matchObj.group(4) == 0) or \
                (matchObj.group(5) == 0):
            new_value = np.matrix([float(matchObj.group(2)),
                                   float(matchObj.group(3)),
                                   float(matchObj.group(4))])
            info_list = np.concatenate((info_list, new_value), axis=0)
            new_price = np.matrix([float(matchObj.group(5))])
            price_matrix = np.concatenate((price_matrix, new_price), axis=0)

X = info_list
y = price_matrix.transpose()
list_y = np.array(y)[0].tolist()

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X)  # Don't cheat - fit only on training data

clf = SGDClassifier(loss="hinge", penalty="l2")
clf.fit(X, list_y)

# Create your views here.
def fit(self, X, y, *args, **kw):
    # coerce the input to a CSR sparse matrix before delegating to SGDClassifier
    X = sp.csr_matrix(X)
    return SGDClassifier.fit(self, X, y, *args, **kw)
# partition the data into training and testing splits, using 75%
# of the data for training and the remaining 25% for testing
print("[INFO] constructing training/testing split...")
(trainData, testData, trainLabels, testLabels) = train_test_split(
    np.array(data), labels, test_size=0.25, random_state=42)

# train a Stochastic Gradient Descent classifier using a softmax
# loss function and 10 epochs
model = SGDClassifier(loss="log", random_state=42, n_iter=10)
print("model", model, "trainData", trainData, "testData", testData,
      "trainLabels", trainLabels, "testLabels", testLabels)
model.fit(trainData, trainLabels)

# evaluate the classifier
print("[INFO] evaluating classifier...")
predictions = model.predict(testData)
print("88", predictions)  ##edited
print(classification_report(testLabels, predictions, target_names=le.classes_))

# to demonstrate that our classifier actually "learned" from
# our training data, randomly sample a few training images
idxs = np.random.choice(np.arange(0, len(trainData)), size=(5,))

# loop over the training indexes
for i in idxs:
    # predict class probabilities based on the extracted color
    # histogram
content = regex1.sub(' ', content)
content = regex2.sub(' ', content)
content = regex3.sub(' ', content)
X_test.append(content)

outF1 = open("unigram.output.txt", "w")
outF2 = open("unigramtfidf.output.txt", "w")
outF3 = open("bigram.output.txt", "w")
outF4 = open("bigramtfidf.output.txt", "w")

""" unigram """
unigram = CountVectorizer(stop_words=stopwords)
X_train_unigram = unigram.fit_transform(X)
X_test_unigram = unigram.transform(X_test)
sgd1 = SGDClassifier(penalty='l1')
sgd1.fit(X_train_unigram, Y_train)
Y_test = sgd1.predict(X_test_unigram)
for result in Y_test:
    outF1.write(str(result))
    outF1.write("\n")
outF1.close()

""" unigram with tfidf """
tfidf_unigram = TfidfVectorizer(stop_words=stopwords)
X_train_tf_unigram = tfidf_unigram.fit_transform(X)
X_test_tf_unigram = tfidf_unigram.transform(X_test)
sgd2 = SGDClassifier(penalty='l1')
sgd2.fit(X_train_tf_unigram, Y_train)
Y_test = sgd2.predict(X_test_tf_unigram)
for result in Y_test:
    outF2.write(str(result))
    outF2.write("\n")
outF2.close()
# Train word2vec on test tweets
# imdb_w2v.train(x_test)

# Build test tweet vectors then scale
test_vecs = np.concatenate([buildWordVector(z, n_dim) for z in x_test])
test_vecs = scale(test_vecs)

# Use classification algorithm (i.e. Stochastic Logistic Regression) on
# training set, then assess model performance on test set
from sklearn.linear_model import SGDClassifier

scores = []
precision_scores = []
recall_scores = []
for x in range(10):
    print x
    lr = SGDClassifier(loss='log', penalty='l1')
    lr.fit(train_vecs, y_train)
    scores.append(lr.score(test_vecs, y_test))
    precision_scores.append(precision_score(y_test, lr.predict(test_vecs)))
    recall_scores.append(recall_score(y_test, lr.predict(test_vecs)))
print "accuracy: %f" % np.mean(scores)
print "precision: %f" % np.mean(precision_scores)
print "recall_scores: %f" % np.mean(recall_scores)

# Try a naivebayes classifier
from sklearn.naive_bayes import GaussianNB

scores = []
precision_scores = []
recall_scores = []
for x in range(10):
from sklearn.linear_model import SGDClassifier

X = [[0., 0.], [1., 1.]]
y = [0, 1]
clf = SGDClassifier(loss="hinge", penalty="l2")
clf.fit(X, y)
print(clf.predict([[2., 2.]]))

# To get the signed distance to the hyperplane use SGDClassifier.decision_function
print(clf.decision_function([[2., 2.]]))

clf1 = SGDClassifier(loss="log").fit(X, y)
clf1.predict_proba([[1., 1.]])
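# Note: predict_proba is only defined for probabilistic losses such as
# loss="log" or "modified_huber"; with loss="hinge" it raises an error. A
# hedged sketch of getting probabilities from a hinge-loss model via
# calibration (needs more samples per class than the two-sample toy X above):
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import SGDClassifier

base = SGDClassifier(loss="hinge", penalty="l2")
calibrated = CalibratedClassifierCV(base, cv=3)  # fits base + a calibrator per fold
# calibrated.fit(X_big, y_big); calibrated.predict_proba([[2., 2.]])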
#     learning_rate_init=0.1, shuffle=True)
# classifier = MLPClassifier(solver='lbfgs', alpha=1e-5,
#                            hidden_layer_sizes=(30, 30), random_state=1, verbose=True)
# mlp.fit(train_x, train_y)
# score = mlp.score(vali_x, vali_y)
# print(score)

clf = SGDClassifier(loss="log", penalty="l2", max_iter=150, tol=1e-4,
                    shuffle=True, verbose=1)
clf.fit(train_x, train_y)
score = clf.score(vali_x, vali_y)
print(score)

# with open('data_B.csv', 'r') as f:
#     data = f.readlines()
#     f.close()
# raw_data = []
# for i in data:
#     i = i.strip('\n')
#     date, daylength, time, pid, label = i.split(',')
#     temp_x = []
#     temp_y = []
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in 0.20
from sklearn import preprocessing

iris = load_iris()
X_iris, y_iris = iris.data, iris.target
X, y = X_iris[:, :2], y_iris
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

from sklearn.linear_model import SGDClassifier
clf = SGDClassifier()
clf.fit(X_train, y_train)

from sklearn import metrics
y_train_predict = clf.predict(X_train)
print(metrics.accuracy_score(y_train, y_train_predict))
y_predict = clf.predict(X_test)
print(metrics.accuracy_score(y_test, y_predict))
print(metrics.classification_report(y_test, y_predict, target_names=iris.target_names))
X.pop('Parch')
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, random_state=0)

# my_model_reg = DecisionTreeRegressor()
# my_model_reg.fit(X_train, Y_train)
# pred = my_model_reg.predict(X_valid)
# print('MAE DTR: ', mean_absolute_error(pred, Y_valid))

# Note: for binary 0/1 labels, mean_absolute_error on hard predictions is just
# the misclassification rate (1 - accuracy).
class_model = DecisionTreeClassifier(max_leaf_nodes=54)
class_model.fit(X_train, Y_train)
pred_class = class_model.predict(X_valid)
print('MAE DTC: ', mean_absolute_error(pred_class, Y_valid))

SGD_model = SGDClassifier()
SGD_model.fit(X_train, Y_train)
pred_SGD = SGD_model.predict(X_valid)
print('MAE SGD: ', mean_absolute_error(pred_SGD, Y_valid))

RF_model = RandomForestClassifier(max_depth=17, random_state=0)
RF_model.fit(X_train, Y_train)
pred_RF = RF_model.predict(X_valid)
print('MAE RF: ', mean_absolute_error(pred_RF, Y_valid))

KN_model = KNeighborsClassifier(n_neighbors=55, weights='distance', algorithm='auto')
KN_model.fit(X_train, Y_train)
pred_KN = KN_model.predict(X_valid)
print('MAE KN: ', mean_absolute_error(pred_KN, Y_valid))
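# A hedged alternative to the MAE printouts above: report accuracy directly,
# which is the conventional metric for these classifiers (same variables assumed).
from sklearn.metrics import accuracy_score

for name, pred in [('DTC', pred_class), ('SGD', pred_SGD), ('RF', pred_RF), ('KN', pred_KN)]:
    print('Accuracy %s: %.3f' % (name, accuracy_score(Y_valid, pred)))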
def main():
    #TRAIN
    # with codecs.open("train.csv", 'r', encoding="utf-8-sig") as file:
    #     lines = [x.strip() for x in file.readlines()]
    # train = map(lambda s: s.split(","), lines)
    # Y, X = zip(*train)
    # model = ResearchModel()
    # model.train(X, Y, n_epochs=2000)
    # model.save("weightings.pickle")

    #TEST
    # with codecs.open("validate.csv", 'r', encoding="utf-8-sig") as file:
    #     lines = [x.strip() for x in file.readlines()]
    # test = map(lambda s: s.split(","), lines)
    # Y, X = zip(*test)
    # # This is messy but whatevs
    # model = ResearchModel()
    # model.train(X, Y, n_epochs=0)  # This isn't actually training, only initializing values
    # # Set the model's WEIGHTINGS
    # model.weightings = WEIGHTINGS
    # accuracy = model.test(X, Y)
    # print("ACCURACY = %f" % accuracy)
    # print(model.weightings)

    #SLACK
    with codecs.open("all.csv", 'r', encoding="utf-8-sig") as file:
        lines = [x.strip() for x in file.readlines()]
    data = [s.split(",") for s in lines]
    Y, X = zip(*data)

    # Load weightings
    with open("weightings.pickle", "rb") as file:
        WEIGHTINGS = pickle.load(file)
    print(WEIGHTINGS)

    # Make Ymap (label -> integer index)
    Ymap = {}
    i = 0
    for y in Y:
        if y not in Ymap:
            Ymap[y] = i
            i += 1

    # Convert; materialize as lists so they can be reused (map() is lazy in Python 3)
    X = [get_sentence_vector(x) for x in X]
    Y = [Ymap[y] for y in Y]
    model = SGDClassifier()
    model.fit(X, Y)
    print("Model ready")

    while True:
        events = slack_client.rtm_read()
        for event in events:
            if ('channel' in event and 'text' in event
                    and event.get('user') == 'UCRPZ9R4K'
                    and event.get('type') == 'message'):
                channel = event['channel']
                text = event['text'].replace("\n", "").replace("?", "")
                input_vector = get_sentence_vector(text.lower())
                prediction = model.predict([input_vector])
                label = None
                for k in Ymap:
                    if Ymap[k] == prediction[0]:
                        label = k
                tokens = pos_tag(word_tokenize(text))
                chunks = cp.parse(tokens)
                # Extract entities
                meta = [" ".join([token[0] for token in ch.leaves()])
                        for ch in filter(lambda x: x.label() == "ENTITY",
                                         chunks.subtrees())]
                # Remove empty meta
                meta = [m for m in meta if len(m) > 0]
                print("Predicted intent: %s, entities: %s" % (label, meta))
                slack_client.api_call(
                    'chat.postMessage',
                    channel=channel,
                    text="I predicted the intent to be: %s with the entities: %s\n"
                         % (label, ", ".join(meta)),
                    as_user='******')
def fitness2(self, mode, solution, data, dummiesList, createDummies, normalize):
    matrix_length = len(np.unique(self.data[self.target]))
    if mode == 'sgd':
        model = SGDClassifier(class_weight='balanced', loss='modified_huber', random_state=1)
    elif mode == 'svr':
        model = SVC(kernel='linear', class_weight='balanced', probability=True)
    elif mode == 'rdf':
        model = SVC(kernel='rbf', class_weight='balanced', probability=True)
    elif mode == 'pol':
        model = SVC(kernel='poly', class_weight='balanced', probability=True)
    elif mode == 'rdc':
        # model = RandomForestClassifier(n_estimators=16, bootstrap=False, class_weight='balanced', random_state=1)
        # model = RandomForestClassifier(bootstrap=False, class_weight=None, criterion='entropy',
        #                                max_depth=12, max_features=0.2, max_leaf_nodes=None,
        #                                min_impurity_split=1e-07, min_samples_leaf=1, min_samples_split=20,
        #                                min_weight_fraction_leaf=0.0, n_estimators=512, n_jobs=-1,
        #                                oob_score=False, random_state=3, verbose=False, warm_start=False)
        simpleimputer = SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                                      missing_values=np.nan, strategy="median", verbose=0)
        standardscaler = StandardScaler(copy=True, with_mean=True, with_std=True)
        randomforestclassifier = RandomForestClassifier(
            bootstrap=False, class_weight=None, criterion="gini", max_depth=None,
            max_features=0.21975649694764154, max_leaf_nodes=None,
            min_impurity_decrease=0, min_impurity_split=None, min_samples_leaf=2,
            min_samples_split=4, min_weight_fraction_leaf=0.0, n_estimators=300,
            n_jobs=1, oob_score=False, random_state=1, verbose=0, warm_start=False)
        model = Pipeline(memory=None,
                         steps=[('simpleimputer', simpleimputer),
                                ('standardscaler', standardscaler),
                                ('randomforestclassifier', randomforestclassifier)])
    elif mode == 'dtc':
        model = DecisionTreeClassifier(class_weight='balanced', random_state=1)
    elif mode == 'gdc':
        model = GradientBoostingClassifier(random_state=1)
    elif mode == 'etc':
        model = ExtraTreesClassifier(class_weight='balanced', random_state=1)
    elif mode == 'adc':
        model = AdaBoostClassifier(random_state=1)
    elif mode == 'bac':
        model = BaggingClassifier(random_state=1)
    elif mode == 'lda':
        model = LinearDiscriminantAnalysis()
    elif mode == 'qda':
        model = QuadraticDiscriminantAnalysis()
    elif mode == 'gnb':
        model = GaussianNB()
    elif mode == 'rrc':
        model = RidgeClassifier(class_weight='balanced')
    else:
        model = LogisticRegression(solver='liblinear', C=10.0, class_weight='balanced')

    k = model_selection.StratifiedKFold(5)

    if not any(solution):
        solution[random.randint(0, len(solution) - 1)] = True

    try:
        tab_data, tab_val = tab.get([int(x) for x in solution], self.tab_data, self.tab_vals)
        tab_val = np.array(tab_val)
        accuracy = (getTotalTruePositive(tab_val) + getTotalTrueNegative(tab_val)) / \
                   (getTotalTruePositive(tab_val) + getTotalTrueNegative(tab_val) +
                    getTotalFalsePositive(tab_val) + getTotalFalseNegative(tab_val))
        precision_tab = []
        recall_tab = []
        for i in range(len(tab_val)):
            a = getTruePositive(tab_val, i) / (getFalsePositive(tab_val, i) + getTruePositive(tab_val, i))
            b = getTruePositive(tab_val, i) / (getFalseNegative(tab_val, i) + getTruePositive(tab_val, i))
            precision_tab.append(a)
            recall_tab.append(b)
        precision = sum(precision_tab) / len(precision_tab)
        recall = sum(recall_tab) / len(recall_tab)
        fscore = 2 * (1 / ((1 / precision) + (1 / recall)))
        matrix = tab_val
        tmp = self.data.drop([self.target], axis=1)
        tmp = tmp.iloc[:, solution]
        cols = tmp.columns
        self.tab_find = self.tab_find + 1
    except Exception:  # narrowed from a bare except
        matrix = np.zeros((matrix_length, matrix_length), dtype=int)
        X, y, cols = ready(self, solution, data, dummiesList, createDummies, normalize)
        originalclass = []
        predictedclass = []
        for train_index, test_index in k.split(X, y):
            # Split in X
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            # the original `mode == ('knn' or ...)` only ever compared against
            # 'knn'; `in` is the intended membership test
            if mode in ('knn', 'dct', 'gbc', 'lda', 'qda', 'adc', 'bac'):
                if mode == 'knn':
                    model = KNeighborsClassifier(n_neighbors=int(len(X_train) ** (1 / 2)))
            # oversample the training fold only (indentation reconstructed)
            sm = SMOTE(sampling_strategy='auto')
            X_train, y_train = sm.fit_resample(X_train, y_train)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            originalclass.extend(y_test)
            predictedclass.extend(y_pred)
            matrix += confusion_matrix(y_test, y_pred)
        accuracy = (getTotalTruePositive(matrix) + getTotalTrueNegative(matrix)) / \
                   (getTotalTruePositive(matrix) + getTotalTrueNegative(matrix) +
                    getTotalFalsePositive(matrix) + getTotalFalseNegative(matrix))
        precision, recall, fscore, support = s(originalclass, predictedclass, average='macro')
        self.tab_data, self.tab_vals = tab.add([int(x) for x in solution], matrix.tolist(),
                                               self.tab_data, self.tab_vals)
        self.tab_insert = self.tab_insert + 1
    return accuracy, recall, precision, fscore, cols, matrix, self
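# The fold loop above resamples only the training split, which is the correct
# pattern; a hedged, more compact equivalent using an imblearn Pipeline so the
# resampling runs inside cross-validation automatically (imblearn assumed
# available; same X/y as in the function):
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn import model_selection
from sklearn.linear_model import SGDClassifier

pipe = ImbPipeline([('smote', SMOTE(sampling_strategy='auto')),
                    ('clf', SGDClassifier(class_weight='balanced',
                                          loss='modified_huber', random_state=1))])
# scores = model_selection.cross_val_score(pipe, X, y, cv=model_selection.StratifiedKFold(5))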
X, y = fiasco.createXY(inputFile)
X_train, X_test, y_train, y_test = fiasco.createTrainTestSplit()

# Defining classification methods
clf1 = NearestCentroid(metric='manhattan', shrink_threshold=500)
# Performs better with the manhattan metric and a high shrink threshold.
# Remove the params to see the result with the defaults.
clf1.fit(X_train, y_train)

clf2 = SGDClassifier(loss="modified_huber", penalty="l1", max_iter=10000)
# Modified huber has a higher tolerance to outliers than the default hinge loss;
# the l1 penalty adds a penalty equal to the absolute value of the coefficients.
clf2.fit(X_train, y_train)

clf3 = svm.SVC()
clf3.fit(X_train, y_train)

clf4 = svm.LinearSVC()
clf4.fit(X_train, y_train)

clf5 = KNeighborsClassifier()
clf5.fit(X_train, y_train)

# Plotting results
plt = fiasco.plot_learning_curve(
    clf1, "Nearest Centroid, metric = manhattan, shrink threshold = 500", X, y)
plt = fiasco.plot_learning_curve(clf2, "SGD Classifier", X, y)
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['pack_tfidf_vec_lr_classfiy_{}'.format(label)] = stack[:, 0]

########################### SGD (stochastic gradient descent) ################################
print('sgd stacking')
stack_train = np.zeros((len(train_uid), 1))
stack_test = np.zeros((len(test_uid), 1))
score_va = 0
for i, (tr, va) in enumerate(folds.split(train_feature, score)):
    print('stack:%d/%d' % ((i + 1), n_folds))
    sgd = SGDClassifier(random_state=1017, loss='log')
    sgd.fit(train_feature[tr], score[tr])
    score_va = sgd.predict_proba(train_feature[va])[:, 1]
    score_te = sgd.predict_proba(test_feature)[:, 1]
    print('score: ' + str(mean_squared_error(score[va], sgd.predict(train_feature[va]))))
    stack_train[va, 0] = score_va
    stack_test[:, 0] += score_te
stack_test /= n_folds
stack = np.vstack([stack_train, stack_test])
df_stack['pack_tfidf_vec_sgd_classfiy_{}'.format(label)] = stack[:, 0]

########################### PAC (PassiveAggressiveClassifier) ################################
print('pac stacking')
stack_train = np.zeros((len(train_uid), 1))
stack_test = np.zeros((len(test_uid), 1))
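# A hedged, more compact route to the same out-of-fold train column, assuming
# the same train_feature/score/folds objects; cross_val_predict returns each
# row's prediction from the fold in which it was held out (the averaged test
# predictions still need the explicit loop above):
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(random_state=1017, loss='log')
oof = cross_val_predict(sgd, train_feature, score, cv=folds, method='predict_proba')[:, 1]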
class PredictiveMarketVariables(MarketVariables):
    def __init__(self, shape, options):
        super(PredictiveMarketVariables, self).__init__(shape, options)
        self.shape = (self.shape[0] + 3, )
        self.observations = deque(maxlen=15000)
        self.mids = deque(maxlen=15000)
        self.regressor = ElasticNet(warm_start=True, alpha=20, l1_ratio=0)
        self.classifier = SGDClassifier(warm_start=True, max_iter=100, alpha=0.001,
                                        l1_ratio=0.5, penalty='elasticnet')
        self.is_fitted = False
        self.train_k = 1

    def transform(self, observation):
        if self.train_k == 1:
            prev_mid = (observation[0][Q_BID] + observation[0][Q_ASK]) / 2
        else:
            # prev_ask/prev_bid are assumed to be maintained elsewhere in the class
            prev_mid = (self.prev_ask + self.prev_bid) / 2
        obs = MarketVariables.transform(self, observation)
        mid = (observation[0][Q_BID] + observation[0][Q_ASK]) / 2
        self.mids.append(mid)
        self.observations.append((obs[0:4]))
        self.train_k += 1
        # fit once after a warm-up of 1000 steps, then refit every 10000 steps
        # (nesting reconstructed from the flattened source)
        if not self.is_fitted:
            if self.train_k == 1000:
                self.train()
        elif self.train_k % 10000 == 0:
            self.train()
        if self.is_fitted:
            mid_diff = (mid - prev_mid) / mid
            return obs + self.predict(obs[0:4] + (mid_diff, )) + (mid_diff, )
        else:
            return obs + (0, 0, 0)  # zero percentage change and zero class (no change)

    def train(self):
        y = pd.Series(list(self.mids))
        y_pct = y.pct_change().fillna(0)
        y_sign = np.sign(y_pct)
        X = np.array(self.observations)
        X = np.concatenate([X, y_pct.shift(-1).fillna(0).values.reshape(-1, 1)], axis=1)
        X_scaled = MinMaxScaler(feature_range=(-1, 1)).fit_transform(X)
        #t = time.time()
        self.regressor.fit(X_scaled, y_pct)
        #print('Reg fit time:{:.2f}'.format(time.time()-t))
        #t = time.time()
        self.classifier.fit(X_scaled, y_sign)
        #print('Class fit time:{:.2f}'.format(time.time()-t))
        self.is_fitted = True

    def predict(self, obs):
        # note: the MinMaxScaler fit in train() is not stored, so predict()
        # receives unscaled features; keeping the fitted scaler and applying it
        # here would make training and inference consistent
        obs = np.array(obs).reshape(1, -1)
        return self.regressor.predict(obs)[0], self.classifier.predict(obs)[0]
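# SGDClassifier also supports true incremental updates via partial_fit, which
# would avoid refitting on the whole deque on every train() call; a minimal
# hedged sketch (the mini-batch stream is hypothetical, and the class labels
# must be supplied on the first call):
import numpy as np
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(alpha=0.001, penalty='elasticnet', l1_ratio=0.5)
classes = np.array([-1.0, 0.0, 1.0])  # the sign labels used by the classifier above
# for X_batch, y_batch in stream:
#     clf.partial_fit(X_batch, y_batch, classes=classes)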
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

# In[638]:
# Shuffle the training data; permute a single index array so the rows of X and
# y stay aligned (the original permuted each independently, breaking the pairing)
import numpy as np
perm = np.random.permutation(len(X_train))
X_train1 = np.array(X_train)[perm]
Y_train1 = np.array(y_train)[perm]

# In[639]:
from sklearn.linear_model import SGDClassifier
lm = SGDClassifier()
lm.fit(X_train, y_train)

# In[640]:
# Find predicted values (pass the classifier defined above, not an undefined `model`)
from sklearn.model_selection import cross_val_predict
y_pred = cross_val_predict(lm, X_train1, Y_train1, cv=5)

# In[641]:
y_pred = y_pred.astype(int)

# In[642]:
y_pred
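# A hedged follow-up to the cells above: score the out-of-fold predictions
# (same Y_train1 and y_pred assumed):
from sklearn.metrics import accuracy_score, confusion_matrix

print(accuracy_score(Y_train1, y_pred))
print(confusion_matrix(Y_train1, y_pred))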