def sgdfeature(self, data):
    newdata = pd.DataFrame()
    # FeatureHasher pipeline for SGDClassifier
    preproc = Pipeline([('fh', FeatureHasher(n_features=2**20, input_type='string'))])
    newdata['app_id_specs'] = data['app_id'].values + data['app_domain'].values + data['app_category'].values
    newdata['app_dom_specs'] = data['app_domain'].values + data['app_category'].values
    newdata['site_id_specs'] = data['site_id'].values + data['site_domain'].values + data['site_category'].values
    newdata['site_dom_specs'] = data['site_domain'].values + data['site_category'].values
    # data['device'] = data['device_model'].values + data['device_type'].values.astype(str) + data['device_conn_type'].values.astype(str)
    newdata['type'] = data['device_type'].values + data['device_conn_type'].values
    newdata['domain'] = data['app_domain'].values + data['site_domain'].values
    newdata['category'] = data['app_category'].values + data['site_category'].values
    newdata['pos_cat'] = data['banner_pos'].values.astype(str) + data['app_category'].values + data['site_category'].values
    newdata['pos_dom'] = data['banner_pos'].values.astype(str) + data['app_domain'].values + data['site_domain'].values
    # data['pos_id'] = data['banner_pos'].values.astype(str) + data['app_id'].values + data['site_id'].values
    newdata['hour'] = data['hour'].map(lambda x: datetime.strptime(str(x), "%y%m%d%H"))
    newdata['dayoftheweek'] = newdata['hour'].map(lambda x: x.weekday())  # call weekday(); storing the bound method was a bug
    newdata['day'] = newdata['hour'].map(lambda x: x.day)
    newdata['hour'] = newdata['hour'].map(lambda x: x.hour)
    # note: the engineered hour-of-day column is dropped again here, as in the original
    newdata = newdata.drop('hour', axis=1)
    newdata = newdata.astype(str)
    del data
    X_dict = np.asarray(newdata)
    self.X_train = preproc.fit_transform(X_dict)
    return self.X_train
class MachineLearning(object):
    def __init__(self):
        # Initialize classifier and vectorizer
        self.clf = Pipeline([
            ('tfidf', TfidfVectorizer(min_df=1, ngram_range=(1, 2))),
            ('clf', MultinomialNB(alpha=.01)),
        ])

    def init_training(self):
        self.x_train = []
        self.y_train = []

    def add_training_data(self, data, label):
        self.x_train.append(data)
        self.y_train.append(label)

    # Train classifier.
    # Grid search can also be used to optimize accuracy, e.g.:
    #   parameters = {'tfidf__ngram_range': [(1, 1), (1, 2)],
    #                 'clf__alpha': (.01, .001)}
    #   gs_clf = GridSearchCV(clf, parameters, n_jobs=-1)
    def train(self):
        self.clf.fit(self.x_train, self.y_train)

    # Predict result.
    # Accuracy can be roughly estimated with a held-out set, e.g.:
    #   result = clf.predict(test_dc + test_marvel)
    #   baseline = [0] * len(test_dc) + [1] * len(test_marvel)
    #   print(np.sum(result == baseline) / float(len(result)))
    def predict(self, data):
        return self.clf.predict([data])[0]
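# The grid-search idea sketched in the comments above can be wired up directly;
# a minimal sketch (toy training data, not from the original), assuming
# scikit-learn's GridSearchCV and the step names defined in __init__:
from sklearn.model_selection import GridSearchCV

ml = MachineLearning()
ml.init_training()
for text_, label_ in [("the caped crusader patrols gotham", "dc"),
                      ("the dark knight returns", "dc"),
                      ("the avengers assemble", "marvel"),
                      ("iron man builds a new suit", "marvel")]:
    ml.add_training_data(text_, label_)

parameters = {'tfidf__ngram_range': [(1, 1), (1, 2)],
              'clf__alpha': (.01, .001)}
gs_clf = GridSearchCV(ml.clf, parameters, n_jobs=-1, cv=2)
gs_clf.fit(ml.x_train, ml.y_train)
print(gs_clf.best_params_)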
def KFOLDTEST(self, text, sent):
    k_fold = KFold(n=len(text), n_folds=6)
    pipeline = Pipeline([
        ("vectorizer", CountVectorizer(ngram_range=(1, 2), tokenizer=self.tokenize_data)),
        ("tfidf", TfidfTransformer(norm="l2", smooth_idf=False, use_idf=False)),
        ("classifier", OneVsOneClassifier(LinearSVC())),
    ])
    scores = []
    for train_indices, test_indices in k_fold:
        # print('Train: %s | test: %s' % (train_indices, test_indices))
        train_text = text[train_indices]
        train_y = sent[train_indices]
        test_text = text[test_indices]
        test_y = sent[test_indices]
        pipeline.fit(train_text, train_y)
        score = pipeline.score(test_text, test_y)
        scores.append(score)
    score = sum(scores) / len(scores)
    print("scores ", scores, " Score ", score)
    return score
def simple_classification_without_cross_fold_validation(x, y, estimator, scoring):
    '''Run normal SVM classification without cross-fold validation.'''
    # 30% reserved for validation
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

    # Feature selection, since we have a small sample space.
    fs = SelectPercentile(scoring, percentile=20)

    pipeline = Pipeline([('featureselector', fs),
                         ('scaler', StandardScaler()),
                         ('estimator', estimator)])
    pipeline = OneVsRestClassifier(pipeline)

    clfer = pipeline.fit(x_train, y_train)
    y_predict_train = clfer.predict(x_train)
    print("%% Accuracy on training set: %2.3f" % metrics.accuracy_score(y_train, y_predict_train))

    y_predict_test = clfer.predict(x_test)
    print("\n%% Accuracy on testing set: %2.3f" % metrics.accuracy_score(y_test, y_predict_test))

    print("\nClassification Report:")
    print(metrics.classification_report(y_test, y_predict_test))

    print("Confusion Matrix:")
    print(metrics.confusion_matrix(y_test, y_predict_test))
def train_clf(self):
    pipeline = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)),
        ("svc", LinearSVC(C=100))
    ])
    pipeline.fit(self.dataset.data, self.dataset.target)
    return pipeline
def test_one_rf():
    Xtrain_raw, ytrain_raw = load_raw_data("sentidata_train_raw.pkl")
    print("training data loaded")
    print_label_frequency(ytrain_raw)

    ############# create the pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=lambda x: x, max_features=3000)),
        ('tfidf', TfidfTransformer()),
        ('rf', RandomForestClassifier(n_estimators=500,
                                      max_depth=200,
                                      min_samples_split=10,
                                      oob_score=True,
                                      n_jobs=-1, verbose=1,
                                      class_weight='balanced')),
    ])

    ############# train
    pipeline.fit(Xtrain_raw, ytrain_raw)

    ############# check result
    rf = pipeline.steps[-1][1]
    print(rf.oob_score_)  # the original discarded this value as a bare expression

    ############# training error
    ytrain_predict = pipeline.predict(Xtrain_raw)
    print(classification_report(y_true=ytrain_raw, y_pred=ytrain_predict))
    print(confusion_matrix(y_true=ytrain_raw, y_pred=ytrain_predict))

    ############# testing error
    Xtest_raw, ytest_raw = load_raw_data("sentidata_test_raw.pkl")
    ytest_predict = pipeline.predict(Xtest_raw)
    print(accuracy_score(y_true=ytest_raw, y_pred=ytest_predict))
    print(classification_report(y_true=ytest_raw, y_pred=ytest_predict))
def test_set_pipeline_step_none():
    # Test setting Pipeline steps to None
    X = np.array([[1]])
    y = np.array([1])
    mult2 = Mult(mult=2)
    mult3 = Mult(mult=3)
    mult5 = Mult(mult=5)

    def make():
        return Pipeline([("m2", mult2), ("m3", mult3), ("last", mult5)])

    pipeline = make()

    exp = 2 * 3 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    pipeline.set_params(m3=None)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    assert_dict_equal(
        pipeline.get_params(deep=True),
        {"steps": pipeline.steps,
         "m2": mult2,
         "m3": None,
         "last": mult5,
         "m2__mult": 2,
         "last__mult": 5},
    )

    pipeline.set_params(m2=None)
    exp = 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    # for other methods, ensure no AttributeErrors on None:
    other_methods = ["predict_proba", "predict_log_proba",
                     "decision_function", "transform", "score"]
    for method in other_methods:
        getattr(pipeline, method)(X)

    pipeline.set_params(m2=mult2)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    pipeline = make()
    pipeline.set_params(last=None)
    # mult2 and mult3 are active
    exp = 6
    assert_array_equal([[exp]], pipeline.fit(X, y).transform(X))
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    assert_raise_message(AttributeError,
                         "'NoneType' object has no attribute 'predict'",
                         getattr, pipeline, "predict")

    # Check None step at construction time
    exp = 2 * 5
    pipeline = Pipeline([("m2", mult2), ("m3", None), ("last", mult5)])
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
def runCrossValidationTest(classifier_name, classifier_args=None, ngram=2, folds=5):
    if classifier_args is None:
        classifier_args = {}
    classifier = valid_classifiers[classifier_name](**classifier_args)
    X, y = load_non_preprocessed_data()
    # confusion = numpy.array([[0,0,0],[0,0,0],[0,0,0]])
    ml_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(sublinear_tf=True, ngram_range=(1, ngram))),
        ('Classifier', classifier),
    ])
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.25, random_state=0)
    ml_pipeline.fit(X_train, y_train)
    predictions = ml_pipeline.predict(X_test)
    confusion = confusion_matrix(y_test, predictions)
    f1 = f1_score(y_test, predictions, pos_label=None, average='micro')
    precision = precision_score(y_test, predictions, pos_label=None, average='micro')
    recall = recall_score(y_test, predictions, pos_label=None, average='micro')
    print(" >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    print("F1 score: " + str(f1))
    print("precision score: " + str(precision))
    print("recall score: " + str(recall))
    print(confusion)
    numpy.savetxt("data/test_results_confusion_matrix_" + classifier_name + ".csv",
                  confusion, delimiter=",")
    return (f1, precision, recall)
class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = Pipeline([
            ("RF", RandomForestRegressor(n_estimators=200, max_depth=15,
                                         n_jobs=N_JOBS))])
        self.scaler = StandardScaler()
        self.agglo = FeatureAgglomeration(n_clusters=500)

    def fit(self, X, y):
        y = y.ravel()
        n_samples, n_lags, n_lats, n_lons = X.shape
        self.scaler.fit(X[:, -1].reshape(n_samples, -1))
        X = X.reshape(n_lags * n_samples, -1)
        connectivity = grid_to_graph(n_lats, n_lons)
        self.agglo.connectivity = connectivity
        X = self.scaler.transform(X)
        X = self.agglo.fit_transform(X)
        X = X.reshape(n_samples, -1)
        self.clf.fit(X, y)

    def predict(self, X):
        n_samples, n_lags, n_lats, n_lons = X.shape
        X = X.reshape(n_lags * n_samples, -1)
        X = self.scaler.transform(X)
        X = self.agglo.transform(X)
        X = X.reshape(n_samples, -1)
        return self.clf.predict(X)
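# A minimal usage sketch for the Regressor above (toy data, not from the
# original). It assumes the module's own imports (grid_to_graph etc.) are in
# scope; N_JOBS is a module-level constant the class expects, so a stand-in is
# set here. The 25x25 grid keeps n_lats * n_lons above the 500 clusters that
# FeatureAgglomeration(n_clusters=500) requires.
import numpy as np

N_JOBS = 1  # assumption: stand-in for the project's constant

rng = np.random.RandomState(0)
X = rng.rand(20, 3, 25, 25)   # (n_samples, n_lags, n_lats, n_lons)
y = rng.rand(20, 1)

reg = Regressor()
reg.fit(X, y)
print(reg.predict(X).shape)   # -> (20,)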
def test():
    target_label = [u'weather', u'audio', u'pic', u'calculate', u'music', u'poem']
    training_text_raw = []
    training_label = []
    with open('./training_source.csv', 'r') as f:
        for line in f.readlines():
            line = line.strip().split('\t')
            if len(line) > 1 and line[1] in target_label:
                training_text_raw.append(unicode(line[0], "utf-8"))
                training_label.append(line[1])
    print training_label
    training_text = []
    for text in training_text_raw:
        seg_text = seg(text)
        training_text.append(seg_text)
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer(use_idf=False)),
                         ('clf', MultinomialNB()),
                         ])
    scores = cross_validation.cross_val_score(
        text_clf, training_text, training_label, cv=8)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    text_clf.fit(training_text, training_label)
    while True:
        k_text = raw_input("\nPlease input:")
        if k_text == "exit":
            break
        print text_clf.predict([seg(unicode(k_text, 'utf-8'))])
def main():
    corpus = capitalCorpus()
    transformer = textTransformer()

    continents = np.array(os.listdir('txt/'))

    for continent_dir in enumerate(continents):
        corpus = getText(continent_dir, corpus, transformer)

    # Split corpus into training set and test set
    train_X, test_X, train_Y, test_Y = train_test_split(
        corpus.data, corpus.target, test_size=0.25, random_state=54321)

    # Build a pipeline
    clf = MultinomialNB()
    count_vect = CountVectorizer()
    tfidf_transformer = TfidfTransformer(use_idf=True)

    clf_pipe = Pipeline([
        ('vectorizer', count_vect),
        ('tfidf', tfidf_transformer),
        ('classifier', clf)
    ]).fit(train_X, train_Y)

    predicted = clf_pipe.predict(test_X)
    print(classification_report(test_Y, predicted))
def clasificador(self, X_train, y_train, X_test, target_names, y_test, all_labels):
    lb = preprocessing.MultiLabelBinarizer()
    Y = lb.fit_transform(y_train)
    classifier = Pipeline([
        ('vectorizer', CountVectorizer(strip_accents='unicode')),
        ('tfidf', TfidfTransformer()),
        ('to_dense', DenseTransformer()),
        ('clf', OneVsRestClassifier(GaussianNB()))])
    classifier.fit(X_train, Y)
    predicted = classifier.predict(X_test)
    etiquetas = lb.inverse_transform(predicted)
    for i in range(0, len(etiquetas)):
        etiquetas[i] = list(etiquetas[i])
    valoresMacro = self.macro(etiquetas, y_test)
    valoresMicro = self.micro(etiquetas, y_test)
def svcDictVector():
    recipeData = getRecipeData()

    labels = [recipe['cuisine'] for recipe in recipeData]
    ingredientsFixtures = [sorted(set(e['ingredients'])) for e in recipeData]
    for i, w in enumerate(ingredientsFixtures):
        ingredientsFixtures[i] = dict(zip(w, [1] * len(w)))

    pipeline = Pipeline([
        ('dict', DictVectorizer()),
        ('variance', VarianceThreshold()),
        ('tfidf', TfidfTransformer()),
        ('bayes', svm.LinearSVC()),  # step name kept from the original, though the estimator is an SVM
    ])
    pipeline.fit(ingredientsFixtures, labels)
    print(pipeline)

    testRecipes = getTestData()
    testIngredientsFixtures = [sorted(set(e['ingredients'])) for e in testRecipes]
    for i, w in enumerate(testIngredientsFixtures):
        testIngredientsFixtures[i] = dict(zip(w, [1] * len(w)))

    predictions = pipeline.predict(testIngredientsFixtures)
    outputPercentCorrect(predictions)
    copyAndOutput(predictions, testRecipes)
def train(param_search=False):
    data = load_files(download())
    y = [data.target_names[t] for t in data.target]
    # The random state on the LR estimator is fixed to the most arbitrary value
    # that I could come up with. It is biased toward the middle number keys on
    # my keyboard.
    clf = Pipeline([('tfidf', TfidfVectorizer(min_df=2, dtype=float,
                                              sublinear_tf=True,
                                              ngram_range=(1, 2),
                                              strip_accents='unicode')),
                    ('lr', LogisticRegression(random_state=623, C=5000))])

    if param_search:
        params = {'tfidf__ngram_range': [(1, 1), (1, 2)],
                  'lr__C': [1000, 5000, 10000]}

        print("Starting parameter search for review sentiment classification")
        # We ignore the original folds in the data, preferring a simple 5-fold
        # CV instead; this is intended to get a working model, not results for
        # publication.
        gs = GridSearchCV(clf, params, cv=5, refit=True, n_jobs=-1, verbose=2)
        gs.fit(data.data, y)

        print("Parameters found:")
        pprint(gs.best_params_)
        print("Cross-validation accuracy: %.3f" % gs.best_score_)

        return gs.best_estimator_
    else:
        print("Training logistic regression for movie review polarity")
        return clf.fit(data.data, y)
def create_union_model(params=None):
    def preprocessor(tweet):
        tweet = tweet.lower()
        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.items():
            tweet = re.sub(r, repl, tweet)
        return tweet.replace("-", " ").replace("_", " ")

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor, analyzer="word")
    ling_stats = LinguisticVectorizer()
    all_features = FeatureUnion([('ling', ling_stats), ('tfidf', tfidf_ngrams)])
    # all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
    # all_features = FeatureUnion([('ling', ling_stats)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
def predictfactors(self):
    pipeline = Pipeline([("imputer", Imputer(strategy='mean', axis=0)),
                         ("logistic", LogisticRegression())])
    predict = pipeline.fit(self.X_train, self.y_train).predict(self.X_test)

    firstfactor = []
    secondfactor = []
    thirdfactor = []

    res = np.array(pipeline.named_steps['logistic'].coef_ * self.X_test.iloc[[0]])
    threemaxindexes = np.array((-res).argsort().ravel())  # e.g. [3 1 5 4 2 6 0]
    print(res)
    print(threemaxindexes)
    print(self.names)
    # sys.exit()

    for i in range(0, len(self.X_test)):
        res = np.array(pipeline.named_steps['logistic'].coef_ * self.X_test.iloc[[i]])
        threemaxindexes = np.array((-res).argsort().ravel())
        firstfactor.append(self.names[threemaxindexes[0]])
        secondfactor.append(self.names[threemaxindexes[1]])
        thirdfactor.append(self.names[threemaxindexes[2]])

    for i in range(0, len(self.X_test)):
        print([self.df[self.idcol][i], predict[i],
               firstfactor[i], secondfactor[i], thirdfactor[i]])
def train_optimal_classifier(clf, X, y, params, scale=False, folds=1000):
    combined_features = FeatureUnion([("pca", PCA()),
                                      ("univ_select", SelectKBest())])
    if scale:
        pipeline = Pipeline([("minmax", MinMaxScaler()),
                             ("features", combined_features),
                             ("clf", clf)])
    else:
        pipeline = Pipeline([("features", combined_features),
                             ("clf", clf)])

    param_grid = dict(
        features__pca__n_components=[0, 1, 3, 6, 9, 12, 15],
        features__univ_select__k=list(range(0, len(X[0]))))
    for k, v in params.items():  # .items() works on both Python 2 and 3
        param_grid["clf__" + k] = v

    grid_search = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        cv=cross_validation.StratifiedShuffleSplit(y, folds),
        verbose=1,
        scoring='f1',
        error_score=0,
        refit=True,
    )
    grid_search.fit(X, y)
    return (grid_search.best_estimator_, grid_search.best_score_, pipeline.fit(X, y))
class Model10(Model):
    def __init__(self):
        pass

    def fit(self, Xmask, y):
        pr = prepare.Prepare_0(model=10, preproc=1, min_df=1,
                               use_svd=False, tfidf=2, stemmer=0)
        (X_all_df, _, BP, params) = pr.load_transform(update=False)
        names = list(X_all_df.columns)
        X_all = np.asarray(X_all_df)
        self.X_all, self.names = X_all, names

        clf0 = GaussianNB()
        clf1 = MultinomialNB(alpha=0.8)
        clf2 = BernoulliNB(alpha=1, binarize=0.01)
        clf = clf1

        self.rd = Pipeline([
            ("trans", Transformer(names=self.names, X_all=X_all, BP=BP)),
            # ("scaler", StandardScaler(with_mean=False)),
            ("est", clf)
        ])
        self.rd.fit(Xmask, np.asarray(y))
        return self

    def predict_proba(self, Xmask):
        return self.rd.predict_proba(Xmask)

    def predict(self, Xmask):
        return self.rd.predict(Xmask)

    def starter(self):
        print("Model10 starter")
        self.fit(np.arange(100), np.arange(100))
class Classifier:
    def __init__(self, clf, scaler=None, selector=False):
        if scaler:
            if selector:
                self.clf = Pipeline([
                    ('scaler', scaler),
                    ('selector', SelectFromModel(
                        SELECTOR_POOL['extra_trees_classifier'], .001)),
                    ('classifier', clf)
                ])
            else:
                self.clf = Pipeline([
                    ('scaler', scaler),
                    ('classifier', clf)
                ])
        else:
            if selector:
                self.clf = Pipeline([
                    ('selector', SelectFromModel(
                        SELECTOR_POOL['extra_trees_classifier'], .001)),
                    ('classifier', clf)
                ])
            else:
                self.clf = clf

    def __str__(self):
        if isinstance(self.clf, Pipeline):
            return ', '.join(type(v).__name__ for k, v in self.clf.steps)
        return type(self.clf).__name__

    def fit(self, X, y):
        self.clf.fit(X, y)

    def predict(self, X):
        return self.clf.predict(X)
def op_machine_predict(self):
    """
    Unlike machine_predict, op_machine_predict determines the final
    parameters in each window via grid search. Training and prediction
    proceed as follows for each window of data:
    1) Standardize the input ta_factors
    2) Feature selection (method is selectable)
    3) PCA dimensionality reduction
    4) Train and grid-search
    """
    ta_factors, labels = self.set_factors_labels()
    svc = SVC(kernel='linear')
    min_max_scaler = preprocessing.MinMaxScaler()
    pre = pd.DataFrame(index=ta_factors.index[self.window_size:],
                       columns=['pre_label', 'pre_actual'])
    Cs = range(10, 100, 10)
    gammas = range(5, 100, 5)
    n_s = self.window_size
    for num in range(0, len(ta_factors) - n_s):
        ta_factors_scaled = min_max_scaler.fit_transform(
            ta_factors.ix[num:num + n_s + 1])
        x_train = ta_factors_scaled[:-1]
        x_test = ta_factors_scaled[-1:]
        y_train = labels[num:num + n_s]
        y_test = labels[num + n_s]
        # ta_factors_scaled_pca = pca.fit_transform(ta_factors_scaled)
        rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(y_train, 2))
        clf = Pipeline([('feature_select', rfecv), ('svm', SVC())])
        # estimator = GridSearchCV(clf, dict(svm__C=Cs, svm__gamma=gammas))
        pre_model = clf.fit(x_train, y_train)
        pre['pre_label'][num] = pre_model.predict(x_test).item()
        pre['pre_actual'][num] = y_test
    pre['pre_acu'] = pre['pre_label'] == pre['pre_actual']
    self.prediction_results = pre
    return pre
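# The docstring above promises a grid search per window, yet GridSearchCV is
# left commented out in the loop body. A minimal sketch of enabling it, using
# the Cs/gammas grids and the 'svm' step name already defined above:
#
#     estimator = GridSearchCV(clf, dict(svm__C=Cs, svm__gamma=gammas))
#     pre_model = estimator.fit(x_train, y_train)  # refit=True keeps the best C/gamma
#     pre['pre_label'][num] = pre_model.predict(x_test).item()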
def train_polynomialRegressionModel(X, y, degree=2, interaction_only=False,
                                    include_bias=True):
    """
    Train a polynomial model using a Linear Regression pipeline
    with the given degree.
    """
    # pass interaction_only/include_bias through; the original accepted
    # but silently ignored them
    model = Pipeline([("poly", PolynomialFeatures(degree=degree,
                                                  interaction_only=interaction_only,
                                                  include_bias=include_bias)),
                      ("linear", LinearRegression(fit_intercept=False))])
    model = model.fit(X, y)
    return model
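# A quick usage sketch for the helper above (toy data, not from the original):
import numpy as np

X = np.linspace(-3, 3, 50).reshape(-1, 1)
y = 2 * X.ravel() ** 2 - X.ravel() + 1

model = train_polynomialRegressionModel(X, y, degree=2)
print(model.predict([[2.0]]))   # close to 2*4 - 2 + 1 = 7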
def train_regressor(data, X_columns, y_show=y_init + y_curr):
    X = data.loc[:, X_columns]
    ys = data.loc[:, [i for i in y_show if i not in X_columns]]

    print()
    for n_trees in [256]:  # list(range(4, 16)) + [18, 20] + [2**n for n in range(4, 12)]
        forest = Pipeline(steps=[
            ('forest', ExtraTreesRegressor(  # RandomForestRegressor(
                n_estimators=n_trees,
                n_jobs=min(n_trees, 62),
                oob_score=True,
                bootstrap=True))])
        start = time()
        forest.fit(X, ys)  # new_ys
        end = time()
        print(n_trees, forest.steps[0][1].oob_score_, end - start)
        print()

    print("%.5g seconds to train regressor" % (end - start))
    print()
    y_names = ys.columns
    X_names = X.columns
    return [forest, y_names, X_names]
def classify(text, label):
    #~ Testing purpose: 10-fold cross validation
    cv = KFold(n=len(label), n_folds=10)
    n_c = [100, 200, 500, 1000, 2000, 5000, 10000]
    for i in n_c:
        clf = Pipeline([
            ('vect', TfidfVectorizer(analyzer='word',
                                     ngram_range=(1, 1),
                                     stop_words='english',
                                     lowercase=True,
                                     token_pattern=r'\b\w+\b',
                                     tokenizer=tokenize_doc,
                                     min_df=1)),
            ('dim_reduction', TruncatedSVD(n_components=i)),
            #~ ('feature_selection', SelectKBest(chi2, k=35)),
            ('classification', LogisticRegression())
            #~ SVC(kernel='linear')
        ])
        print("len(label) ", len(label), " | text ", len(text))
        print("")
        clf.fit(np.asarray(text), np.asarray(label))
        cv_score = cross_val_score(clf, text, label, cv=cv, verbose=1)
        print("Log Reg | n_c = ", i)
        print("Accuracy List ", cv_score, " | Avg Accuracy ", np.mean(cv_score))
class Vectorizer():
    def __init__(self, hash=False, min_df=0.015, max_df=0.9):
        """
        `min_df` is set to filter out extremely rare words,
        since we don't want those to dominate the distance metric.

        `max_df` is set to filter out extremely common words,
        since they don't convey much information.
        """
        if hash:
            args = [
                ('vectorizer', HashingVectorizer(input='content',
                                                 stop_words='english',
                                                 lowercase=True,
                                                 tokenizer=Tokenizer())),
                ('tfidf', TfidfTransformer(norm=None, use_idf=True,
                                           smooth_idf=True)),
                ('feature_reducer', TruncatedSVD(n_components=400)),
                ('normalizer', Normalizer(copy=False))
            ]
        else:
            args = [
                ('vectorizer', CountVectorizer(input='content',
                                               stop_words='english',
                                               lowercase=True,
                                               tokenizer=Tokenizer(),
                                               min_df=min_df,
                                               max_df=max_df)),
                ('tfidf', TfidfTransformer(norm=None, use_idf=True,
                                           smooth_idf=True)),
                ('normalizer', Normalizer(copy=False))
            ]

        self.pipeline = Pipeline(args)

    def vectorize(self, docs, train=False):
        if train:
            return self.pipeline.fit_transform(docs)
        else:
            return self.pipeline.transform(docs)

    @property
    def vocabulary(self):
        return self.pipeline.named_steps['vectorizer'].get_feature_names()
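# A short usage sketch for the Vectorizer above (toy documents, not from the
# original). The real project supplies its own Tokenizer; the trivial
# stand-in below is a hypothetical placeholder so the sketch is runnable:
class Tokenizer:
    """Hypothetical stand-in for the project's tokenizer callable."""
    def __call__(self, doc):
        return doc.split()

docs = ["the quick brown fox jumps over the lazy dog",
        "machine learning pipelines vectorize text",
        "tfidf weighting and svd reduce the matrix"]

vec = Vectorizer(hash=False, min_df=1, max_df=1.0)  # integer min_df suits a tiny corpus
train_matrix = vec.vectorize(docs, train=True)      # fit + transform
new_matrix = vec.vectorize(["more text to embed"])  # transform only
print(train_matrix.shape, new_matrix.shape)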
def train(docs):
    """
    Trains and serializes (pickles) a vectorizing pipeline
    based on training data.

    `min_df` is set to filter out extremely rare words,
    since we don't want those to dominate the distance metric.

    `max_df` is set to filter out extremely common words,
    since they don't convey much information.
    """
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(input='content',
                                       stop_words='english',
                                       lowercase=True,
                                       tokenizer=Tokenizer(),
                                       min_df=0.015,
                                       max_df=0.9)),
        ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
        ('feature_reducer', TruncatedSVD(n_components=100)),
        ('normalizer', Normalizer(copy=False))
    ])

    print('Training on {0} docs...'.format(len(docs)))
    pipeline.fit(docs)

    global PIPELINE  # without this, the assignment below was a dead local binding
    PIPELINE = pipeline

    print('Serializing pipeline to {0}'.format(PIPELINE_PATH))
    with open(PIPELINE_PATH, 'wb') as pipeline_file:
        pickle.dump(pipeline, pipeline_file)
    print('Training complete.')
def test_multiple_cols(self):
    t = bt.Split_transform(input_features=["a", "b"], output_feature="res")
    df = pd.DataFrame.from_dict([{"a": "a b", "b": "c d", "c": 3},
                                 {"a": "word1", "b": "word2"}])
    transformers = [("split_transform", t)]
    p = Pipeline(transformers)
    df2 = p.transform(df)
    self.assertTrue(len(df2["res"][0]) == 4)
def test_multiple_cols_numbers_ignored(self):
    t = bt.Split_transform(input_features=["a", "b"],
                           ignore_numbers=True,
                           output_feature="res")
    df = pd.DataFrame.from_dict([{"a": "a b", "b": "c 1", "c": 3}])
    transformers = [("split_transform", t)]
    p = Pipeline(transformers)
    df2 = p.transform(df)
    self.assertTrue(len(df2["res"][0]) == 3)
class ModelPipeline(object):
    def __init__(self, clf):
        self.columns = []
        self.pipeline = Pipeline([
            ('clf', clf)
        ])

    def fit(self, X_train, y_train):
        self.pipeline.fit(X_train, y_train)
        self.columns = list(X_train.columns)

    def predict(self, X_test):
        return self.pipeline.predict(X_test)

    def feat_importances(self, n=10, string=True):
        imp = self.pipeline.steps[0][1].feature_importances_
        if string:
            return ''.join('%s: %s%%\n' % (self.columns[feat],
                                           round(imp[feat] * 100, 3))
                           for feat in np.argsort(imp)[-1:-(n + 1):-1])
        else:
            return self.columns[np.argsort(imp)[-1:-(n + 1):-1]], \
                sorted(imp)[-1:-(n + 1):-1]

    def grid_search(self, X, y):
        parameters = {
            'clf__n_estimators': [100, 200, 300],
            'clf__max_features': ['sqrt', 50, 80],
            'clf__max_depth': [None, 50, 100],
            'clf__oob_score': [False, True],
            'clf__random_state': [29],
            'clf__class_weight': ['balanced', None, 'balanced_subsample'],
            'clf__min_samples_split': [2, 10, 20]
        }
        grid_search = GridSearchCV(self.pipeline, parameters,
                                   n_jobs=-1, verbose=1, scoring="recall")

        print("Performing grid search...")
        print("pipeline:", [name for name, _ in self.pipeline.steps])
        print("parameters:")
        pprint(parameters)
        t0 = time()
        grid_search.fit(X, y)
        print("done in %0.3fs" % (time() - t0))
        print()

        print("Best score: %0.3f" % grid_search.best_score_)
        print("Best parameters set:")
        best_parameters = grid_search.best_estimator_.get_params()
        for param_name in sorted(parameters.keys()):
            print("\t%s: %r" % (param_name, best_parameters[param_name]))

        return best_parameters
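# A minimal usage sketch for ModelPipeline (toy data, not from the original).
# The clf__* grid above presupposes a forest-style estimator such as
# RandomForestClassifier:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

X = pd.DataFrame(np.random.rand(100, 3), columns=['f1', 'f2', 'f3'])
y = np.random.randint(0, 2, 100)

mp = ModelPipeline(RandomForestClassifier(n_estimators=100))
mp.fit(X, y)
print(mp.feat_importances(n=3))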
def test_sklearn_pipeline(self):
    df = pd.DataFrame.from_dict([{"a": "something", "b": 1},
                                 {"a": "something2"}])
    t = bt.Exclude_features_transform(excluded=["b"])
    transformers = [("exclude_transform", t)]
    p = Pipeline(transformers)
    df2 = p.fit_transform(df)
    self.assertEquals(len(df2.columns), 1)
def predict():
    pipeline = Pipeline([
        ('min/max scaler', MinMaxScaler(feature_range=(0.0, 1.0))),
        ('neural network', Classifier(layers=[Layer("ExpLin", units=5),
                                              Layer("Softmax")],
                                      n_iter=25))])
    X = np.load('All_features.npz')['arr_0']
    D = np.load('Akunin_features.npz')['arr_0']
    all_samples = [1] * 141 + [0] * 123
    y = np.array(all_samples)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.0,
                                                        random_state=0)
    pipeline.fit(X_train, y_train)
    pickle.dump(pipeline, open('NeuralNet_model.pkl', 'wb'))
    prediction = pipeline.predict(D)
    probs = pipeline.predict_proba(D)
    gradation = {1.01: 5, 0.9: 4, 0.8: 3, 0.7: 2, 0.6: 1}
    ress1 = []
    simple_predicts = []
    scale_predicts = []
    for i in prediction:
        simple_predicts.append(i[0])
    for i in probs:
        scale_predicts.append(i[1] * 10)
        compare = []
        for u in gradation:
            if i[1] < u:
                compare.append(gradation[u])
        ress1.append(min(compare))
    return simple_predicts, scale_predicts
def model_pipeline(X, Y, model_params):
    if model_params["name"] == 'logistic':
        if model_params["bin"]:
            pclf = Pipeline([
                ('vect', CountVectorizer(stop_words='english',
                                         max_features=model_params["num_words"],
                                         ngram_range=model_params['ngram'],
                                         binary=model_params['bin'])),
                ('norm', Normalizer()),
                ('clf', LogisticRegression(solver='lbfgs')),
            ])
        else:
            pclf = Pipeline([
                ('vect', CountVectorizer(stop_words='english',
                                         max_features=model_params["num_words"],
                                         ngram_range=model_params['ngram'],
                                         binary=model_params['bin'])),
                ('tfidf', TfidfTransformer()),
                ('norm', Normalizer()),
                ('clf', LogisticRegression(solver='lbfgs')),
            ])
    elif model_params["name"] == 'tree':
        pclf = Pipeline([
            ('vect', CountVectorizer(stop_words='english',
                                     max_features=model_params["num_words"],
                                     ngram_range=model_params['ngram'],
                                     binary=model_params['bin'])),
            ('tfidf', TfidfTransformer()),
            ('norm', Normalizer()),
            ('clf', tree.DecisionTreeClassifier(max_depth=model_params['depth'])),
        ])
    elif model_params["name"] == 'svm':
        pclf = Pipeline([
            ('vect', CountVectorizer(stop_words='english',
                                     max_features=model_params["num_words"],
                                     ngram_range=model_params['ngram'],
                                     binary=model_params['bin'])),
            ('tfidf', TfidfTransformer()),
            ('norm', Normalizer()),
            ('scaler', StandardScaler(with_mean=False)),
            ('clf', svm.SVC(max_iter=model_params["iter"], gamma="scale",
                            probability=model_params["prob"])),
        ])
    elif model_params["name"] == 'gnb':
        pclf = Pipeline([
            ('vect', CountVectorizer(stop_words='english',
                                     max_features=model_params["num_words"],
                                     ngram_range=model_params['ngram'],
                                     binary=model_params['bin'])),
            # ('tfidf', TfidfTransformer()),
            ('norm', Normalizer()),
            ('clf', MultinomialNB()),
        ])
    elif model_params["name"] == 'stacklog':
        pclf = StackingLogistic(model_params)
    pclf.fit(X, Y)
    return pclf
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn import metrics

filepath = '20news-bydate-train'
rawData = datasets.load_files(filepath, encoding="latin1")

count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(rawData.data)

tfidf_transformer = TfidfTransformer()
tfid = tfidf_transformer.fit_transform(x_train_counts)

clf = MultinomialNB().fit(tfid, rawData.target)

test_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfid', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
test_clf.fit(rawData.data, rawData.target)

testData = datasets.load_files("20news-bydate-test", encoding="latin1")
predicted = test_clf.predict(testData.data)
result = metrics.classification_report(testData.target, predicted,
                                       target_names=testData.target_names)
print(result)
import views_utils.dbutils as dbutils

sys.path.insert(0, "../../../osa")
from osa.wrapper_sm import SMLogit
import osa.utils as osa

uname = "VIEWSADMIN"
prefix = "postgresql"
db = "views"
port = "5432"
hostname = "VIEWSHOST"
connectstring = dbutils.make_connectstring(prefix, db, uname, hostname, port)

rf_500 = RandomForestClassifier(n_estimators=500, n_jobs=10)
scaler = StandardScaler()
pipe_rf_500 = Pipeline([('scaler', scaler), ('rf', rf_500)])

output_schema = "landed_test"
output_table = "osa_pgm_acled_protest_eval_calib_pr"

models = [{
    "dir_pickles": "$SNIC_TMP/osa/pickles/osa_pgm_acled_protest_eval_calib_pr/pgm_acled_protest_eval_calib_logit_fullsample_pr",
    "estimator": SMLogit(),
    "features": [
        "l2_acled_dummy_pr", "l3_acled_dummy_pr", "l4_acled_dummy_pr",
        "l5_acled_dummy_pr", "l6_acled_dummy_pr", "l7_acled_dummy_pr",
        "l8_acled_dummy_pr", "l9_acled_dummy_pr", "l10_acled_dummy_pr",
        "l11_acled_dummy_pr", "l12_acled_dummy_pr",
        "q_1_1_l2_acled_dummy_pr", "q_1_1_l3_acled_dummy_pr",
        "l1_acled_dummy_pr",
    'SURF_medissim': SURF(learned_metric_func=produce_learned_metric_func),
    'SURF': SURF(),
}

# Initialize dictionary for storing results.
res_dict = dict.fromkeys(rbas.keys())
for key in res_dict.keys():
    res_dict[key] = np.empty(NUM_FEATURES_TO_SELECT_LIM, dtype=np.float)

# Go over RBAs.
for rba_name in rbas.keys():
    print("### Testing {0} ###".format(rba_name))

    # Initialize next pipeline.
    clf_pipeline = Pipeline([('scaling', StandardScaler()),
                             ('rba', rbas[rba_name]),
                             ('clf', clf)])

    # Go over values on x axis.
    for num_features_to_select in np.arange(1, NUM_FEATURES_TO_SELECT_LIM + 1):
        print("{0}/{1}".format(num_features_to_select,
                               NUM_FEATURES_TO_SELECT_LIM))

        # Set parameter.
        clf_pipeline.set_params(
            rba__n_features_to_select=num_features_to_select)

        # Compute score of 10 runs of 10 fold cross-validation.
        score = np.mean(
            cross_val_score(clf_pipeline, data,
def __init__(self, **kwargs):
    """
    set _node_transformer, _edge_transformer, tdifNodeTextVectorizer
    """
    FeatureDefinition.__init__(self)

    nbTypes = self._getTypeNumber(kwargs)

    block_transformer = FeatureUnion([  # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        ("xywh", Pipeline([
            ('selector', NodeTransformerXYWH_v2()),
            # v1: ('xywh', StandardScaler(copy=False, with_mean=True, with_std=True))  # use in-place scaling
            ('xywh', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
        ])),
        ("neighbors", Pipeline([
            ('selector', NodeTransformerNeighbors()),
            # v1: ('neighbors', StandardScaler(copy=False, with_mean=True, with_std=True))  # use in-place scaling
            ('neighbors', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
        ])),
        ("1hot", Pipeline([
            ('1hot', Node1HotFeatures())  # does the 1-hot encoding directly
        ]))
    ])
    grid_line_transformer = GridLine_NodeTransformer_v2()

    self._node_transformer = TransformerListByType(
        [block_transformer, grid_line_transformer])

    edge_BB_transformer = FeatureUnion([  # CAREFUL IF YOU CHANGE THIS - see cleanTransformers method!!!!
        ("1hot", Pipeline([
            ('1hot', Edge1HotFeatures(PageNumberSimpleSequenciality()))
        ])),
        ("boolean", Pipeline([
            ('boolean', EdgeBooleanFeatures_v2())
        ])),
        ("numerical", Pipeline([
            ('selector', EdgeNumericalSelector()),
            # v1: ('numerical', StandardScaler(copy=False, with_mean=True, with_std=True))  # use in-place scaling
            ('numerical', QuantileTransformer(n_quantiles=self.n_QUANTILES, copy=False))  # use in-place scaling
        ]))
    ])
    edge_BL_transformer = Block2GridLine_EdgeTransformer()
    edge_LL_transformer = GridLine2GridLine_EdgeTransformer()
    self._edge_transformer = TransformerListByType([
        edge_BB_transformer,
        edge_BL_transformer,
        edge_BL_transformer,  # useless but required
        edge_LL_transformer
    ])

    self.tfidfNodeTextVectorizer = None  # tdifNodeTextVectorizer
plt.scatter(x_train.T[0], x_train.T[1], c=y_train.ravel(), edgecolors='k',
            s=40, cmap=plt_dark)  # all data points
plt.xlabel(u'Feature attribute 1', fontsize=15)
plt.ylabel(u'Feature attribute 2', fontsize=15)
plt.xlim(x1_min, x1_max)
plt.ylim(x2_min, x2_max)
plt.grid(True)
plt.title(u'Decision tree classification of the iris data', fontsize=18)
plt.show()

# Parameter tuning
pipe = Pipeline([('mms', MinMaxScaler()),
                 ('skb', SelectKBest(chi2)),
                 ('pca', PCA()),
                 ('decision', DecisionTreeClassifier())])

# Parameter grid
parameters = {
    "skb__k": [1, 2, 3, 4],
    "pca__n_components": [0.5, 1.0],
    "decision__criterion": ["gini", "entropy"],
    "decision__max_depth": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
}
# Data
x_train2, x_test2, y_train2, y_test2 = x_train1, x_test1, y_train1, y_test1
# Build the model
gscv = GridSearchCV(pipe, param_grid=parameters)
# Train the model
gscv.fit(x_train2, y_train2)
# The optimal solution found by the search
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC
import time

start_computing_time = time.time()

estimators = [("PCA", PCA()), ("nmz", Normalizer()), ("LSVC", LinearSVC())]
model = Pipeline(steps=estimators)

param_grid = {"LSVC__loss": ['hinge'],
              "LSVC__class_weight": ['balanced']}

Grid = GridSearchCV(model, param_grid=param_grid, cv=5)
Grid.fit(X_train, Y_train)

Best_Grid_estimator = Grid.best_estimator_
Best_Grid_estimator.fit(X_train, Y_train)
print(Best_Grid_estimator)

pred = Best_Grid_estimator.predict(X_test)
print("Accuracy of predictions:")
print(accuracy_score(Y_test, pred))
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from nlp4musa2020.dataloaders.alf200k import ALF200KLoader
from nlp4musa2020.dataloaders.alf200k import genre_target_labels
import nlp4musa2020.evaluators as evaluators

dataloader = ALF200KLoader(
    path='data/processed/dataset-lfm-genres.pickle',
    load_feature_groups=[
        'audio',
    ],
    text_vectorizers=None,
    target=genre_target_labels(),
)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(n_jobs=-1)),
])

evaluator = GridEvaluator(
    parameters={
        'model__n_estimators': [10, 100, 300],
    },
    grid_parameters=evaluators.grid_parameters_genres(),
)

result_handlers = [
    result_handlers.print_gridsearch_results,
]
def train_and_fit_models(df_preprocessed, filename_input, param_grid=[],
                         save_all=False, personal_note=""):
    """
    Train model(s) on a preprocessed dataset and return the gridsearch object.

    Parameters
    ----------
    df_preprocessed : pd.DataFrame()
    filename_input : str
    param_grid : list(dict(str: list()))
    save_all : bool
        Whether to save the DataFrame and settings.
    personal_note : str
        String added to the filename to make recognition of the saved
        files easier.

    Returns
    -------
    Complete gridsearch object, where `gridsearchobject.best_estimator_`
    will give the best model.
    """
    # Delete rows where target value is NaN
    df = df_preprocessed.copy()
    df = drop_nan_from_specific_columns(df, settings.train['Y_VALUE'])

    # Make X and y dataset
    X = df.drop(settings.Y_TARGET_COLS, axis=1)
    y = df[settings.train['Y_VALUE']]

    # Split X and y into train and test dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=settings.train['TEST_SIZE'],
        random_state=settings.train['RANDOM_STATE'])

    # Construct basic pipeline for gridsearch
    pl_gs_total = Pipeline([('clf', LinearRegression())])  # placeholder estimator

    # Make/use param_grid for all classifiers and hyper parameters
    if len(param_grid) == 0:
        param_grid_total = [
            {'clf': [LinearRegression()],
             'clf__normalize': settings.train['GRIDSEARCH_NORMALIZE']},
            {'clf': [Ridge()],
             'clf__alpha': settings.train['GRIDSEARCH_ALPHA']},
            {'clf': [Lasso()],
             'clf__alpha': settings.train['GRIDSEARCH_ALPHA']},
            {'clf': [KNeighborsRegressor()],
             'clf__n_neighbors': settings.train['GRIDSEARCH_NEIGHBORS']},
            {'clf': [XGBRegressor()],
             'clf__gamma': settings.train['GRIDSEARCH_GAMMA'],
             'clf__n_estimators': settings.train['GRIDSEARCH_N_ESTIMATORS']},
        ]
    else:
        param_grid_total = param_grid

    # Initiate gridsearch object
    grid_search_total = GridSearchCV(pl_gs_total, param_grid_total,
                                     cv=settings.train['CROSS_VALIDATE'],
                                     scoring=settings.train['MODEL_SCORING'],
                                     return_train_score=True)

    # Fit gridsearch object on to the data
    grid_search_total.fit(X_train, y_train)

    # Get the best estimator (based on best train score)
    print(f"The model with the best train score is:\n{grid_search_total.best_estimator_['clf']}")
    # Calculate the RMSE for best estimator
    print(f"This model has a train score (RMSE) of: {rmse_from_neg_mean_squared_error(grid_search_total.best_score_)}")
    print(f"This model has a test score (RMSE) of: {rmse_from_gridsearch_best_estimator(grid_search_total, X_test, y_test)}")

    # Save
    if save_all:
        # Save the best estimator of the gridsearch in a Pickle file
        suffix_datetime = datetime.strftime(datetime.now(), format='%Y%m%d%H%M')
        filename_output = f'best_model_{suffix_datetime}_{personal_note}'
        pickle.dump(grid_search_total.best_estimator_,
                    open(f'{settings.DATAPATH}{filename_output}.pickle', 'wb'))

        # Save log of train step
        df_log = pd.DataFrame({
            "Model": [split_clf_and_params(grid_search_total.best_estimator_['clf'])[0]],
            "Gridsearch_Params": [split_clf_and_params(grid_search_total.best_estimator_['clf'])[1]],
            "Train_RMSE": [rmse_from_neg_mean_squared_error(grid_search_total.best_score_)],
            "Test_RMSE": [rmse_from_gridsearch_best_estimator(grid_search_total, X_test, y_test)],
            "Number_of_features": [len(X.columns)],
            "Y_value": settings.train['Y_VALUE'],
            "Input_filename": [filename_input],
            "Output_filename": [filename_output],
        })
        df_log.to_csv(settings.train['LOG_PATH'] + filename_output + '.csv',
                      index=False, header=True)

    return grid_search_total
class SimpleModel:
    def __init__(self, train_dataset, val_dataset):
        tfidf = TfidfVectorizer(
            min_df=0.01,
            max_df=0.9,
            max_features=10000,
            # stop_words='english',
            use_idf=True,
            ngram_range=(1, 3),
            tokenizer=tokenize_text)
        self.model = Pipeline([("Vectorizer", tfidf),
                               ("model", XGBClassifier(seed=utils.RANDOM_SEED))])
        self.search_parameters = {
            "Vectorizer__min_df": [0, 0.05, 0.1],
            "Vectorizer__max_df": [0.9, 0.95, 1],
            "Vectorizer__max_features": [1000, 5000, 10000],
            "Vectorizer__ngram_range": [(1, 2), (1, 3), (1, 4)]
        }
        self.train_X = train_dataset["opinion"]
        self.train_y = train_dataset["outcome"]
        self.val_X = val_dataset["opinion"]
        self.val_y = val_dataset["outcome"]
        split = PredefinedSplit(
            np.concatenate((np.repeat(-1, len(self.train_y)),
                            np.repeat(0, len(self.val_y)))))
        self.search = GridSearchCV(self.model,
                                   self.search_parameters,
                                   cv=split,
                                   scoring="f1",
                                   verbose=2,
                                   n_jobs=4)

    def fit(self):
        X = np.concatenate((self.train_X, self.val_X))
        y = np.concatenate((self.train_y, self.val_y))
        self.search.fit(X, y)
        self.model = self.search.best_estimator_
        return self.search.cv_results_

    def evaluate(self, dataset):
        return self.model.score(dataset["opinion"], dataset["outcome"])

    def load(self, result_dataset):
        params = result_dataset.sort_values("rank_test_score")
        params["param_Vectorizer__ngram_range"] = params["param_Vectorizer__ngram_range"]\
            .apply(lambda s: s.replace("(", "").replace(")", "").split(", "))\
            .apply(lambda q: (int(q[0]), int(q[1])))
        tfidf = TfidfVectorizer(
            min_df=params["param_Vectorizer__min_df"].values[0],
            max_df=params["param_Vectorizer__max_df"].values[0],
            max_features=params["param_Vectorizer__max_features"].values[0],
            # stop_words='english',
            use_idf=True,
            ngram_range=params["param_Vectorizer__ngram_range"].values[0],
            tokenizer=tokenize_text)
        self.model = Pipeline([("Vectorizer", tfidf),
                               ("model", XGBClassifier())])
        self.model.fit(self.train_X, self.train_y)

    def predict(self, documents):
        if type(documents) == str:
            return self.model.predict_proba([documents])
        return self.model.predict_proba(documents)

    def get_model_vocabulary(self):
        return self.model.named_steps["Vectorizer"].vocabulary_

    def get_model_max_ngrams(self):
        return self.model.get_params()["Vectorizer__ngram_range"][1]
    sentiments.append(int(elements[0]))

# SVM Solution
# Data_txt preprocessing - tokenization, selecting the best features
vectorizer = CountVectorizer(tokenizer=negation_handling.tokenize,
                             ngram_range=(1, 1), max_df=0.5, lowercase=False)
tfidfTans = TfidfTransformer(use_idf=True, sublinear_tf=True,
                             smooth_idf=False, norm='l2')
classifier = Pipeline([
    ('vect', vectorizer),
    ('tfidf', tfidfTans),
    ('feature_selection', SelectPercentile(chi2, percentile=93)),
    ('clf', LinearSVC(C=0.10000000000000001, multi_class='ovr')),
])

print("With negation handling: ")
skf = cross_validation.StratifiedKFold(sentiments, n_folds=5)
scores = cross_validation.cross_val_score(classifier, sentences, sentiments,
                                          cv=skf, scoring='f1')

# print("Without negation handling: ")
# vectorizer = CountVectorizer(tokenizer=None, ngram_range=(1, 1), max_df=0.5, lowercase=False)
# tfidfTans = TfidfTransformer(use_idf=True, sublinear_tf=True, smooth_idf=False, norm='l2')
                       header=None, low_memory=False)
df, _ = prep_data('')
print(df.describe())

print('=== linear regression ===')
regr = linear_model.LinearRegression()
print('r2 = %.2f' % cross_val_score(
    regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10, scoring='r2').mean())
print('rmse = %.2f' % np.sqrt(-1 * cross_val_score(
    regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10,
    scoring='neg_mean_squared_error')).mean())

regr = Pipeline([('trans', preprocessing.StandardScaler()),
                 ('regr', regr)])
print('r2 = %.2f' % cross_val_score(
    regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10, scoring='r2').mean())
print('rmse = %.2f' % np.sqrt(-1 * cross_val_score(
    regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10,
    scoring='neg_mean_squared_error')).mean())

print('=== ridge ===')
regr = linear_model.Ridge(alpha=.05)
print(cross_val_score(regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10).mean())

print('=== lasso ===')
regr = linear_model.Lasso(alpha=.05)
print(cross_val_score(regr, df.iloc[:, :-1], df.iloc[:, -1:], cv=10).mean())

print('=== Poly Linear ===')
from sklearn.model_selection import cross_validate
import itertools
from sklearn.model_selection import GridSearchCV

# enter file name of data you want to analyze
filename = ''
full_dataset = pd.read_pickle("./" + str(filename) + " SVD.pkl")
print(full_dataset.head())

X = full_dataset.drop(['X', 'Y', 'Z', 'F'], axis=1)
y = full_dataset['F']

scaler = StandardScaler()
mlp = MLPRegressor(early_stopping=True)
pipeline = Pipeline([('transformer', scaler), ('estimator', mlp)])

parameters = {
    'estimator__learning_rate': ['constant'],
    'estimator__learning_rate_init': [0.005],
    'estimator__hidden_layer_sizes': [
        x for x in itertools.product((20, 50, 100, 150, 200), repeat=2)],
    'estimator__activation': ['tanh', 'relu', 'logistic'],
    'estimator__max_iter': [5000],
    'estimator__batch_size': [20, 50, 100, 150, 200]
}

clf = GridSearchCV(pipeline, parameters, cv=5)
clf.fit(X, y)
print("Best parameter (CV score=%0.3f):" % clf.best_score_)
X_train = corpus_train_tfidf_kpca
X_test = corpus_test_tfidf_kpca
y_train = train_category
y_test = test_category

# Initialize K-Fold for cross validation
K = 5
kfold = KFold(n_splits=K, random_state=seed)

# Create Pipeline
estimators = []
estimators.append(('Normalizer', Normalizer()))
estimators.append(('knn_clf', KNeighborsClassifier()))
reg_knn_pipe1 = Pipeline(estimators)
reg_knn_pipe1.set_params(knn_clf__algorithm='ball_tree',
                         knn_clf__weights='uniform')

# Create a grid search over n_neighbors values
parameters = {'knn_clf__n_neighbors': np.arange(5, 50)}
estimator_knnreg = GridSearchCV(reg_knn_pipe1, parameters, cv=kfold)

# Evaluate the grid search and print the best estimator
print('Starting Grid Search')
estimator_knnreg.fit(X_train, y_train)

alphas = [x['knn_clf__n_neighbors'] for x in estimator_knnreg.cv_results_['params']]
means = [x for x in estimator_knnreg.cv_results_['mean_test_score']]
stds = [x for x in estimator_knnreg.cv_results_['std_test_score']]
                 column_params={'diag': {'top_n': 200, 'min_support': 0}})),
       ('imputer', Imputer(missing_values='NaN', strategy='median')),
       ('scaler', StandardScaler())]

model_stack = [
    fe1 + [('lr', LogisticRegression(class_weight="balanced"))],
    fe2 + [('rf', RandomForestClassifier(random_state=1, class_weight="balanced"))],
    fe2 + [('xgb', XGBClassifier(seed=1,
                                 scale_pos_weight=(1 / np.mean(ydata_train) - 1)))]
]
model_stack = [(m[-1][0], Pipeline(steps=m)) for m in model_stack]

# hyperparameter tuning for each model individually
ss = ShuffleSplit(n_splits=5, train_size=0.25, random_state=1)
tuning_constants = {
    'scoring': 'roc_auc',
    'cv': ss,
    'verbose': 1,
    'refit': False
}
grid_search_tuning_arg = tuning_constants.copy()
rand_search_tuning_arg = dict(tuning_constants, **{
    'random_state': 1,
    'n_iter': 20
})
tuning_types = {
    return stemmed


def tokenize(text):
    text = "".join([ch for ch in text if ch not in string.punctuation])
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems


# obtain stop words
stop_words = text.ENGLISH_STOP_WORDS

# define pipeline for tokenizing, feature extraction, feature selection,
# and soft SVC
parameters = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
for x in range(0, 7):
    text_clf = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize,
                                 stop_words=stop_words,
                                 analyzer='word')),
        ('tfidf', TfidfTransformer()),
        ('dimensionality_reduction', TruncatedSVD(n_components=50,
                                                  random_state=42)),
        ('clf', SGDClassifier(alpha=parameters[x]))])
    scores = cross_validation.cross_val_score(text_clf, data, target,
                                              cv=5, scoring='f1_weighted')
    print(scores)
class AgeBucket(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.DataFrame(X.Age // 15 * 15)


class RelativesOnboard(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.DataFrame(X.SibSp + X.Parch)


column_transformer = ColumnTransformer([
    ("Age_Bucket", AgeBucket(), ["Age"]),
    ("Relatives_On_board", RelativesOnboard(), ["SibSp", "Parch"]),
    ("one_hot_enc", OneHotEncoder(), ["Pclass", "Sex", "Embarked"])],
    remainder='passthrough')

preprocess_Pipeline = Pipeline([("col_trans", column_transformer),
                                ("imputer", SimpleImputer(strategy="median"))])

X_train = preprocess_Pipeline.fit_transform(
    train_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']])
y_train = train_data["Survived"]

for i in [SVC(gamma="auto"),
          RandomForestClassifier(n_estimators=100, random_state=42)]:
    i.fit(X_train, y_train)
    y_pred = i.predict(X_train)
    print(i, '\n', confusion_matrix(y_train, y_pred))
def construct_pipeline(selected_features, selected_classifier):
    feature_pipelines = construct_feature_pipelines(selected_features)
    return Pipeline([('features', FeatureUnion(feature_pipelines)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', classifier_dict[selected_classifier])])
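# A hedged usage sketch; construct_feature_pipelines and classifier_dict live
# elsewhere in this project, so the keys below ('bow', 'svm') are illustrative
# assumptions only:
#
#     pipe = construct_pipeline(selected_features=['bow'], selected_classifier='svm')
#     pipe.fit(train_docs, train_labels)
#     print(pipe.predict(test_docs))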
masker = NiftiMasker(mask_img=imag_mask, standardize=True,
                     memory="nilearn_cache", memory_level=5)
X = masker.fit_transform(dataset)
# Apply our condition_mask
X = X[condition_mask]

# PREDICTION FUNCTION
from sklearn.svm import SVC
svc = SVC(kernel='linear', max_iter=1000)

# FEATURE SELECTION
feature_selection = SelectKBest(f_classif, k=500)

anova_svc = Pipeline([('anova', feature_selection), ('svc', svc)])
anova_svc.fit(X, y)
# y_pred = anova_svc.predict(X)

##########################################################################
# NESTED CROSS VALIDATION
from sklearn.model_selection import GridSearchCV

k_range = [[15, 50, 150, 300], [500, 1000, 3000, 5000]]
# cv_scores = cross_val_score(anova_svc, X, conditions,)

# Print the results
def run_CV(params):
grid.fit(X_train_scaled, y_train)

print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))
print("Best set score: {:.2f}".format(grid.score(X_test_scaled, y_test)))
print("Best parameters: ", grid.best_params_)
# Best cross-validation accuracy: 0.98
# Best set score: 0.97
# Best parameters: {'C': 1, 'gamma': 1}

# ----------------------------------------------------------------------------------------------
# One of the correct approaches using pipelines
# ----------------------------------------------------------------------------------------------
from sklearn.pipeline import Pipeline

pipe = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC())])
pipe.fit(X_train, y_train)
print("Test score: {:.2f}".format(pipe.score(X_test, y_test)))
# Test score: 0.95 - same as in the initial example

# ----------------------------------------------------------------------------------------------
# Pipelines for grid searches
# ----------------------------------------------------------------------------------------------
# dictionary keys consist of the pipeline component name 'svm', a double
# underscore '__', and the parameter name 'C' or 'gamma'
param_grid = {'svm__C': [0.001, 0.01, 0.1, 1, 10, 100],
              'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100]}

grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))
        value['proportion_pay_to_salary'] = (total_payments - salary) / salary
    # If either value is NaN, set proportion_pay_to_salary to NaN
    else:
        value['proportion_pay_to_salary'] = 'NaN'

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
estimators = [('scaler', MinMaxScaler()),
              ('skb', SelectKBest()),
              ('gnb', GaussianNB())]
pipe = Pipeline(estimators)
param_grid = dict(skb__k=range(2, 7))

### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using tester.py
cv = StratifiedShuffleSplit(labels, test_size=0.3, random_state=42)
gs = GridSearchCV(pipe, param_grid=param_grid, cv=cv, scoring='f1_weighted')
gs.fit(features, labels)
clf = gs.best_estimator_
#%% Validation for Part 2
dims1 = [2, 4, 5, 7, 10, 15, 20, 22, 26]
grid = {
    'pca__n_components': dims1,
    'NN__alpha': nn_reg,
    'NN__hidden_layer_sizes': nn_arch
}
pca = PCA(random_state=5)
mlp = MLPClassifier(solver='lbfgs', activation='identity', max_iter=2000,
                    early_stopping=True, random_state=5)
pipe = Pipeline([('pca', pca), ('NN', mlp)])
gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

gs.fit(loans_X, loans_Y)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'Loan_dim_red_ownNN.csv')

dims2 = [2, 3, 4, 5, 6, 7, 8, 9, 10]
# dims2 = [2, 10]
grid = {'pca__n_components': dims2}
pca = PCA(random_state=5)
mlp = MLPClassifier(solver='lbfgs', activation='logistic', alpha=0.1,
                    hidden_layer_sizes=(50,), max_iter=2000,
print('=' * 80)
print("NearestCentroid (aka Rocchio classifier)")
results.append(benchmark(NearestCentroid()))

# Train sparse Naive Bayes classifiers
print('=' * 80)
print("Naive Bayes")
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(BernoulliNB(alpha=.01)))

print('=' * 80)
print("LinearSVC with L1-based feature selection")
# The smaller C, the stronger the regularization.
# The more regularization, the more sparsity.
results.append(benchmark(Pipeline([
    ('feature_selection', SelectFromModel(
        LinearSVC(penalty="l1", dual=False, tol=1e-3))),
    ('classification', LinearSVC(penalty="l2"))])))

# make some plots
indices = np.arange(len(results))

results = [[x[i] for x in results] for i in range(4)]

clf_names, score, training_time, test_time = results
training_time = np.array(training_time) / np.max(training_time)
test_time = np.array(test_time) / np.max(test_time)

plt.figure(figsize=(12, 8))
plt.title("Score")
plt.barh(indices, score, .2, label="score", color='navy')
# 1. Build a pipeline object that defines the order of the data-processing steps
"""
Ridge parameters:
    alpha=1.0: the λ from the slides, the regularization coefficient
    fit_intercept=True: whether the model fits an intercept term (default True)
    normalize=False: whether to normalize before training; usually left unchanged
    max_iter=None: maximum number of solver iterations; by default unlimited
    tol=1e-3: convergence tolerance; iteration stops once the change in the
        loss falls below this value
    solver="auto": the solver to use
RidgeCV:
    alphas: the range of alpha values to try
    cv: the number of cross-validation folds
"""
model = Pipeline(steps=[
    ('Poly', PolynomialFeatures()),                      # first step, named Poly
    ('Linear', RidgeCV(alphas=[0.1, 0.2, 0.3], cv=5))    # second step, named Linear
])

# 1.2 Set parameters on the Pipeline object
# Poly__degree: 'Poly' is the step name given when the Pipeline was defined,
# 'degree' is an attribute of that step's object; they are joined by two
# consecutive underscores.
model.set_params(Poly__degree=2)
model.set_params(Linear__normalize=True)

# 2. Train the model (the first step preprocesses the data, then the second
#    step fits the model). With n steps, the first n-1 steps perform
#    fit + transform, and the last step performs fit.
model.fit(X_train, Y_train)
"""
model.fit is equivalent to linear.fit(poly.fit_transform(x_train, y_train), y_train)
"""
print("Polynomial model: {}".format(model.get_params()['Poly']))
# (e.g., country code, profession, species, etc.), then one-hot encoding will result in a
# large number of input features. This may slow down training and degrade performance.
# If this happens, you will want to produce denser representations called embeddings,
# but this requires a good understanding of neural networks (see Chapter 14 for more details).
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

# As you can see, there are many data transformation steps
# that need to be executed in the right order. Fortunately,
# Scikit-Learn provides the Pipeline class to help with such sequences
# of transformations. Here is a small pipeline for the numerical attributes:

# The Pipeline constructor takes a list of name/estimator pairs defining a sequence of steps.
num_pipeline = Pipeline([
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# When you call the pipeline's fit() method, it calls fit_transform()
# sequentially on all transformers, passing the output of each call as
# the parameter to the next call, until it reaches the final estimator,
# for which it just calls the fit() method.
housing_num_tr = num_pipeline.fit_transform(housing_num)

num_attributes = list(housing_num)
cat_attributes = ["ocean_proximity"]

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attributes)),
    ('imputer', Imputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
class ModelTransformer(TransformerMixin):
    def __init__(self, model):
        self.model = model

    def fit(self, *args, **kwargs):
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        return pd.DataFrame(self.model.predict(X))


abstract_pipeline = Pipeline([
    ('extract_text', DataFrameGenericColumnExtractor('abstract')),
    ('count_vec', HashingVectorizer(analyzer="word", stop_words='english',
                                    n_features=1000, binary=False)),
    ('tfidf_vec', TfidfTransformer())
    # TfidfVectorizer(analyzer="word", stop_words='english', max_df=0.5,
    #                 max_features=1000, min_df=2, use_idf=True)
])

authors_pipeline = Pipeline([
    ('extract_text', DataFrameGenericColumnExtractor('keywords')),
    ('count_vec', HashingVectorizer(analyzer="word", stop_words='english',
                                    binary=False)),
    ('tfidf_vec', TfidfTransformer())
])

groups_pipeline = Pipeline([
    ('extract_text', DataFrameGenericColumnExtractor('groups')),
    ('count_vec', HashingVectorizer(analyzer="word", stop_words='english',
                                    binary=False)),
    ('tfidf_vec', TfidfTransformer())
])
def common_test_model_tfidf_vectorizer_pipeline_cls(self, kind=None, verbose=False):
    if kind == 'stop':
        if ort_version.startswith('1.4'):
            # regression with stopwords in onnxruntime 1.4
            stopwords = ['theh']
        else:
            stopwords = ['the', 'and', 'is']
    else:
        stopwords = None

    X_train = numpy.array([
        "This is the first document",
        "This document is the second document.",
        "And this is the third one",
        "Is this the first document?",
    ]).reshape((4, 1))
    y_train = numpy.array([0, 1, 0, 1])

    if kind is None:
        model_pipeline = Pipeline([
            ('vectorizer', TfidfVectorizer(stop_words=stopwords,
                                           lowercase=True, use_idf=True,
                                           ngram_range=(1, 3),
                                           max_features=30000)),
        ])
    elif kind == 'cls':
        model_pipeline = Pipeline([
            ('vectorizer', TfidfVectorizer(stop_words=stopwords,
                                           lowercase=True, use_idf=True,
                                           ngram_range=(1, 3),
                                           max_features=30000)),
            ('feature_selector', SelectKBest(k=10)),
            ('classifier', SVC(class_weight='balanced', kernel='rbf',
                               gamma='scale', probability=True)),
        ])
    elif kind == 'stop':
        model_pipeline = Pipeline([
            ('vectorizer', CountVectorizer(stop_words=stopwords,
                                           lowercase=True,
                                           ngram_range=(1, 2),
                                           max_features=30000)),
        ])
    elif kind == 'reg':
        model_pipeline = Pipeline([
            ('vectorizer', TfidfVectorizer(stop_words=stopwords,
                                           lowercase=True, use_idf=True,
                                           ngram_range=(1, 3),
                                           max_features=30000)),
            ('feature_selector', SelectKBest(k=10)),
            ('classifier', SVR(kernel='rbf', gamma='scale')),
        ])
    else:
        raise AssertionError(kind)

    model_pipeline.fit(X_train.ravel(), y_train)

    initial_type = [('input', StringTensorType([None, 1]))]
    model_onnx = convert_sklearn(model_pipeline, "cv",
                                 initial_types=initial_type,
                                 options={SVC: {'zipmap': False}})

    if kind in (None, 'stop'):
        exp = [model_pipeline.transform(X_train.ravel()).toarray()]
    elif kind == 'cls':
        exp = [model_pipeline.predict(X_train.ravel()),
               model_pipeline.predict_proba(X_train.ravel())]
    elif kind == 'reg':
        exp = [model_pipeline.predict(X_train.ravel()).reshape((-1, 1))]

    sess = InferenceSession(model_onnx.SerializeToString())
    got = sess.run(None, {'input': X_train})
    if verbose:
        voc = model_pipeline.steps[0][-1].vocabulary_
        voc = list(sorted([(v, k) for k, v in voc.items()]))
        for kv in voc:
            print(kv)
    for a, b in zip(exp, got):
        if verbose:
            print(stopwords)
            print(a)
            print(b)
        assert_almost_equal(a, b)
from sklearn.model_selection import GridSearchCV

df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases'
                 '/breast-cancer-wisconsin/wdbc.data', header=None)
X = df.loc[:, 2:].values
y = df.loc[:, 1].values
le = LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,
                                                    random_state=1)

pipe_svc = Pipeline([('sc1', StandardScaler()),
                     ('clf', SVC(random_state=1))])
param_range = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = [{'clf__C': param_range,
               'clf__kernel': ['linear']},
              {'clf__C': param_range,
               'clf__gamma': param_range,
               'clf__kernel': ['rbf']}]
gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=-1)
    splitters.append(convert_dataset.kfoldsplit(tagged_sents, k=k))

splits = []
for _ in range(k):
    train_sents = []
    eval_sents = []
    for splitter in splitters:
        train, eval = next(splitter)
        train_sents.extend(train)
        eval_sents.extend(eval)
    splits.append(
        transform_to_dataset(train_sents) + transform_to_dataset(eval_sents))

clf = Pipeline([('vectorizer', DictVectorizer(sparse=False)),
                ('classifier', LogisticRegression())])

start = datetime.now()
scores = np.array(
    Parallel(-1)(delayed(fit_and_score)(clf, *split) for split in splits))
end = datetime.now()

acc = "%0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)
print("Accuracy:", acc)

timestamp = start.isoformat(" ", "seconds")
commit_id = subprocess.run(
    "git rev-parse --short HEAD".split(" "),
    capture_output=True).stdout.decode("utf-8").strip()
os.makedirs("results", exist_ok=True)
data_raw = pd.read_csv(r'C:\Users\Mohit\Desktop\web\ML\ML_Project_1\ML Project 1\housing_data.csv')

# data_raw.hist(bins=50, figsize=(20, 15))
# plt.show()
# print(data_raw.info())
# print(data_raw.head())

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data_raw, data_raw['CHAS']):
    strat_train_set = data_raw.loc[train_index]
    strat_test_set = data_raw.loc[test_index]

my_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

strat_train_set_temp = strat_train_set.drop('MEDV', axis=1)
some_data = strat_train_set_temp.iloc[:5]
housing_tr = my_pipeline.fit_transform(strat_train_set_temp)
housing_labels = strat_train_set['MEDV'].copy()
some_labels = housing_labels.iloc[:5]
prepared_dt = my_pipeline.transform(some_data)

model = LinearRegression()
model.fit(housing_tr, housing_labels)
predicted_labels = model.predict(prepared_dt)
print(list(some_labels), predicted_labels)