help='L1 or L2 penalization.') parser.add_argument('C', help='Regularization parameter.') parser.add_argument('fit_intercept', help='Whether to include a constant bias term in the loss function.') parser.add_argument('n_components_pca', help='n_components for PCA.') parser.add_argument('model_fname', help='Absolute path to pickle the fitted CvModel.') args = parser.parse_args() n_components_pca = None if args.n_components_pca == 'None' else int(args.n_components_pca) Id, X, y = extract_training_data('/nfs/raid13/babar/dchao/KaggleCS155/data/kaggle_train_tf_idf.csv') # trans/clf specs n_folds = 5 scaler = StandardScaler() pca = PCA(n_components=n_components_pca, whiten=True) trans = Pipeline([('scale_center', scaler),('pca', pca)]).fit(X) clf = LogisticRegression( penalty = args.penalty, C = float(args.C), fit_intercept = bool(args.fit_intercept), class_weight = 'auto' ) cv_clf = CvModel(n_folds, trans, clf) cv_clf.fit(X, y) joblib.dump(cv_clf, args.model_fname)
from sklearn.externals import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('n_estimators',
                        help='n_estimators of RandomForestClassifier.')
    parser.add_argument(
        'max_features',
        help='Number of features to split for RandomForestClassifier.')
    parser.add_argument('model_fname',
                        help='Absolute path to pickle the fitted CvModel.')
    args = parser.parse_args()

    Id, X, y = extract_training_data(
        '/nfs/raid13/babar/dchao/KaggleCS155/data/kaggle_train_tf_idf.csv')

    # trans/clf specs
    n_folds = 5
    standardizer = StandardScaler().fit(X)
    forest = RandomForestClassifier(
        n_estimators=int(args.n_estimators),
        max_features=int(args.max_features))

    # Samples whose label is nonzero get triple weight during fitting.
    weights = [1. if label == 0 else 3. for label in y]

    cv_clf = CvModel(n_folds, standardizer, forest)
    cv_clf.fit(X, y, weights)
    joblib.dump(cv_clf, args.model_fname)
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('n_estimators',
                        help='n_estimators of AdaBoostClassifier.')
    parser.add_argument(
        'max_depth',
        help='max_depth of base estimator DecisionTreeClassifier.')
    parser.add_argument('learning_rate', help='learning_rate of each tree.')
    parser.add_argument('model_fname',
                        help='Absolute path to pickle the fitted CvModel.')
    args = parser.parse_args()

    Id, X, y = extract_training_data(
        '/nfs/raid13/babar/dchao/KaggleCS155/data/kaggle_train_tf_idf.csv')

    # trans/clf specs
    n_folds = 5
    standardizer = StandardScaler().fit(X)

    # Shallow decision trees serve as the weak learners for AdaBoost.
    weak_learner = DecisionTreeClassifier(max_depth=int(args.max_depth))
    booster = AdaBoostClassifier(
        base_estimator=weak_learner,
        n_estimators=int(args.n_estimators),
        learning_rate=float(args.learning_rate))

    cv_clf = CvModel(n_folds, standardizer, booster)
    cv_clf.fit(X, y)
    joblib.dump(cv_clf, args.model_fname)
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('n_estimators',
                        help='n_estimators of RandomForestClassifier.')
    parser.add_argument(
        'max_features',
        help='Number of features to split for RandomForestClassifier.')
    parser.add_argument(
        'C', help='Regularization parameter for L1 feature selection.')
    parser.add_argument('model_fname',
                        help='Absolute path to pickle the fitted CvModel.')
    args = parser.parse_args()

    Id, X, y = extract_training_data(
        '/nfs/raid13/babar/dchao/KaggleCS155/data/kaggle_train_tf_idf.csv')

    # trans/clf specs
    n_folds = 5
    scaler = StandardScaler()
    # L1-penalized linear SVM used as a sparse feature selector.
    lasso = LinearSVC(C=float(args.C), penalty='l1', dual=False)
    # NOTE(review): fit_transform returns the transformed DATA matrix, not a
    # fitted transformer, so `trans` below is an array -- unlike the sibling
    # scripts, which pass a fitted transformer (StandardScaler/Pipeline) as
    # CvModel's second argument. Confirm CvModel accepts this together with
    # input_trans=False; otherwise this was likely meant to be .fit(X, y).
    trans = Pipeline([('center_scale', scaler),
                      ('feature_selection', lasso)]).fit_transform(X, y)
    clf = RandomForestClassifier(n_estimators=int(args.n_estimators),
                                 max_features=int(args.max_features))
    cv_clf = CvModel(n_folds, trans, clf)
    # NOTE(review): input_trans=False presumably tells CvModel.fit not to
    # apply a transform to X before fitting -- verify against CvModel's
    # signature, since X here is the raw (untransformed) training matrix.
    cv_clf.fit(X, y, input_trans=False)
    joblib.dump(cv_clf, args.model_fname)