def test_feature_union_parallel():
    """FeatureUnion must produce the same matrix with and without ``n_jobs``."""
    docs = JUNK_FOOD_DOCS

    def build_union(**options):
        # New vectorizers on every call so no fitted state is shared.
        steps = [
            ("words", CountVectorizer(analyzer="word")),
            ("chars", CountVectorizer(analyzer="char")),
        ]
        return FeatureUnion(steps, **options)

    serial = build_union()
    parallel = build_union(n_jobs=2)
    parallel_ft = build_union(n_jobs=2)

    # Reference result from the single-process union.
    serial.fit(docs)
    reference = serial.transform(docs)
    assert_equal(reference.shape[0], len(docs))

    # fit + transform with n_jobs=2
    parallel.fit(docs)
    from_parallel = parallel.transform(docs)
    assert_equal(reference.shape, from_parallel.shape)
    assert_array_equal(reference.toarray(), from_parallel.toarray())

    # fit_transform should behave the same
    from_fit_transform = parallel_ft.fit_transform(docs)
    assert_array_equal(reference.toarray(), from_fit_transform.toarray())

    # transformers should stay fit after fit_transform
    from_refit = parallel_ft.transform(docs)
    assert_array_equal(reference.toarray(), from_refit.toarray())
def test_set_feature_union_steps():
    """Swapping FeatureUnion steps by attribute or ``set_params`` takes effect."""

    def labelled_mult(factor):
        # Mult instance that reports the single feature name "x<factor>".
        step = Mult(factor)
        step.get_feature_names = lambda: ["x%d" % factor]
        return step

    mult2 = labelled_mult(2)
    mult3 = labelled_mult(3)
    mult5 = labelled_mult(5)

    union = FeatureUnion([("m2", mult2), ("m3", mult3)])

    def check(values, names):
        # Transform the 1x1 input [[1]] and compare output and feature names.
        assert_array_equal(values, union.transform(np.asarray([[1]])))
        assert_equal(names, union.get_feature_names())

    check([[2, 3]], ["m2__x2", "m3__x3"])

    # Directly setting attr
    union.transformer_list = [("m5", mult5)]
    check([[5]], ["m5__x5"])

    # Using set_params
    union.set_params(transformer_list=[("mock", mult3)])
    check([[3]], ["mock__x3"])

    # Using set_params to replace single step
    union.set_params(mock=mult5)
    check([[5]], ["mock__x5"])
def test_feature_union(self):
    """Smoke test: several featurizers combined in a FeatureUnion fit and transform."""
    module_names = ["bag-of-words", "entities"]
    # modules_to_dictionary returns a pair; only its first element is used here.
    steps, _unused = modules_to_dictionary(module_names)

    union = FeatureUnion(steps)
    union.fit(texts_entities, outcomes)

    # No assertion: the call only has to succeed on this input.
    union.transform(["unknown"])
def test_set_feature_union_steps():
    """Replacing the steps of a FeatureUnion must change transform and feature names."""

    def make_step(factor):
        # Mult(factor) whose get_feature_names reports 'x<factor>'.
        mult = Mult(factor)
        mult.get_feature_names = lambda: ['x{}'.format(factor)]
        return mult

    mult2, mult3, mult5 = make_step(2), make_step(3), make_step(5)
    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])

    def expect(values, names):
        assert_array_equal(values, ft.transform(np.asarray([[1]])))
        assert_equal(names, ft.get_feature_names())

    expect([[2, 3]], ['m2__x2', 'm3__x3'])

    # Directly setting attr
    ft.transformer_list = [('m5', mult5)]
    expect([[5]], ['m5__x5'])

    # Using set_params
    ft.set_params(transformer_list=[('mock', mult3)])
    expect([[3]], ['mock__x3'])

    # Using set_params to replace single step
    ft.set_params(mock=mult5)
    expect([[5]], ['mock__x5'])
def pca(x, y, test_x, n_features=-1):
    """Reduce ``x`` and ``test_x`` with a PCA + univariate-selection union.

    Parameters
    ----------
    x : training feature matrix; the union is fitted on it.
    y : training targets, passed to the union's ``fit``.
    test_x : feature matrix transformed with the union fitted on ``x``.
    n_features : number of PCA components. ``-1`` (the default) means
        ``ceil(sqrt(x.shape[1]))``.

    Returns
    -------
    tuple
        ``(transformed_x, transformed_test_x)``.
    """
    if n_features == -1:
        n_features = int(np.ceil(np.sqrt(x.shape[1])))

    reducer = PCA(n_components=n_features)
    # Floor division keeps ``k`` an integer; ``/`` yields a float on Python 3,
    # which SelectKBest does not accept as a feature count.
    selection = SelectKBest(k=n_features // 2)

    combined_features = FeatureUnion([("pca", reducer), ("univ_select", selection)])
    combined_features.fit(x, y)
    return combined_features.transform(x), combined_features.transform(test_x)
def prediction(train_df, test_df, MODEL):
    """Fit a grid-searched model on ``train_df`` and write a submission file.

    Parameters
    ----------
    train_df : DataFrame with a "Sales" column (the target).
    test_df : DataFrame with an "Id" column used as the submission index.
    MODEL : key into ``clf_dict`` selecting the estimator and its grid; also
        used in the output file names.

    Side effects: prints progress and the best CV score, and writes
    ``coef_<MODEL>.csv`` / ``importance_<MODEL>.csv`` (when the best estimator
    exposes the matching attribute) and ``submission_<MODEL>.csv`` under
    ``SUBMISSION``.
    """
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("... start prediction")
    fu_obj = FeatureUnion(transformer_list=features.feature_list)
    train_X = fu_obj.fit_transform(train_df)
    # ``.values`` replaces ``Series.as_matrix()``, which pandas has removed.
    train_y = train_df["Sales"].values

    clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                       param_grid=clf_dict[MODEL]["paramteters"],
                       n_jobs=3, scoring=rmspe, verbose=1)
    clf.fit(train_X, train_y)
    print(clf.best_score_)

    # Dump per-feature weights for whichever attribute the model exposes.
    index_sr = pd.Series(get_split_feature_list(fu_obj), name="Feature")
    weight_dumps = (
        ("coef_", "Coef", "coef_%s.csv"),
        ("feature_importances_", "Importance", "importance_%s.csv"),
    )
    for attr, column, file_pattern in weight_dumps:
        if hasattr(clf.best_estimator_, attr):
            coef_sr = pd.Series(getattr(clf.best_estimator_, attr), name=column)
            coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
            coef_df.to_csv(SUBMISSION + file_pattern % MODEL)

    print("... start y_pred")
    test_X = fu_obj.transform(test_df)
    y_pred = clf.predict(test_X)
    pred_sr = pd.Series(y_pred, name="Sales", index=test_df["Id"])
    submissionfile = SUBMISSION + "submission_%s.csv" % MODEL
    pred_sr.to_csv(submissionfile, header=True, index_label="ID")
def test_feature_union_weights():
    """FeatureUnion applies ``transformer_weights`` in fit/transform and fit_transform."""
    iris = load_iris()
    X, y = iris.data, iris.target
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)

    # test using fit followed by transform
    union = FeatureUnion([("pca", pca), ("select", select)],
                         transformer_weights={"pca": 10})
    union.fit(X, y)
    via_transform = union.transform(X)

    # test using fit_transform
    union = FeatureUnion([("pca", pca), ("select", select)],
                         transformer_weights={"pca": 10})
    via_fit_transform = union.fit_transform(X, y)

    # test it works with transformers missing fit_transform
    union = FeatureUnion([("mock", TransfT()), ("pca", pca), ("select", select)],
                         transformer_weights={"mock": 10})
    with_mock = union.fit_transform(X, y)

    # check against expected result
    # pca is refit for each comparison; its fixed random_state keeps that reproducible
    for output in (via_transform, via_fit_transform):
        assert_array_almost_equal(output[:, :-1], 10 * pca.fit_transform(X))
        assert_array_equal(output[:, -1], select.fit_transform(X, y).ravel())
    assert_equal(with_mock.shape, (X.shape[0], 7))
def test_feature_union():
    """Basic sanity checks for FeatureUnion on the centred iris data."""
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target

    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)

    union = FeatureUnion([("svd", svd), ("select", select)])
    union.fit(X, y)
    dense_out = union.transform(X)
    assert_equal(dense_out.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(dense_out[:, :-1], svd.fit_transform(X))
    assert_array_equal(dense_out[:, -1], select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # svd is refit here; its fixed random_state keeps the result reproducible
    union = FeatureUnion([("svd", svd), ("select", select)])
    sparse_out = union.fit_transform(sparse.csr_matrix(X), y)
    assert_array_almost_equal(dense_out, sparse_out.toarray())

    # test setting parameters
    union.set_params(select__k=2)
    assert_equal(union.fit_transform(X, y).shape, (X.shape[0], 4))

    # test it works with transformers missing fit_transform
    union = FeatureUnion([("mock", TransfT()), ("svd", svd), ("select", select)])
    with_mock = union.fit_transform(X, y)
    assert_equal(with_mock.shape, (X.shape[0], 8))
def test_feature_stacker():
    """Basic sanity check for the feature stacker (FeatureUnion) on iris."""
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target

    # Seeded: the PCA is refit below and compared with the union's output, so
    # an unseeded randomized solver makes the comparison depend on global RNG
    # state.
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("pca", pca), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # test setting parameters
    fs.set_params(select__k=2)
    assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4))
def test_same_result(self):
    """Local FeatureUnion and SparkFeatureUnion must agree on names and output."""
    X, Z = self.make_text_rdd(2)

    char_options = dict(analyzer="char_wb", ngram_range=(3, 3))
    loc_char = CountVectorizer(**char_options)
    dist_char = SparkCountVectorizer(**char_options)
    loc_word = CountVectorizer(analyzer="word")
    dist_word = SparkCountVectorizer(analyzer="word")

    def stacked(distributed):
        # Collect the distributed blocks and stack them into one sparse matrix.
        return sp.vstack(distributed.collect())

    def assert_same(local_matrix, stacked_matrix):
        assert_array_equal(local_matrix.toarray(), stacked_matrix.toarray())

    loc_union = FeatureUnion([("chars", loc_char), ("words", loc_word)])
    dist_union = SparkFeatureUnion([("chars", dist_char), ("words", dist_word)])

    # test same feature names
    loc_union.fit(X)
    dist_union.fit(Z)
    assert_equal(loc_union.get_feature_names(),
                 dist_union.get_feature_names())

    # test same results
    local_out = loc_union.transform(X)
    dist_out = stacked(dist_union.transform(Z))
    assert_same(local_out, dist_out)

    # test same results with fit_transform
    local_out = loc_union.fit_transform(X)
    dist_out = stacked(dist_union.fit_transform(Z))
    assert_same(local_out, dist_out)

    # test same results in parallel
    loc_union_par = FeatureUnion(
        [("chars", loc_char), ("words", loc_word)], n_jobs=2)
    dist_union_par = SparkFeatureUnion(
        [("chars", dist_char), ("words", dist_word)], n_jobs=2)
    loc_union_par.fit(X)
    dist_union_par.fit(Z)
    local_out = loc_union_par.transform(X)
    dist_out = stacked(dist_union_par.transform(Z))
    assert_same(local_out, dist_out)
class q5_feature_UNION(BaseEstimator, RegressorMixin, TransformerMixin):
    """Average the outputs of three sub-models combined in a FeatureUnion.

    The union is built from ``q2_mlm_KNN``, ``q3_mlm_RIDGE`` and
    ``q4_mlm_RIDGE`` instances.
    NOTE(review): no ``fit`` is defined in this block, and nothing here fits
    the union; confirm the sub-models are usable without fitting.
    """

    def __init__(self):
        self.q5_feature_UNION = FeatureUnion([
            ('q2_mlm_KNN', q2_mlm_KNN()),
            ('q3_mlm_RIDGE', q3_mlm_RIDGE()),
            ('q4_mlm_RIDGE', q4_mlm_RIDGE()),
        ])

    def transform(self, X):
        """Return the mean of all union outputs for ``X`` as a Python scalar."""
        model_union = self.q5_feature_UNION.transform(X)
        # ``.item()`` replaces ``np.asscalar``, which NumPy deprecated and
        # later removed; both convert the NumPy scalar to a Python scalar.
        prediction = np.average(model_union).item()
        return prediction
def fit_logreg(self):
    """Fit a logistic regression on word-vector features of the training docs.

    Builds a cached tokenisation pipeline, wraps the single active feature
    extractor (normalised average of vectors loaded from
    ``data/google/GoogleNews-vectors-negative300.pickle``) in a FeatureUnion,
    and fits ``LogisticRegression`` on the transformed ``self.train_docs``.
    The commented-out entries are alternative feature extractors that are
    currently disabled.

    Returns a ``(name, estimator)`` pair: ``name`` is ``'logreg(<features>)'``
    listing the active feature names, and ``estimator`` is a Pipeline of the
    feature union followed by the fitted classifier.
    """
    # Tokenise, normalise special tokens and elongations; wrapped with self.memory.
    tokenize_sense = CachedFitTransform(Pipeline([
        ('tokenize', Map(compose(tokenize, normalize_special, unescape))),
        ('normalize', MapTokens(normalize_elongations)),
    ]), self.memory)

    features = FeatureUnion([
        # ('w2v_doc', ToCorporas(Pipeline([
        #     ('tokenize', MapCorporas(tokenize_sense)),
        #     ('feature', MergeSliceCorporas(Doc2VecTransform(CachedFitTransform(Doc2Vec(
        #         dm=0, dbow_words=1, size=100, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20,
        #         workers=16
        #     ), self.memory)))),
        # ]).fit([self.train_docs, self.unsup_docs[:10**6], self.val_docs, self.test_docs]))),
        # ('w2v_word_avg', Pipeline([
        #     ('tokenize', tokenize_sense),
        #     ('feature', Word2VecAverage(CachedFitTransform(Word2Vec(
        #         sg=1, size=100, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20, workers=16
        #     ), self.memory))),
        # ]).fit(self.unsup_docs[:10**6])),
        # ('w2v_word_avg_google', Pipeline([
        #     ('tokenize', tokenize_sense),
        #     ('feature', Word2VecAverage(joblib.load('data/google/GoogleNews-vectors-negative300.pickle'))),
        # ])),
        # ('w2v_word_norm_avg', Pipeline([
        #     ('tokenize', tokenize_sense),
        #     ('feature', Word2VecNormAverage(CachedFitTransform(Word2Vec(
        #         sg=1, size=100, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20, workers=16
        #     ), self.memory))),
        # ]).fit(self.unsup_docs[:10**6])),
        ('w2v_word_norm_avg_google', Pipeline([
            ('tokenize', tokenize_sense),
            ('feature', Word2VecNormAverage(joblib.load('data/google/GoogleNews-vectors-negative300.pickle'))),
        ])),
        # ('w2v_word_max', Pipeline([
        #     ('tokenize', tokenize_sense),
        #     ('feature', Word2VecMax(CachedFitTransform(Word2Vec(
        #         sg=1, size=100, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20, workers=16
        #     ), self.memory))),
        # ]).fit(self.unsup_docs[:10**6])),
        # ('w2v_word_max_google', Pipeline([
        #     ('tokenize', tokenize_sense),
        #     ('feature', Word2VecMax(joblib.load('data/google/GoogleNews-vectors-negative300.pickle'))),
        # ])),
        # ('w2v_word_inv', ToCorporas(Pipeline([
        #     ('tokenize', MapCorporas(tokenize_sense)),
        #     ('feature', MergeSliceCorporas(Word2VecInverse(CachedFitTransform(Word2Vec(
        #         sg=1, size=100, window=10, hs=0, negative=5, sample=0, min_count=1, iter=20, workers=16
        #     ), self.memory)))),
        # ]).fit([self.train_docs, self.unsup_docs[:10**5], self.val_docs, self.test_docs]))),
    ])

    classifier = LogisticRegression()
    # The union is only transformed here, never fitted in this method.
    # NOTE(review): presumably the active steps need no fitting; confirm.
    with temp_log_level({'gensim.models.word2vec': logging.INFO}):
        classifier.fit(features.transform(self.train_docs), self.train_labels())
    estimator = Pipeline([('features', features), ('classifier', classifier)])
    return 'logreg({})'.format(','.join(name for name, _ in features.transformer_list)), estimator
def prediction(train_df, test_df, MODEL):
    """Fit a grid-searched model on log-transformed sales and write a submission.

    Parameters
    ----------
    train_df : DataFrame with "Open" and "Sales" columns; only rows with
        ``Open == 1`` and ``Sales > 0`` are used.
    test_df : DataFrame with an "Id" column used as the submission index.
    MODEL : key into ``clf_dict``; ``"XGB"`` additionally holds out 5% of the
        training rows as an early-stopping evaluation set.

    Side effects: prints progress, and writes ``train_dump.csv``,
    ``test_dump.csv``, optional ``coef_<MODEL>.csv`` /
    ``importance_<MODEL>.csv`` and ``submission_<MODEL>.csv`` under
    ``SUBMISSION``. Predictions are mapped back with ``expm1`` because the
    target is trained as ``log1p(Sales)``.
    """
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("... start prediction")
    fu_obj = FeatureUnion(transformer_list=features.feature_list)
    train_df = train_df[(train_df["Open"] == 1) & (train_df["Sales"] > 0)]
    train_X = fu_obj.fit_transform(train_df)
    # ``.values`` replaces ``Series.as_matrix()``, which pandas has removed.
    train_y = np.log1p(train_df["Sales"]).values

    # Feature names of the fitted union, looked up once and reused below.
    feature_names = get_split_feature_list(fu_obj)

    # Dump the training matrix, dropping rows that contain NaN.
    train_dump_df = pd.DataFrame(train_X, columns=feature_names)
    train_dump_df["target"] = train_y
    train_dump_df = train_dump_df.dropna(axis=0)
    print(train_dump_df.shape)
    train_X = train_dump_df[feature_names].values
    train_y = train_dump_df["target"].values
    train_dump_df["ID"] = -1
    train_dump_df.to_csv(SUBMISSION + "train_dump.csv", index=False)

    # Dump the test matrix and report the number of zeros per column.
    test_X = fu_obj.transform(test_df)
    test_dump_df = pd.DataFrame(test_X, columns=feature_names)
    print((test_dump_df == 0).sum(axis=0))
    test_dump_df["ID"] = test_df["Id"]
    test_dump_df.to_csv(SUBMISSION + "test_dump.csv", index=False)

    search_options = {}
    if MODEL == "XGB":
        train_X, valid_X, train_y, valid_y = \
            train_test_split(train_X, train_y, test_size=0.05)
        search_options["fit_params"] = {
            "eval_set": [(train_X, train_y), (valid_X, valid_y)],
            "eval_metric": rmspe_xg,
            "early_stopping_rounds": 100,
        }
    clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                       param_grid=clf_dict[MODEL]["paramteters"],
                       n_jobs=3, scoring=rmspe, verbose=1,
                       **search_options)
    clf.fit(train_X, train_y)
    print(clf.best_score_)

    # Dump per-feature weights for whichever attribute the model exposes.
    index_sr = pd.Series(feature_names, name="Feature")
    weight_dumps = (
        ("coef_", "Coef", "coef_%s.csv"),
        ("feature_importances_", "Importance", "importance_%s.csv"),
    )
    for attr, column, file_pattern in weight_dumps:
        if hasattr(clf.best_estimator_, attr):
            coef_sr = pd.Series(getattr(clf.best_estimator_, attr), name=column)
            coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
            coef_df.to_csv(SUBMISSION + file_pattern % MODEL)

    print("... start y_pred")
    y_pred = np.expm1(clf.predict(test_X))
    pred_sr = pd.Series(y_pred, name="Sales", index=test_df["Id"])
    submissionfile = SUBMISSION + "submission_%s.csv" % MODEL
    pred_sr.to_csv(submissionfile, header=True, index_label="ID")
class MuscleClassifier():
    """Text classifier mapping exercise descriptions to muscle groups.

    Combines character and word n-gram count features in a FeatureUnion and
    classifies them with a Bernoulli naive Bayes model.
    """

    def __init__(self, auto_load=True):
        """Initialize the classifier.

        With ``auto_load`` the model, label encoder and vectorizer are
        restored from ``modules/pickled/``; otherwise a fresh, untrained model
        is created and ``train`` must be called before ``predict``.
        """
        #=====[ If auto_load, then we rehydrate our existing models ]=====
        if auto_load:
            self.model = self._load_pickle('modules/pickled/muscle_classifier.p')
            self.le = self._load_pickle('modules/pickled/muscle_classifier_le.p')
            self.vectorizer = self._load_pickle('modules/pickled/muscle_classifier_vectorizer.p')
        else:
            self.model = BernoulliNB()
            # Set by train(); defined here so the attributes always exist.
            self.le = None
            self.vectorizer = None

    @staticmethod
    def _load_pickle(path):
        """Unpickle ``path``, opening it in binary mode and closing it afterwards."""
        # NOTE: pickle executes arbitrary code on load; only use trusted files.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def train(self, muscle_groups, labels):
        """Vectorize the raw input, fit the classifier and print training accuracy."""
        #=====[ Instantiate label encoder to turn text labels into ints ]=====
        self.le = preprocessing.LabelEncoder()

        #=====[ Declare vectorizers and merge them via a FeatureUnion ]=====
        char_vzr = feature_extraction.text.CountVectorizer(
            lowercase=True, ngram_range=(3, 8), analyzer='char', encoding='utf-8')
        word_vzr = feature_extraction.text.CountVectorizer(
            lowercase=True, ngram_range=(1, 5), analyzer='word', encoding='utf-8')
        self.vectorizer = FeatureUnion([('char', char_vzr), ('word', word_vzr)])

        #=====[ Transform our input and labels ]=====
        X = self.vectorizer.fit_transform(muscle_groups).toarray()
        Y = self.le.fit_transform(labels)

        #=====[ Fit our model and then run inference on training data ]=====
        self.model.fit(X, Y)
        y = self.model.predict(X)

        #=====[ Report Training Accuracy ]=====
        # Accuracy is the share of matching predictions; the previous
        # ``y != Y`` reported the error rate under the "Accuracy" label.
        print("Training Accuracy: %f " % (sum(y == Y) / float(len(Y))))

    def predict(self, exercises):
        """Vectorize ``exercises`` and return the predicted muscle-group labels."""
        X = self.vectorizer.transform(exercises).toarray()
        y = self.model.predict(X)
        return self.le.classes_[y]
def test_feature_stacker_weights():
    """Feature stacker (FeatureUnion) applies ``transformer_weights``."""
    iris = load_iris()
    X = iris.data
    y = iris.target

    # Seeded: the PCA is refit below and compared with the union's output, so
    # an unseeded randomized solver makes the comparison depend on global RNG
    # state.
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)

    # check against expected result
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
def test_feature_union():
    """Sanity checks for FeatureUnion: output, sparse input, clone, params, errors."""
    iris = load_iris()
    X = iris.data
    X -= X.mean(axis=0)
    y = iris.target

    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)

    union = FeatureUnion([("svd", svd), ("select", select)])
    union.fit(X, y)
    dense_out = union.transform(X)
    assert_equal(dense_out.shape, (X.shape[0], 3))

    # check if it does the expected thing
    assert_array_almost_equal(dense_out[:, :-1], svd.fit_transform(X))
    assert_array_equal(dense_out[:, -1], select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # svd is refit here; its fixed random_state keeps the result reproducible
    union = FeatureUnion([("svd", svd), ("select", select)])
    sparse_out = union.fit_transform(sparse.csr_matrix(X), y)
    assert_array_almost_equal(dense_out, sparse_out.toarray())

    # Test clone: must not warn, and must copy the sub-estimators
    cloned = assert_no_warnings(clone, union)
    assert_false(union.transformer_list[0][1] is cloned.transformer_list[0][1])

    # test setting parameters
    union.set_params(select__k=2)
    assert_equal(union.fit_transform(X, y).shape, (X.shape[0], 4))

    # test it works with transformers missing fit_transform
    union = FeatureUnion([("mock", Transf()), ("svd", svd), ("select", select)])
    with_mock = union.fit_transform(X, y)
    assert_equal(with_mock.shape, (X.shape[0], 8))

    # test error if some elements do not support transform
    invalid_steps = [("transform", Transf()), ("no_transform", NoTrans())]
    assert_raises_regex(TypeError,
                        'All estimators should implement fit and '
                        'transform.*\\bNoTrans\\b',
                        FeatureUnion,
                        invalid_steps)

    # test that init accepts tuples
    union = FeatureUnion((("svd", svd), ("select", select)))
    union.fit(X, y)
def test_reference_plusplus_legacy(self):
    """Two-scale legacy V1Like features must match the stored reference matrix."""
    image_list = []
    for index in range(10):
        image_list.append('./v1like_ref/sample_{}.png'.format(index))
    reference = loadmat('./v1like_ref/reference_v1like_result_plusplus.mat')
    reference_result = reference['feature_matrix']

    # seems that FeatureUnion's X can't be a iterator. must be a true array.
    X = [imread(imagename) for imagename in image_list]

    scale_1 = v1like.V1Like(pars_baseline='simple_plus',
                            legacy=True, debug=debug)
    scale_2 = v1like.V1Like(pars_baseline='simple_plusplus_2nd_scale',
                            legacy=True, debug=debug)
    extractor = FeatureUnion([('scale_1', scale_1), ('scale_2', scale_2)])

    with Timer('simple_plusplus legacy version'):
        result_legacy = extractor.transform(X)

    self.assertEqual(reference_result.dtype, result_legacy.dtype)
    self.assertEqual(reference_result.shape, result_legacy.shape)
    if debug:
        # Largest absolute deviation from the reference.
        deviation = abs(reference_result[:, :] - result_legacy[:, :])
        print(deviation.max())
    self.assertTrue(np.allclose(reference_result, result_legacy, atol=tol))
def test_feature_union_weights():
    """FeatureUnion scales a transformer's output by its ``transformer_weights`` entry."""
    iris = load_iris()
    X, y = iris.data, iris.target

    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    steps = [("pca", pca), ("select", select)]
    union = FeatureUnion(steps, transformer_weights={"pca": 10})
    union.fit(X, y)
    weighted = union.transform(X)

    # check against expected result
    # pca is refit here; its fixed random_state keeps the result reproducible
    expected_pca = 10 * pca.fit_transform(X)
    assert_array_almost_equal(weighted[:, :-1], expected_pca)
    expected_select = select.fit_transform(X, y).ravel()
    assert_array_equal(weighted[:, -1], expected_select)
def build_pipeline():
    """Train, evaluate and package a TF-IDF + naive Bayes text classifier.

    Loads the train/test split from ``get_training_data()``, fits a feature
    union (item selector -> TF-IDF -> best 1000 features), grid-searches the
    ``alpha`` of a ``MultinomialNB`` on those features, evaluates on the test
    split and saves a ``PackagedPipeline`` with the metrics as metadata.
    """
    x_train, x_test, y_train, y_test = get_training_data()

    tfidf = TfidfVectorizer()
    feature_union = FeatureUnion(
        transformer_list=[
            ('x', Pipeline([
                ('selector', ItemSelector(key='x')),
                ('tfidf', tfidf),
                ('best', SelectKBest(k=1000)),
            ])),
        ])
    X_features = feature_union.fit(x_train, y_train).transform(x_train)

    # The search runs over a bare MultinomialNB, so grid keys must be that
    # estimator's own parameters. The previous 'univ_select__k' and
    # 'mnb__alpha' keys addressed pipeline steps that do not exist on it,
    # which makes GridSearchCV.fit raise an invalid-parameter error.
    param_grid = dict(alpha=[0.01, 0.1, 1.0])
    grid = GridSearchCV(MultinomialNB(), param_grid=param_grid)
    grid.fit(X_features, y_train)
    c = grid.best_estimator_

    X_test = feature_union.transform(x_test)
    pred = np.array(c.predict(X_test))
    # Score of the second class column, used as the ranking score for ROC.
    pred_proba = np.array([a[1] for a in c.predict_proba(X_test)])

    # Ground truth for the test predictions is y_test; ``actual`` was never
    # defined in this function.
    precision, recall, fscore, support = precision_recall_fscore_support(y_test, pred)
    # ROC needs continuous scores; hard labels collapse the curve to one point.
    fpr, tpr, _ = roc_curve(y_test, pred_proba)
    auc_score = auc(fpr, tpr)

    now = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    metadata = {
        'pipeline': str(grid.best_estimator_),
        'created_at': now,
        'git_hash': 0,
        'precision': [float(p) for p in precision],
        'recall': [float(r) for r in recall],
        'fscore': [float(f) for f in fscore],
        'support': [int(s) for s in support],
        'auc': auc_score,
    }
    p = PackagedPipeline(pipeline=grid.best_estimator_,
                         feature_union=feature_union,
                         metadata=metadata,
                         x_train=x_train, y_train=y_train,
                         x_test=x_test, y_test=y_test)
    p.save()
def validation_model(df, MODEL):
    """Fit on the non-validation rows of ``df`` and dump hold-out predictions.

    Parameters
    ----------
    df : DataFrame with "Sales" (target) and "valflag" columns; rows with
        ``valflag == 1`` form the hold-out set, all others the training set.
    MODEL : key into ``clf_dict``; also used in the output file names.

    Side effects: prints the grid-search results, and writes optional
    ``coef_<MODEL>_validation.csv`` / ``importance_<MODEL>_validation.csv``
    and ``submission_validation_<MODEL>.csv`` under ``SUBMISSION``.
    """
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("... start validation")
    fu_obj = FeatureUnion(transformer_list=features.feature_list)

    train_df = df[(df["valflag"] != 1)]
    train_X = fu_obj.fit_transform(train_df)
    # ``.values`` replaces ``Series.as_matrix()``, which pandas has removed.
    train_y = train_df["Sales"].values

    clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                       param_grid=clf_dict[MODEL]["paramteters"],
                       n_jobs=3, scoring=rmspe, cv=None)
    clf.fit(train_X, train_y)
    print(clf.grid_scores_)
    print(clf.best_estimator_)
    print(clf.best_score_)
    print(clf.best_params_)

    # Dump per-feature weights for whichever attribute the model exposes.
    index_sr = pd.Series(get_split_feature_list(fu_obj), name="Feature")
    weight_dumps = (
        ("coef_", "Coef", "coef_%s_validation.csv"),
        ("feature_importances_", "Importance", "importance_%s_validation.csv"),
    )
    for attr, column, file_pattern in weight_dumps:
        if hasattr(clf.best_estimator_, attr):
            coef_sr = pd.Series(getattr(clf.best_estimator_, attr), name=column)
            coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
            coef_df.to_csv(SUBMISSION + file_pattern % MODEL)

    # Predict on the hold-out rows and store prediction next to ground truth.
    val_df = df[(df["valflag"] == 1)]
    test_X = fu_obj.transform(val_df)
    test_y = val_df["Sales"].values
    y_pred = clf.predict(test_X)
    pred_sr = pd.Series(y_pred, name="Sales_Pred")
    y_sr = pd.Series(test_y, name="Sales")
    # Shift the row index so it starts at 1.
    res = pd.concat([pred_sr, y_sr], axis=1).rename(index=lambda x: x + 1)
    submissionfile = SUBMISSION + "submission_validation_%s.csv" % MODEL
    res.to_csv(submissionfile)
class PerClassFeatureSelector:
    """Fit one copy of a transformer per class and concatenate their outputs.

    For every distinct normalised genre in ``y`` a deep copy of the first
    transformer passed to the constructor is fitted against a one-vs-rest
    boolean target; the fitted copies are combined in a FeatureUnion.
    Only ``transformers[0]`` is used; any further transformers are stored but
    ignored by ``fit``.
    """

    def __init__(self, *transformers):
        self.transformers = transformers
        # FeatureUnion of the per-class transformers; set by fit().
        self.transformer = None

    def fit(self, X, y):
        """Fit one transformer per genre found in ``y`` and return ``self``."""
        feature_logger.info("Fitting transformers for each class")

        # Normalise every label once instead of once per class.
        normalized = [normalize_genre_string(label, 1) for label in y]

        transformer_list = []
        # Sorted so the order of the output column groups is reproducible;
        # plain set iteration order is arbitrary.
        # NOTE(review): assumes the normalised genres are mutually orderable
        # (e.g. all strings).
        for genre in sorted(set(normalized)):
            feature_logger.info("Fitting transformer for {}".format(genre))
            transformer_obj = copy.deepcopy(self.transformers[0])
            # One-vs-rest target: True where the sample belongs to this genre.
            genre_matches = [genre == other for other in normalized]
            transformer_obj.fit(X, genre_matches)
            transformer_list.append((genre, transformer_obj))

        # n_jobs passed by keyword so the call does not depend on the
        # positional parameter order of FeatureUnion.
        self.transformer = FeatureUnion(transformer_list, n_jobs=1)
        return self

    def transform(self, X):
        """Return the concatenated per-class features for ``X``."""
        return self.transformer.transform(X)

    def fit_transform(self, X, y):
        """Fit on ``(X, y)`` and return the transformed ``X``."""
        self.fit(X, y)
        return self.transform(X)
def test_same_result_weight(self):
    """Weighted local and Spark feature unions must produce identical matrices."""
    X, Z = self.make_text_rdd(2)

    char_options = dict(analyzer="char_wb", ngram_range=(3, 3))
    loc_char = CountVectorizer(**char_options)
    dist_char = SparkCountVectorizer(**char_options)
    loc_word = CountVectorizer(analyzer="word")
    dist_word = SparkCountVectorizer(analyzer="word")

    loc_union = FeatureUnion(
        [("chars", loc_char), ("words", loc_word)],
        transformer_weights={"words": 10})
    dist_union = SparkFeatureUnion(
        [("chars", dist_char), ("words", dist_word)],
        transformer_weights={"words": 10})

    loc_union.fit(X)
    dist_union.fit(Z)

    local_out = loc_union.transform(X)
    # Collect the distributed blocks and stack them into one sparse matrix.
    dist_blocks = dist_union.transform(Z).collect()
    dist_out = sp.vstack(dist_blocks)
    assert_array_equal(local_out.toarray(), dist_out.toarray())
# Tail of a method whose header lies above this excerpt; kept verbatim.
return self.scaler.transform(float_df)


class SelectCategoryVars(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps every column not listed in FLOAT_VARS."""

    def fit(self, X, y=None):
        """Nothing to learn; return self."""
        return self

    def transform(self, X, y=None):
        """Return the columns of ``X`` that are not in FLOAT_VARS as an array."""
        res=[]
        for item in X.columns:
            if item not in FLOAT_VARS:
                res.append(item)
        return np.array(X.loc[:,res])


# Scaled float columns side by side with the untouched non-float columns.
combine_feature = FeatureUnion([("scalingfloat", ScalingFloat()), ("selectcategory", SelectCategoryVars())])
# combine_feature = FeatureUnion([("scalingfloat", ScalingFloat())])
# combine_feature = FeatureUnion([("selectcategory", SelectCategoryVars())])
combine_feature.fit(train)
# The union fitted on train is applied to both sets; both names are rebound.
train = combine_feature.transform(train)
test = combine_feature.transform(test)
# print train.shape

# 10-fold grid search over C, penalty and intercept_scaling (defined elsewhere).
lrc = GridSearchCV(LogisticRegression(), param_grid = dict(C = C, \
    penalty=penalty, intercept_scaling=intercept_scaling), cv = 10)
lrc.fit(train, y_train)
y_train_predict = lrc.predict(train)
y_test_predict = lrc.predict(test)
#scaled results: it seems that though RF has higher accuracy, but logistic regression has no overfitting, which can be a huge advantage
# print 'the best parameter setting is:', lrc.best_estimator_
# # the best parameter setting is: LogisticRegression(C=9, class_weight=None, dual=False, fit_intercept=True,
# #     intercept_scaling=3, penalty='l1', random_state=None, tol=0.0001)
# print 'the best CV score in the GridSearchCV is:', lrc.best_score_
class Agent(object):
    """Q-learning agent with one SGD regressor per action on RBF features."""

    def __init__(self, num_actions, gamma=0.98, memory_size=5000, batch_size=32):
        # Hyper-parameters.
        self.num_actions = num_actions
        self.gamma = gamma
        self.batch_size = batch_size
        # Filled in by initialize_model().
        self.scaler = None
        self.featurizer = None
        self.q_functions = None
        self.memory = ReplayMemory(memory_size)
        self.initialize_model()

    def initialize_model(self):
        """Fit the scaler and featurizer on random samples and create the Q-functions."""
        # Draw some samples from the observation range and initialize the scaler
        bounds = np.array([4.8, 5, 0.5, 5])
        samples = np.random.uniform(-bounds, bounds, (1000, bounds.shape[0]))
        self.scaler = StandardScaler()
        self.scaler.fit(samples)

        # Initialize the RBF featurizer
        rbf_steps = [
            ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
            ("rbf2", RBFSampler(gamma=2.0, n_components=80)),
            ("rbf3", RBFSampler(gamma=1.0, n_components=50)),
        ]
        self.featurizer = FeatureUnion(rbf_steps)
        self.featurizer.fit(self.scaler.transform(samples))

        # Create a value approximator for each action
        self.q_functions = []
        for _ in range(self.num_actions):
            self.q_functions.append(
                SGDRegressor(learning_rate="constant", max_iter=500, tol=1e-3))

        # One partial_fit on all-zero targets so each regressor can predict.
        for regressor in self.q_functions:
            zero_targets = np.zeros((samples.shape[0], ))
            regressor.partial_fit(self.featurize(samples), zero_targets)

    def featurize(self, state):
        """Scale ``state`` and map it to RBF features; a 1-D state becomes one row."""
        if len(state.shape) == 1:
            state = state.reshape(1, -1)
        scaled = self.scaler.transform(state)
        return self.featurizer.transform(scaled)

    def get_action(self, state, epsilon=0.0):
        """Return an epsilon-greedy action index for ``state``."""
        explore = np.random.random() < epsilon
        if explore:
            return int(np.random.random() * self.num_actions)
        features = self.featurize(state)
        q_values = np.array([q.predict(features)[0] for q in self.q_functions])
        return np.argmax(q_values, axis=0)

    def single_update(self, state, action, next_state, reward, done):
        """Apply one TD update of the taken action's Q-function."""
        phi = self.featurize(state)
        phi_next = self.featurize(next_state)

        # Q(s', a) for every action; keep the largest.
        candidates = [q.predict(phi_next) for q in self.q_functions]
        best_next = np.array(max(candidates))
        if done:
            # A terminal transition has no future value.
            best_next = np.zeros(1)

        # Update Q-value estimation towards reward + gamma * max_a Q(s', a)
        target = reward + self.gamma * best_next
        self.q_functions[action].partial_fit(phi, target)

    def update_estimator(self):
        """Apply one SGD step per action using a minibatch from replay memory."""
        if len(self.memory) >= self.batch_size:
            # Sample some data
            batch = self.memory.sample(self.batch_size)
        else:
            # Use the whole memory
            batch = self.memory.memory

        # Reformat the minibatch into one array per field.
        states = np.array([item.state for item in batch])
        action = np.array([item.action for item in batch])
        next_states = np.array([item.next_state for item in batch])
        rewards = np.array([item.reward for item in batch])
        dones = np.array([item.done for item in batch])

        # max_a Q(s', a) for every transition, zeroed for terminal ones.
        phi_next = self.featurize(next_states)
        per_action = np.array([q.predict(phi_next) for q in self.q_functions])
        next_qs = np.max(per_action, axis=0)
        terminal = dones == True
        if np.any(terminal):
            next_qs[terminal] = 0

        # Calculate the updated target values
        targets = rewards + self.gamma * next_qs

        # Calculate featurized states
        phi = self.featurize(states)

        # Get new weights for each action separately
        for a in range(self.num_actions):
            taken = action == a
            # If a not present in the batch, skip and move to the next action
            if not np.any(taken):
                continue
            # Perform a single SGD step on the Q-function params
            self.q_functions[a].partial_fit(phi[taken], targets[taken])

    def store_transition(self, *args):
        """Push a transition into replay memory."""
        self.memory.push(*args)
# Convert the categorical columns of the original data set; keep its row index.
dataset = pd.DataFrame(data=catconversion.fit_transform(orig_dataset), columns=fcols, index=orig_dataset.index)
# Extract the target column(s) and apply the ``nonlinearity`` transform.
target = FeatureColumnsExtractor(
    settings.TARGET).fit_transform(orig_dataset).apply(nonlinearity)
import time
before = time.time()  # start time; only read by the commented-out print below
pipeline = overall_pipeline()
# 4 folds without shuffling.
cv = KFold(len(target), n_folds=4, random_state=2, shuffle=False)
submission = SqrtHazardSubmission(pipeline, 'XGB_Direct_OneHot', cv=cv)
submission.fit(dataset, target, perform_cv=True, scoring=scorer_normalized_gini, n_jobs=2, verbose=3)
# print('fitted. time:', time.time() - before)
original_test_set = pd.read_csv(settings.TEST_FILE)
# Reuse the conversion fitted on the training data for the test set.
test_set = pd.DataFrame(data=catconversion.transform(original_test_set), columns=fcols, index=original_test_set.index)
predictions = submission.predict(test_set)
submission.create_submission(predictions, original_test_set, settings.SUBMIT_MY_XGB_DIRECT_ONE_HOT)
# Load the new (unlabelled) documents: one JSON object per line, ".timetest" files.
new_data = get_data_frame(
    new_data_directory, lambda line: json.loads(line), extension =".timetest")
y_train = data_train.target
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.3, stop_words='english')
# TF-IDF features of the 'content' field.
content_pipeline = FeaturePipeline([
    ('cont1', TextExtractor('content')),
    ('vec', vectorizer),
])
# Hashed bigram features of the 'content' field (bigram finder with score_thr=70).
colloc_pipeline = FeaturePipeline([
    ('cont1', TextExtractor('content')),
    ('coll', ChiSqBigramFinder(score_thr=70)),
    ('vectc', FeatureHasher(input_type="string", non_negative=True))
])
# Concatenate both feature sets.
preprocess = FeatureUnion([
    ('cp', content_pipeline),
    ('op', colloc_pipeline)
])
X_train = preprocess.fit_transform(data_train.data)
X_new = preprocess.transform(new_data.data)
model = LinearSVC(loss='l2',penalty='l2',tol=1e-3)
trained_model = model.fit(X_train,y_train)
# Decision-function scores of the new documents.
stuff = trained_model.decision_function(X_new)
# Standardise features using statistics of the training set only.
scaler = preprocessing.StandardScaler().fit(X_train_raw)
X_train_scaled = scaler.transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)
## PCA and Feature Selection
# The string literal below is a disabled earlier variant (PCA only); it has no effect.
'''pca = PCA(n_components=100) pca.fit(X_train_scaled) #print(pca.explained_variance_ratio_) X_train_reduced = pca.transform(X_train_scaled) X_test_reduced = pca.transform(X_test_scaled) '''
# 800 principal components concatenated with the 850 best univariate features.
pca = PCA(n_components=800)
selection = SelectKBest(k=850)
combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
combined_features.fit(X_train_scaled, train_labels.ravel())
#print(pca.explained_variance_ratio_)
X_train_reduced = combined_features.transform(X_train_scaled)
X_test_reduced = combined_features.transform(X_test_scaled)
## Train final Classifiers
#clf = Ridge(alpha=.5)
# NOTE(review): the selector is fitted on train_labels but the model on
# Y_train_raw; confirm both hold the same targets.
clf = Lasso(alpha=.03)
clf.fit(X_train_reduced, Y_train_raw)
Y_predicted = clf.predict(X_test_reduced)
## Save results to csv
np.savetxt('prediction.csv', Y_predicted, fmt='%.5f',delimiter=',')
("cat_pipeline", cat_pipeline), ]) # In[89]: housing_prepared = preparation_pipeline.fit_transform(housing) #print(housing_prepared) #Linear Regression lin_reg = LinearRegression() lin_reg.fit(housing_prepared, housing_labels) some_data = housing.iloc[:5] some_labels = housing_labels.iloc[:5] some_data_prepared = preparation_pipeline.transform(some_data) print("Predictions:\t", lin_reg.predict(some_data_prepared)) print("Labels:\t\t", list(some_labels)) from sklearn.metrics import mean_squared_error housing_predictions = lin_reg.predict(housing_prepared) lin_mse = mean_squared_error(housing_labels, housing_predictions) lin_rmse = np.sqrt(lin_mse) print(lin_rmse) from sklearn.metrics import mean_absolute_error lin_mae = mean_absolute_error(housing_labels, housing_predictions) print(lin_mae)
# Sweep every pair of dropout rates for the two linear auto-encoders and
# evaluate the classifier on the union of their outputs plus a univariate
# feature selection.
for i in lde_params:
    lde1 = lde(p_dropout=i)
    # lde1.fit(X_train)
    # X_ae = lde1.transform(X_train)
    for j in lde_params:
        lde2 = lde(p_dropout=j)
        # lde2.fit(X_ae)
        # X_ae = lde2.transform(X_ae)
        #
        # lde3=lde(p_dropout=0.15)
        print("For p_dropout = ", i, j)
        # eval_model(clf, X_ae, y, cv=3, n_jobs=-1)
        univ_selection = SelectKBest(k=52)
        combined_features = FeatureUnion([
            ("lde1", lde1),
            ("lde2", lde2),
            ("univ_selection", univ_selection),
        ])
        # Use combined features to transform dataset:
        combined_features.fit(X_train, y)
        X_features = combined_features.transform(X_train)
        eval_model(clf, X_features, y, cv=3, n_jobs=-1)
print("linear AE done")
# ae_params = {'p_dropout':[0.1, 0.3]}
# estimator = GridSearchCV(p, cv = 2,
# # param_grid = dict(lde1__p_dropout = lde_params,lde2__p_dropout = lde_params)
# )
# estimator.fit(X_train, y)
# print("best_estimator_ ",estimator.best_estimator_ )
# print("best_score_ ",estimator.best_score_ )
# Keyword arguments for the stemmed TF-IDF vectorizer, taken from the
# experiment's `parameters` mapping.
vectorizer_param = dict(
    preprocessor=preprocessor,
    ngram_range=parameters['ngram_range'],
    analyzer='word',
    min_df=parameters['min_df'],
    max_df=parameters['max_df'],
    binary=parameters['TF_binary'],
    norm=parameters['norm'],
    sublinear_tf=parameters['sublinear_tf'],
    max_features=parameters['max_features'],
)

if __name__ == "__main__":
    # Candidate feature extractors; only some are combined below.
    unigram = StemmedTfidfVectorizer(**vectorizer_param)
    anew = anew_vectorizer()
    pct = punctuation_estimator()
    strength = strength_vectorizer()
    avg_strength = avg_affective_vectorizer()

    log_state('combine unigram and avg strength features')
    combined_features = FeatureUnion([
        ('unigram', unigram),
        ('avg_strength', avg_strength),
    ])
    # Alternative combinations (disabled):
    # log_state('combine unigram and strength features')
    # combined_features =FeatureUnion([('unigram',unigram),('strength',strength)])
    # log_state('combine unigram and anew features')
    # combined_features =FeatureUnion([('unigram',unigram),('anew',anew)])
    # log_state('combine unigram and punctuation features')
    # combined_features =FeatureUnion([('unigram',unigram),('pct',pct)])

    # Fit on the training texts, then reuse the fitted union on the test set.
    texts, _ = load_train_data('Sentiment140')
    transformed_train = combined_features.fit_transform(texts)
    testdata, _ = load_test_data()
    transformed_test = combined_features.transform(testdata)

    # Persist the feature names and both transformed matrices.
    dump_picle(combined_features.get_feature_names(),
               './data/features/feature_names.p')
    dump_picle(transformed_train,
               "./data/transformed_data/transformed_train.p")
    dump_picle(transformed_test,
               "./data/transformed_data/transformed_test.p")
# Union of all per-box feature extractors.
pipe_feature = FeatureUnion([
    ('window_transformer',
     WindowTransformerList(searched_words=search_words,
                           n_jobs=n_jobs,
                           min_df=min_df)),
    ('bag_of_words',
     BagOfWordInLine(searched_words=search_words,
                     n_jobs=n_jobs,
                     min_df=min_df)),
    ('is_date', IsDate(n_jobs=n_jobs)),
    ("position", BoxPositionGetter()),
    ('is_digit', ContainsDigit(n_jobs=n_jobs)),
    ('is_nom', IsNom(n_jobs=n_jobs)),
    ('is_prenom', IsPrenom(n_jobs=n_jobs)),
])

# Fit on the training split only; the test split reuses the fitted union.
X_train = pipe_feature.fit_transform(X_train)
X_test = pipe_feature.transform(X_test)

data = {
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test,
}

from pickle import dump

# Persist the transformed splits, then the fitted feature union itself.
with open(data_output, 'wb') as f1:
    dump(data, f1)

with open(pipe_feature_output, 'wb') as f2:
    dump(pipe_feature, f2)
class AuthorshipAttribution: """ Implements authorship attribution models.""" def __init__(self, data_set): self.corpus = [] self.book_labels = [] self.author_labels = [] self.tags = [] for item in data_set: self.corpus.append(item["text"]) self.book_labels.append(item["book"]) self.author_labels.append(item["author"]) self.tags.append(item["pos"]) self.sample_clf = None self.author_clf = None self.author_lms = {} # word ngram feature generators self.word_vector = TfidfVectorizer(analyzer="word", ngram_range=(2, 2), max_features=2000, binary=False, decode_error='ignore') # char ngram feature generators self.char_vector = TfidfVectorizer(analyzer="char", ngram_range=(2, 3), max_features=2000, binary=False, decode_error='ignore', min_df=0) # POS ngram feature generators self.tag_vector = TfidfVectorizer(analyzer="word", ngram_range=(2, 2), max_features=2000, binary=False, decode_error='ignore') # punctuation frequency feature generators self.punct_vector = TfidfVectorizer(analyzer='char', preprocessor=util.retain_punct, max_features=2000, binary=False, use_idf=False, decode_error='ignore') # concatenate features generators self.vectorizer = FeatureUnion([("chars", self.char_vector), ("words", self.word_vector), ("puncts", self.punct_vector)]) # generate features print "- Generating features" X1 = self.vectorizer.fit_transform(self.corpus) X2 = self.tag_vector.fit_transform(self.tags) # concatenate two feature matrices matrix = sp.hstack((X1, X2)) self.X = matrix.toarray() def generate_test_features(self, corpus, classes, tags): """Generate feature matrix of the test corpus passes as argument.""" X1 = self.vectorizer.transform(corpus) X2 = self.tag_vector.transform(tags) # concatenate two matrices matrix = sp.hstack((X1, X2)) X = matrix.toarray() y = np.asarray(classes) return (X, y) def train_sample_model(self): """Train classifier needed to predict the book using sample text.""" print "- Training book/work model" save_location = os.path.join("models", "clf", 
"sample_model.p") # check and load if a saved model is present, if not train a new one if os.path.isfile(save_location): self.sample_clf = pickle.load(open(save_location, "rb")) else: X_train, y_train = self.X, np.asarray(self.book_labels) model = SVC(kernel='rbf') self.sample_clf = model.fit(X_train, y_train) pickle.dump(self.sample_clf, open(save_location, "wb")) def train_author_model(self): """Train classifier needed to predict the author using sample text.""" print "- Training author model" save_location = os.path.join("models", "clf", "author_model.p") # check and load if a saved model is present, if not train a new one if os.path.isfile(save_location): self.author_clf = pickle.load(open(save_location, "rb")) else: X_train, y_train = self.X, np.asarray(self.author_labels) model = LinearSVC(loss='hinge', dual=True) self.author_clf = model.fit(X_train, y_train) pickle.dump(self.author_clf, open(save_location, "wb")) def train_lang_model(self): """Train language model needed to predict the next word.""" print "- Training language model" author_data = {} for author, book in zip(self.author_labels, self.corpus): save_location = os.path.join("models", "lm", author+".p") if os.path.isfile(save_location): continue else: if author in author_data: author_data[author] = author_data[author] + book else: author_data[author] = book print " - LM for: [", for author in set(self.author_labels): print author, save_location = os.path.join("models", "lm", author+".p") if os.path.isfile(save_location): lm = LangModel() lm.load(save_location) self.author_lms[author] = lm else: lm = LangModel() works = author_data[author] lm.train(works) self.author_lms[author] = lm lm.save(save_location) print " ]" def predict_word(self, context, author=None): """Predict next word. This first predicts the author if author is not passed as an argument, then predicts the next word using author's language model. 
""" if not author: author = self.recognize_author([context])[0] # print "- predicting word using {}'s language model.".format(author) author_lm = self.author_lms[author] return author_lm.predict(context) def recognize_sample(self, test_text, test_class=None, test_tags=None): """Interface to call the classifier and predict the work using sample text. """ if not test_tags: text_tags = [util.get_pos_tags(txt) for txt in test_text] else: text_tags = test_tags text_class = test_class if not (self.sample_clf): self.train_sample_model() X_test, y_test = self.generate_test_features(test_text, text_class, text_tags) y_pred = self.sample_clf.predict(X_test) return y_pred def recognize_author(self, test_text, test_class=None, test_tags=None): """Interface to call the classifier and predict the author using sample text. """ if not test_tags: text_tags = [util.get_pos_tags(txt) for txt in test_text] else: text_tags = test_tags text_class = test_class if not (self.author_clf): self.train_author_model() X_test, y_test = self.generate_test_features(test_text, text_class, text_tags) y_pred = self.author_clf.predict(X_test) return y_pred
# Models that are fitted with an early-stopping evaluation set.
_XGB_MODELS = ("XGB_REG", "XGB_RANK", "XGB_REG2")


def _fit_grid_search(model, X, y, xgb_verbose):
    """Grid-search the estimator registered in clf_dict[model] on (X, y)."""
    spec = clf_dict[model]
    if model in _XGB_MODELS:
        # hold out 20% of the data as the early-stopping evaluation set
        fit_X, eval_X, fit_y, eval_y = train_test_split(X, y, test_size=0.2)
        fit_param = {"eval_set": [(fit_X, fit_y), (eval_X, eval_y)],
                     "early_stopping_rounds": 50}
        search = GridSearchCV(estimator=spec["clf"](),
                              param_grid=spec["paramteters"],
                              n_jobs=3,
                              verbose=xgb_verbose,
                              fit_params=fit_param)
    else:
        fit_X, fit_y = X, y
        search = GridSearchCV(estimator=spec["clf"](),
                              param_grid=spec["paramteters"],
                              n_jobs=3,
                              verbose=2)
    search.fit(fit_X, fit_y)
    return search


def _write_submission(y_pred, test_df, tag, MODELS):
    """Write y_pred, indexed by test_df["Id"], to submission_<tag>_<MODELS>.csv."""
    pred_sr = pd.Series(y_pred, name="Response", index=test_df["Id"])
    submissionfile = SUBMISSION + "submission_%s_%s.csv" % (tag, MODELS)
    pred_sr.to_csv(submissionfile, header=True, index_label="ID")


def prediction(train_df, test_df, MODELS):
    """Fit the requested models and write the submission files.

    Features are built with a FeatureUnion over
    ``features.get_feature_list()``. Each model is grid-searched inside a
    5-fold CV to collect validation predictions, then refitted on the full
    training data. With several models, their predictions are blended (linear
    regression and plain average). The blended or single prediction is
    post-processed with the offset / cut-point optimizers and one CSV per
    optimizer is written under ``SUBMISSION``. The transformed train/test
    features are also dumped as CSV.

    Args:
        train_df: training frame; must contain a "Response" column.
        test_df: test frame; must contain an "Id" column.
        MODELS: comma-separated keys of ``clf_dict`` naming the models to fit.

    Returns:
        None. All results are written to files.

    Raises:
        KeyError: if a name in ``MODELS`` is not registered in ``clf_dict``.
    """
    print("...create feature")
    fu_obj = FeatureUnion(transformer_list=features.get_feature_list())
    train_X = fu_obj.fit_transform(train_df, train_df["Response"])
    train_y = train_df["Response"].as_matrix()

    # dump the transformed features for offline inspection
    train_dump_df = pd.DataFrame(train_X,
                                 columns=get_split_feature_list(fu_obj))
    train_dump_df["target"] = train_y
    train_dump_df["ID"] = -1
    train_dump_df.to_csv(SUBMISSION + "train_dump.csv", index=False)

    test_X = fu_obj.transform(test_df)
    test_dump_df = pd.DataFrame(test_X,
                                columns=get_split_feature_list(fu_obj))
    test_dump_df["ID"] = test_df["Id"]
    test_dump_df.to_csv(SUBMISSION + "test_dump.csv", index=False)

    # post-processors; the "*2" instances serve the averaged ensemble
    oc_obj = oc.OptimCutPoint()
    oc_obj2 = oc.OptimCutPoint()
    oo_obj = oo.OptimOffset()
    oo_obj2 = oo.OptimOffset()
    oo_all_obj = oo.OptimOffset(True)
    oo_all_obj2 = oo.OptimOffset(True)

    model_list = MODELS.split(",")
    clf_list = []
    valid_list = []
    kf = KFold(len(train_X), random_state=7777, n_folds=5)

    print("...start fitting")
    for model in model_list:
        print("... start fit %s model" % model)
        # out-of-fold predictions and labels for this model
        valid_pred_list = []
        valid_label_list = []
        for train_index, test_index in kf:
            f_clf = _fit_grid_search(model,
                                     train_X[train_index],
                                     train_y[train_index],
                                     xgb_verbose=2)
            valid_pred_list.append(f_clf.predict(train_X[test_index]))
            valid_label_list.append(train_y[test_index])
        valid_list.append(np.concatenate(valid_pred_list))
        concat_valid_y = np.concatenate(valid_label_list)

        # final model for this name, refitted on the full training data
        clf = _fit_grid_search(model, train_X, train_y, xgb_verbose=1)
        clf_list.append(clf)

    print("... start optim cutting")
    if len(clf_list) > 1:
        # one column of predictions per model
        test_predict_list = [c.predict(test_X) for c in clf_list]
        valid_predict_X = np.c_[valid_list].T
        test_predict_X = np.c_[test_predict_list].T

        # learn blending weights on the out-of-fold predictions
        linear_reg = sklearn.linear_model.LinearRegression()
        linear_reg.fit(valid_predict_X, concat_valid_y)
        print(linear_reg.intercept_)
        print(linear_reg.coef_)

        valid_ave_predict = valid_predict_X.mean(axis=1)[None].T
        valid_predict = linear_reg.predict(valid_predict_X)[None].T
        test_ave_predict = test_predict_X.mean(axis=1)[None].T
        test_predict = linear_reg.predict(test_predict_X)[None].T
    else:
        # NOTE(review): the single-model case calibrates on in-sample
        # predictions, not the out-of-fold ones in valid_list - confirm
        # this is intended.
        use_clf = clf_list[0]
        concat_valid_y = train_y
        valid_predict = use_clf.predict(train_X)[None].T
        test_predict = use_clf.predict(test_X)[None].T

    print("...start y_pred")
    oo_obj.fit(valid_predict, concat_valid_y)
    oo_all_obj.fit(valid_predict, concat_valid_y)
    oc_obj.fit(valid_predict, concat_valid_y)

    _write_submission(oo_obj.transform(test_predict),
                      test_df, "offset", MODELS)
    _write_submission(oo_all_obj.transform(test_predict),
                      test_df, "offset_all", MODELS)
    _write_submission(oc_obj.transform(test_predict),
                      test_df, "cutpoint", MODELS)

    if len(clf_list) > 1:
        oo_obj2.fit(valid_ave_predict, concat_valid_y)
        # Fit on the averaged predictions: this optimizer is applied to
        # test_ave_predict below, so it must be calibrated on the matching
        # averaged validation predictions (it was previously fitted on the
        # regression-blended ones).
        oo_all_obj2.fit(valid_ave_predict, concat_valid_y)

        _write_submission(oo_obj2.transform(test_ave_predict),
                          test_df, "offset_ave", MODELS)
        _write_submission(oo_all_obj2.transform(test_ave_predict),
                          test_df, "offset_all_ave", MODELS)

        oc_obj2.fit(valid_ave_predict, concat_valid_y)
        _write_submission(oc_obj2.transform(test_ave_predict),
                          test_df, "cutpoint_ave", MODELS)
    print("... finish y_pred")
Xtrain = vectorizer.fit_transform(Xtrain) print('Shape of Xtrain:', Xtrain.shape) print('Numerifying labels...') le = LabelEncoder() # Fit label encoder on Y for now, not Ytrain, to ensure we really 'know' all labels in the val set (This should usually be guaranteed by task) le.fit(Y) Ytrain = le.transform(Ytrain) print('Shape of Ytrain:', Ytrain.shape) print('Fitting SVM ...') clf = LinearSVC(random_state=0) clf.fit(Xtrain, Ytrain) print('Predicting...') Yguess_svm = clf.predict(vectorizer.transform(Xtest)) # Transform Yguess back to nominal labels Yguess_svm = le.inverse_transform(Yguess_svm) # Evaluate on val set print() print('*' * 50) print('Results for SVM baseline:') evaluate(Ytest, Yguess_svm) print('*' * 50) ''' # classifier_svm = Pipeline([('vec', vectorizer), # ('classify', SVC(kernel=Kernel, C=C_val))]) X_mat = vectorizer.fit_transform(X) print('shape of X_mat:', X_mat.shape)