def test_feature_stacker():
    """Basic sanity check for stacking RandomizedPCA with SelectKBest."""
    iris = load_iris()
    data, target = iris.data, iris.target
    data -= data.mean(axis=0)
    pca = RandomizedPCA(n_components=2)
    select = SelectKBest(k=1)
    union = FeatureUnion([("pca", pca), ("select", select)])
    union.fit(data, target)
    stacked = union.transform(data)
    assert_equal(stacked.shape, (data.shape[0], 3))
    # PCA supplies the first two columns, SelectKBest the last one.
    assert_array_almost_equal(stacked[:, :-1], pca.fit_transform(data))
    assert_array_equal(stacked[:, -1],
                       select.fit_transform(data, target).ravel())
    # Sparse input must produce the same stacked features.
    stacked_sp = union.fit_transform(sparse.csr_matrix(data), target)
    assert_array_almost_equal(stacked, stacked_sp.toarray())
    # Nested transformer parameters are reachable through set_params.
    union.set_params(select__k=2)
    assert_equal(union.fit_transform(data, target).shape, (data.shape[0], 4))
def test_set_feature_union_step_none():
    """Setting a FeatureUnion step to None drops it from the union."""
    double = Mult(2)
    double.get_feature_names = lambda: ['x2']
    triple = Mult(3)
    triple.get_feature_names = lambda: ['x3']
    X = np.asarray([[1]])
    union = FeatureUnion([('m2', double), ('m3', triple)])
    assert_array_equal([[2, 3]], union.fit(X).transform(X))
    assert_array_equal([[2, 3]], union.fit_transform(X))
    assert_equal(['m2__x2', 'm3__x3'], union.get_feature_names())
    # Disable the first step: only the tripler remains.
    union.set_params(m2=None)
    assert_array_equal([[3]], union.fit(X).transform(X))
    assert_array_equal([[3]], union.fit_transform(X))
    assert_equal(['m3__x3'], union.get_feature_names())
    # Disable the second step as well: no features are produced.
    union.set_params(m3=None)
    assert_array_equal([[]], union.fit(X).transform(X))
    assert_array_equal([[]], union.fit_transform(X))
    assert_equal([], union.get_feature_names())
    # A disabled step can be restored afterwards.
    union.set_params(m3=triple)
    assert_array_equal([[3]], union.fit(X).transform(X))
def test_feature_union():
    """Basic sanity check for FeatureUnion combining SVD and SelectKBest."""
    iris = load_iris()
    data, target = iris.data, iris.target
    data -= data.mean(axis=0)
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    union = FeatureUnion([("svd", svd), ("select", select)])
    union.fit(data, target)
    combined = union.transform(data)
    assert_equal(combined.shape, (data.shape[0], 3))
    # SVD supplies the first two columns, SelectKBest the last one.
    assert_array_almost_equal(combined[:, :-1], svd.fit_transform(data))
    assert_array_equal(combined[:, -1],
                       select.fit_transform(data, target).ravel())
    # Sparse input: rebuild the union so the random_state stream matches.
    union = FeatureUnion([("svd", svd), ("select", select)])
    combined_sp = union.fit_transform(sparse.csr_matrix(data), target)
    assert_array_almost_equal(combined, combined_sp.toarray())
    # Nested parameters are settable through the union.
    union.set_params(select__k=2)
    assert_equal(union.fit_transform(data, target).shape, (data.shape[0], 4))
    # Transformers lacking fit_transform fall back to fit().transform().
    union = FeatureUnion([("mock", TransfT()), ("svd", svd),
                          ("select", select)])
    combined = union.fit_transform(data, target)
    assert_equal(combined.shape, (data.shape[0], 8))
def test_feature_union_weights():
    """FeatureUnion applies transformer_weights to each block's output."""
    iris = load_iris()
    data, target = iris.data, iris.target
    pca = RandomizedPCA(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    # Variant 1: fit followed by a separate transform call.
    union = FeatureUnion([("pca", pca), ("select", select)],
                         transformer_weights={"pca": 10})
    union.fit(data, target)
    out_two_step = union.transform(data)
    # Variant 2: single fit_transform call.
    union = FeatureUnion([("pca", pca), ("select", select)],
                         transformer_weights={"pca": 10})
    out_one_step = union.fit_transform(data, target)
    # Variant 3: a transformer without a fit_transform method.
    union = FeatureUnion([("mock", TransfT()), ("pca", pca),
                          ("select", select)],
                         transformer_weights={"mock": 10})
    out_no_method = union.fit_transform(data, target)
    # Compare against freshly fitted transformers (controls random_state).
    assert_array_almost_equal(out_two_step[:, :-1],
                              10 * pca.fit_transform(data))
    assert_array_equal(out_two_step[:, -1],
                       select.fit_transform(data, target).ravel())
    assert_array_almost_equal(out_one_step[:, :-1],
                              10 * pca.fit_transform(data))
    assert_array_equal(out_one_step[:, -1],
                       select.fit_transform(data, target).ravel())
    assert_equal(out_no_method.shape, (data.shape[0], 7))
def train_model(trainset, testset): word_vector = TfidfVectorizer(analyzer="word", ngram_range=(2,2), binary = False, max_features= 2000,min_df=1,decode_error="ignore") # print word_vector # print "works fine" char_vector = TfidfVectorizer(ngram_range=(2,3), analyzer="char", binary = False, min_df = 1, max_features = 2000,decode_error= "ignore") vectorizer =FeatureUnion([ ("chars", char_vector),("words", word_vector) ]) corpus = [] classes = [] testclasses = [] testcorpus = [] for item in trainset: corpus.append(item['text']) classes.append(item['label']) for item in testset: testcorpus.append(item['text']) testclasses.append(item['label']) # print "Training instances : ", len(classes) # print "Testing instances : ", len(set(classes)) matrix = vectorizer.fit_transform(corpus) testmatrix = vectorizer.fit_transform(testcorpus) # print "feature count :. ", len(vectorizer.get_feature_names()) # print "training model" X = matrix.toarray() TX = testmatrix.toarray() Ty= numpy.asarray(testclasses) y = numpy.asarray(classes) X_train, X_test, y_train, y_test= train_test_split(X,y,train_size=0.9999,test_size=.00001,random_state=0) model = LinearSVC(dual=True, loss='l1') # model = SVC() # model = NuSVC() # model = RandomForestClassifier() #scores=cross_validation.cross_val_score(model,X,y) #print "Accuracy "+ str(scores.mean()) # print y_pred y_prob = model.fit(X_train, y_train).predict(TX) # y_prob = OneVsRestClassifier(model).fit(X_train, y_train).predict(X_test) # print(y_prob) # cm = confusion_matrix(y_test, y_pred) # cr = classification_report(y_test, y_pred) # print cr # print(cm) # pl.matshow() # pl.title('Confusion matrix#') # pl.colorbar() # pl.ylabel('True label') # pl.xlabel('Predicted label') # pl.show() print accuracy_score(y_prob,Ty)
def make_checkdata(mode="df"):
    """Build a tiny (2-image) sanity-check dataset.

    mode="df":      return (train_df, train_keys, test_df, test_keys).
    mode="feature": return (X_train, y_train, X_test) feature matrices.
    Any other mode implicitly returns None.  Python 2 code (xrange,
    dict.keys() slicing).
    """
    fu = FeatureUnion(transformer_list=f.feature_transformer_rule)
    Std = preprocessing.StandardScaler()
    _, _, _, train_gray_data, test_gray_data, _, labels = i_p.load_data()
    # Keep only the first two images for a fast check run.
    train_keys = train_gray_data.keys()[:2]
    train_inputs = {}
    train_labels = {}
    for i in xrange(len(train_keys)):
        input_ = train_gray_data[train_keys[i]]
        label = labels[train_keys[i]]
        train_inputs.update({train_keys[i]: input_})
        train_labels.update({train_keys[i]: label})
    test_keys = test_gray_data.keys()[:2]
    test_inputs = {}
    for i in xrange(len(test_keys)):
        input_ = test_gray_data[test_keys[i]]
        test_inputs.update({test_keys[i]: input_})
    train_df = f.make_data_df(train_inputs, train_labels)
    test_df = f.make_test_df(test_inputs)
    if mode == "df":
        # Expose the index as ordinary columns for the raw-dataframe mode.
        train_df = train_df.reset_index()
        test_df = test_df.reset_index()
        train_df.columns = ["pngname", "input", "label"]
        test_df.columns = ["pngname", "input"]
        return train_df, train_keys, test_df, test_keys
    elif mode == "feature":
        X_train = fu.fit_transform(train_df)
        X_train = Std.fit_transform(X_train)
        # Labels are per-pixel arrays; flatten each and concatenate.
        y_train = np.concatenate(train_df["label"].apply(lambda x: x.flatten()))
        # NOTE(review): the feature union and scaler are re-fitted on the
        # test frame; transform() would keep train/test features on the
        # same scale — confirm whether the re-fit is intentional.
        X_test = fu.fit_transform(test_df)
        X_test = Std.fit_transform(X_test)
        return X_train, y_train, X_test
def test_feature_union():
    """Sanity checks for FeatureUnion: dense/sparse input, cloning,
    parameter setting, transformers without fit_transform, validation
    errors, and tuple-based construction."""
    iris = load_iris()
    data, target = iris.data, iris.target
    data -= data.mean(axis=0)
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    union = FeatureUnion([("svd", svd), ("select", select)])
    union.fit(data, target)
    combined = union.transform(data)
    assert_equal(combined.shape, (data.shape[0], 3))
    # SVD supplies the first two columns, SelectKBest the last one.
    assert_array_almost_equal(combined[:, :-1], svd.fit_transform(data))
    assert_array_equal(combined[:, -1],
                       select.fit_transform(data, target).ravel())
    # Sparse input: use a fresh union to control the random_state stream.
    union = FeatureUnion([("svd", svd), ("select", select)])
    combined_sp = union.fit_transform(sparse.csr_matrix(data), target)
    assert_array_almost_equal(combined, combined_sp.toarray())
    # Cloning must deep-copy the transformer list without warnings.
    union_clone = assert_no_warnings(clone, union)
    assert_false(union.transformer_list[0][1]
                 is union_clone.transformer_list[0][1])
    # Nested parameters are settable through the union.
    union.set_params(select__k=2)
    assert_equal(union.fit_transform(data, target).shape, (data.shape[0], 4))
    # Transformers lacking fit_transform fall back to fit().transform().
    union = FeatureUnion([("mock", Transf()), ("svd", svd),
                          ("select", select)])
    combined = union.fit_transform(data, target)
    assert_equal(combined.shape, (data.shape[0], 8))
    # Estimators without fit/transform are rejected at construction time.
    assert_raises_regex(TypeError, 'All estimators should implement fit and '
                        'transform.*\\bNoTrans\\b',
                        FeatureUnion,
                        [("transform", Transf()), ("no_transform", NoTrans())])
    # The constructor also accepts a tuple of (name, transformer) pairs.
    union = FeatureUnion((("svd", svd), ("select", select)))
    union.fit(data, target)
def test_feature_union_parallel():
    """FeatureUnion must produce identical results with n_jobs > 1."""
    X = JUNK_FOOD_DOCS

    def fresh_transformers():
        # Each union gets its own vectorizer instances.
        return [("words", CountVectorizer(analyzer="word")),
                ("chars", CountVectorizer(analyzer="char"))]

    serial = FeatureUnion(fresh_transformers())
    parallel = FeatureUnion(fresh_transformers(), n_jobs=2)
    parallel2 = FeatureUnion(fresh_transformers(), n_jobs=2)

    serial.fit(X)
    expected = serial.transform(X)
    assert_equal(expected.shape[0], len(X))

    parallel.fit(X)
    got = parallel.transform(X)
    assert_equal(expected.shape, got.shape)
    assert_array_equal(expected.toarray(), got.toarray())

    # fit_transform must behave the same as fit().transform().
    got2 = parallel2.fit_transform(X)
    assert_array_equal(expected.toarray(), got2.toarray())

    # Transformers must stay fitted after fit_transform.
    got2 = parallel2.transform(X)
    assert_array_equal(expected.toarray(), got2.toarray())
def prediction(train_df, test_df, MODEL):
    """Grid-search a model on train_df, dump diagnostics, write a submission.

    ``MODEL`` is a key into the module-level ``clf_dict`` (estimator plus
    parameter grid).  Writes coefficient/importance CSVs and a submission
    CSV under the SUBMISSION directory.  Python 2 code (print statements).
    """
    print "... start prediction"
    # Build the feature matrix defined by features.feature_list.
    fu_obj = FeatureUnion(transformer_list=features.feature_list)
    train_X = fu_obj.fit_transform(train_df)
    train_y = train_df["Sales"].as_matrix()
    # Note: "paramteters" is a (consistent) typo in clf_dict's keys.
    clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                       param_grid=clf_dict[MODEL]["paramteters"],
                       n_jobs=3, scoring=rmspe, verbose=1)
    clf.fit(train_X, train_y)
    print clf.best_score_
    index_sr = pd.Series(get_split_feature_list(fu_obj), name="Feature")
    # Linear models expose coef_: dump per-feature coefficients.
    if hasattr(clf.best_estimator_, "coef_"):
        coef_sr = pd.Series(clf.best_estimator_.coef_, name="Coef")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "coef_%s.csv" % MODEL
        coef_df.to_csv(coeffile)
    # Tree ensembles expose feature_importances_: dump those as well.
    if hasattr(clf.best_estimator_, "feature_importances_"):
        coef_sr = pd.Series(clf.best_estimator_.feature_importances_,
                            name="Importance")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "importance_%s.csv" % MODEL
        coef_df.to_csv(coeffile)
    print "... start y_pred"
    # Reuse the already-fitted feature union (transform only) on test data.
    test_X = fu_obj.transform(test_df)
    y_pred = clf.predict(test_X)
    pred_sr = pd.Series(y_pred, name="Sales", index=test_df["Id"])
    submissionfile = SUBMISSION + "submission_%s.csv" % MODEL
    pred_sr.to_csv(submissionfile, header=True, index_label="ID")
def train_model(trainset): word_vector = TfidfVectorizer(analyzer="word", ngram_range=(2,2), binary = False, max_features= 2000,min_df=1,decode_error="ignore") # print word_vector print "works fine" char_vector = TfidfVectorizer(ngram_range=(2,3), analyzer="char", binary = False, min_df = 1, max_features = 2000,decode_error= "ignore") vectorizer =FeatureUnion([ ("chars", char_vector),("words", word_vector) ]) corpus = [] classes = [] for item in trainset: corpus.append(item['text']) classes.append(item['label']) print "Training instances : ", 0.8*len(classes) print "Testing instances : ", 0.2*len(classes) matrix = vectorizer.fit_transform(corpus) print "feature count : ", len(vectorizer.get_feature_names()) print "training model" X = matrix.toarray() y = numpy.asarray(classes) model =LinearSVC() X_train, X_test, y_train, y_test= train_test_split(X,y,train_size=0.8,test_size=.2,random_state=0) y_pred = OneVsRestClassifier(model).fit(X_train, y_train).predict(X_test) #y_prob = OneVsRestClassifier(model).fit(X_train, y_train).decision_function(X_test) #print y_prob #con_matrix = [] #for row in range(len(y_prob)): # temp = [y_pred[row]] # for prob in y_prob[row]: # temp.append(prob) # con_matrix.append(temp) #for row in con_matrix: # output.write(str(row)+"\n") #print y_pred #print y_test res1=[i for i, j in enumerate(y_pred) if j == 'anonEdited'] res2=[i for i, j in enumerate(y_test) if j == 'anonEdited'] reset=[] for r in res1: if y_test[r] != "anonEdited": reset.append(y_test[r]) for r in res2: if y_pred[r] != "anonEdited": reset.append(y_pred[r]) output=open(sys.argv[2],"w") for suspect in reset: output.write(str(suspect)+"\n") cm = confusion_matrix(y_test, y_pred) print(cm) pl.matshow(cm) pl.title('Confusion matrix') pl.colorbar() pl.ylabel('True label') pl.xlabel('Predicted label') pl.show() print accuracy_score(y_pred,y_test)
def convert_testdata(test_gray_data, feature_rule=f.feature_transformer_rule):
    """Extract (unscaled) feature vectors for the test images."""
    frame = f.make_test_df(test_gray_data)
    union = FeatureUnion(transformer_list=feature_rule)
    Std = preprocessing.StandardScaler()  # constructed; scaling is disabled
    features_ = union.fit_transform(frame)
    # X_test = Std.fit_transform(X_test)  # standardization intentionally off
    return features_
def prediction(train_df, test_df, MODEL):
    """Grid-search a regressor on log-sales, dump data/diagnostics, predict.

    ``MODEL`` is a key into the module-level ``clf_dict``; "XGB" gets an
    extra early-stopping validation split.  Writes train/test dumps,
    coefficient/importance CSVs, and a submission CSV under SUBMISSION.
    Python 2 code (print statements).
    """
    print "... start prediction"
    fu_obj = FeatureUnion(transformer_list=features.feature_list)
    # Only open stores with positive sales contribute to training.
    train_df = train_df[(train_df["Open"] == 1) & (train_df["Sales"] > 0)]
    train_X = fu_obj.fit_transform(train_df)
    # Model the log of sales; predictions are inverted with expm1 below.
    train_y = np.log1p(train_df["Sales"]).as_matrix()
    train_dump_df = pd.DataFrame(train_X,
                                 columns=get_split_feature_list(fu_obj))
    train_dump_df["target"] = train_y
    # Drop rows with any missing feature value.
    train_dump_df = train_dump_df.dropna(axis=0)
    print train_dump_df.shape
    train_X = train_dump_df[get_split_feature_list(fu_obj)].values
    train_y = train_dump_df["target"].values
    train_dump_df["ID"] = -1
    train_dump_df.to_csv(SUBMISSION + "train_dump.csv", index=False)
    # Test features use transform() so columns match the training matrix.
    test_X = fu_obj.transform(test_df)
    test_dump_df = pd.DataFrame(test_X,
                                columns=get_split_feature_list(fu_obj))
    print (test_dump_df == 0).sum(axis=0)
    test_dump_df["ID"] = test_df["Id"]
    test_dump_df.to_csv(SUBMISSION + "test_dump.csv", index=False)
    if MODEL == "XGB":
        # Hold out 5% for XGBoost early stopping.
        train_X, valid_X, train_y, valid_y =\
            train_test_split(train_X, train_y, test_size=0.05)
        fit_param = {"eval_set": [(train_X, train_y), (valid_X, valid_y)],
                     "eval_metric": rmspe_xg,
                     "early_stopping_rounds": 100}
        # Note: "paramteters" is a (consistent) typo in clf_dict's keys.
        clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                           param_grid=clf_dict[MODEL]["paramteters"],
                           n_jobs=3, scoring=rmspe, verbose=1,
                           fit_params=fit_param)
    else:
        clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                           param_grid=clf_dict[MODEL]["paramteters"],
                           n_jobs=3, scoring=rmspe, verbose=1)
    clf.fit(train_X, train_y)
    print clf.best_score_
    index_sr = pd.Series(get_split_feature_list(fu_obj), name="Feature")
    # Linear models expose coef_: dump per-feature coefficients.
    if hasattr(clf.best_estimator_, "coef_"):
        coef_sr = pd.Series(clf.best_estimator_.coef_, name="Coef")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "coef_%s.csv" % MODEL
        coef_df.to_csv(coeffile)
    # Tree ensembles expose feature_importances_: dump those as well.
    if hasattr(clf.best_estimator_, "feature_importances_"):
        coef_sr = pd.Series(clf.best_estimator_.feature_importances_,
                            name="Importance")
        coef_df = pd.concat([index_sr, coef_sr],
                            axis=1).set_index("Feature")
        coeffile = SUBMISSION + "importance_%s.csv" % MODEL
        coef_df.to_csv(coeffile)
    print "... start y_pred"
    # Invert the log1p target transform before writing predictions.
    y_pred = np.expm1(clf.predict(test_X))
    pred_sr = pd.Series(y_pred, name="Sales", index=test_df["Id"])
    submissionfile = SUBMISSION + "submission_%s.csv" % MODEL
    pred_sr.to_csv(submissionfile, header=True, index_label="ID")
def set_traindata(df, key):
    """Return (X, y) training arrays from df; ``key`` is accepted but unused."""
    union = FeatureUnion(transformer_list=f.feature_transformer_rule)
    scaler = preprocessing.StandardScaler()
    raw_features = union.fit_transform(df)
    # Per-pixel label arrays are flattened and concatenated into one vector.
    y = np.concatenate(df["label"].apply(lambda arr: arr.flatten()))
    X = scaler.fit_transform(raw_features)
    return (X, y)
def cv_score(train_df, MODEL):
    """Print 5-fold cross-validation RMSPE scores for the chosen model.

    ``MODEL`` is a key into the module-level ``clf_dict``.  The inner
    GridSearchCV is itself cross-validated (nested CV).  Python 2 code
    (print statements).
    """
    print "... start cross validation"
    fu_obj = FeatureUnion(transformer_list=features.feature_list)
    train_X = fu_obj.fit_transform(train_df)
    train_y = train_df["Sales"].as_matrix()
    # Note: "paramteters" is a (consistent) typo in clf_dict's keys.
    clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                       param_grid=clf_dict[MODEL]["paramteters"],
                       n_jobs=-1, scoring=rmspe, cv=None)
    # Outer 5-fold CV around the grid search; prints the per-fold scores.
    print cross_val_score(clf, train_X, train_y, scoring=rmspe, cv=5, n_jobs=3)
def convert_traindata(train_gray_data, labels):
    """Build standardized training features and flattened per-pixel labels."""
    frame = f.make_data_df(train_gray_data, labels)
    union = FeatureUnion(transformer_list=f.feature_transformer_rule)
    scaler = preprocessing.StandardScaler()
    raw_features = union.fit_transform(frame)
    y_train = np.concatenate(frame["label"].apply(lambda arr: arr.flatten()))
    X_train = scaler.fit_transform(raw_features)
    return X_train, y_train
def get_data():
    '''
    get X, y data

    :rtype: tuple
    '''
    _, _, _, train_gray_data, _, _, labels = i_p.load_data()
    frame = f.make_data_df(train_gray_data, labels)
    union = FeatureUnion(transformer_list=f.feature_transformer_rule)
    features_ = union.fit_transform(frame)
    # Flatten each image's label array and concatenate into one target vector.
    targets = np.concatenate(frame["label"].apply(lambda arr: arr.flatten()))
    return (features_, targets)
class MuscleClassifier():
    """Naive-Bayes classifier mapping exercise text to muscle groups.

    Wraps a BernoulliNB model, a LabelEncoder for the text labels, and a
    char+word n-gram CountVectorizer union.  Python 2 code (print
    statement, text-mode pickle files).
    """

    def __init__(self, auto_load=True):
        """Initializes our MuscleClassifier.

        auto_load=True rehydrates the model, label encoder, and vectorizer
        from pickles under modules/pickled/; otherwise a fresh BernoulliNB
        is created (le/vectorizer are then only set by train()).
        """
        if auto_load:
            self.model = pickle.load(open('modules/pickled/muscle_classifier.p', 'r'))
            self.le = pickle.load(open('modules/pickled/muscle_classifier_le.p', 'r'))
            self.vectorizer = pickle.load(open('modules/pickled/muscle_classifier_vectorizer.p', 'r'))
        else:
            self.model = BernoulliNB()

    def train(self, muscle_groups, labels):
        """Vectorizes raw input and trains our classifier.

        ``muscle_groups`` are raw text samples; ``labels`` are their
        text class labels.
        """
        # Label encoder turns text labels into integer classes.
        self.le = preprocessing.LabelEncoder()
        # Character 3-8 grams plus word 1-5 grams, merged via FeatureUnion.
        char_vzr = feature_extraction.text.CountVectorizer(lowercase=True, ngram_range=(3, 8), analyzer='char', encoding='utf-8')
        word_vzr = feature_extraction.text.CountVectorizer(lowercase=True, ngram_range=(1, 5), analyzer='word', encoding='utf-8')
        self.vectorizer = FeatureUnion([('char', char_vzr), ('word', word_vzr)])
        # Transform input text and labels into numeric arrays.
        X = self.vectorizer.fit_transform(muscle_groups).toarray()
        Y = self.le.fit_transform(labels)
        # Fit, then re-run inference on the training data.
        self.model.fit(X, Y)
        y = self.model.predict(X)
        # NOTE(review): sum(y != Y)/len(Y) is the misclassification RATE,
        # not accuracy — the printed label appears inverted; confirm.
        print "Training Accuracy: %f " % (sum(y != Y) / float(len(Y)))

    def predict(self, exercises):
        """Vectorizes raw input and returns predicted muscle-group labels."""
        X = self.vectorizer.transform(exercises).toarray()
        y = self.model.predict(X)
        # Map integer predictions back to their original text labels.
        return self.le.classes_[y]
def test_same_result(self):
    """Spark FeatureUnion must mirror the local scikit-learn version."""
    X, Z = self.make_text_rdd(2)
    local_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    spark_char = SparkCountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    local_word = CountVectorizer(analyzer="word")
    spark_word = SparkCountVectorizer(analyzer="word")
    local_union = FeatureUnion([("chars", local_char), ("words", local_word)])
    spark_union = SparkFeatureUnion([("chars", spark_char),
                                     ("words", spark_word)])
    # Same feature names after fitting.
    local_union.fit(X)
    spark_union.fit(Z)
    assert_equal(local_union.get_feature_names(),
                 spark_union.get_feature_names())
    # Same transform results.
    expected = local_union.transform(X)
    got = sp.vstack(spark_union.transform(Z).collect())
    assert_array_equal(expected.toarray(), got.toarray())
    # Same fit_transform results.
    expected = local_union.fit_transform(X)
    got = sp.vstack(spark_union.fit_transform(Z).collect())
    assert_array_equal(expected.toarray(), got.toarray())
    # Same results when running with n_jobs=2.
    local_par = FeatureUnion([("chars", local_char), ("words", local_word)],
                             n_jobs=2)
    spark_par = SparkFeatureUnion([("chars", spark_char),
                                   ("words", spark_word)], n_jobs=2)
    local_par.fit(X)
    spark_par.fit(Z)
    expected = local_par.transform(X)
    got = sp.vstack(spark_par.transform(Z).collect())
    assert_array_equal(expected.toarray(), got.toarray())
def train_model(trainset):
    """Evaluate LinearSVC over word+char TF-IDF n-grams with 10-fold CV.

    ``trainset`` is an iterable of dicts with 'text' and 'label' keys.
    Prints corpus statistics and the cross-validation summary.
    Python 2 code (print statements).
    """
    # Two feature blocks, word bigrams and char 2-3 grams (TF-IDF, top
    # 2000 each); more feature blocks could be appended here in general.
    word_vector = TfidfVectorizer(analyzer="word", ngram_range=(2, 2), binary=False, max_features=2000)
    char_vector = TfidfVectorizer(ngram_range=(2, 3), analyzer="char", binary=False, min_df=0, max_features=2000)
    # Our vectors are the feature union of word/char ngrams.
    vectorizer = FeatureUnion([("chars", char_vector), ("words", word_vector)])
    corpus, classes = [], []
    for item in trainset:
        corpus.append(item['text'])
        classes.append(item['label'])
    print "num of training instances: ", len(classes)
    print "num of training classes: ", len(set(classes))
    # Fit the TF-IDF model on the corpus.
    matrix = vectorizer.fit_transform(corpus)
    print "num of features: ", len(vectorizer.get_feature_names())
    print "training model"
    # X and y are built here but the CV call below re-derives them; they
    # only serve the debug print of the first feature vector.
    X = matrix.toarray()
    y = np.asarray(classes)
    print X[0]
    # LinearSVC with L1 loss chosen after comparing SVC/KNN/AdaBoost/
    # RandomForest/LogisticRegression (~0.70 on the Law corpus, ~0.6 on
    # Enron per earlier experiments).
    model = LinearSVC(loss='l1', dual=True)
    scores = cross_validation.cross_val_score(estimator=model, X=matrix.toarray(), y=np.asarray(classes), cv=10)
    print "10-fold cross-validation results:", "mean score = ", scores.mean(), "std=", scores.std(), ", num folds =", len(scores)
def __init__(self, env, n_components=500):
    """Fit an observation scaler and an RBF feature map for ``env``."""
    samples = np.array([env.observation_space.sample() for _ in range(10000)])
    scaler = StandardScaler()
    scaler.fit(samples)
    # RBF kernels at several bandwidths cover different parts of the
    # (scaled) state space.
    bandwidths = [("rbf1", 5.0), ("rbf2", 2.0), ("rbf3", 1.0), ("rbf4", 0.5)]
    featurizer = FeatureUnion(
        [(tag, RBFSampler(gamma=g, n_components=n_components))
         for tag, g in bandwidths])
    mapped = featurizer.fit_transform(scaler.transform(samples))
    self.dimensions = mapped.shape[1]
    self.scaler = scaler
    self.featurizer = featurizer
def _build_bow(self, data_train):
    """Fit a weighted bag-of-words union over names and content.

    Returns the fitted FeatureUnion and the transformed training data.
    """
    name_vec = MultiSourceTfidfVectorizer(item_name='names',
                                          tokenizer=tokenizer_for_names,
                                          lowercase=False)
    content_vec = MultiSourceTfidfVectorizer(item_name='content',
                                             tokenizer=tokenizer,
                                             lowercase=False)
    union = FeatureUnion(
        transformer_list=[
            ('names', name_vec),
            ('content', content_vec),
        ],
        # Names carry more signal than document content, so weight higher.
        transformer_weights={
            'names': 1.0,
            'content': 0.6,
        },
    )
    transformed = union.fit_transform(data_train)
    return union, transformed
def set_validdata(df, keys):
    """Return (X, y) validation arrays for the images named in ``keys``."""
    union = FeatureUnion(transformer_list=f.feature_transformer_rule)
    scaler = preprocessing.StandardScaler()
    # Collect the rows belonging to each requested image.
    for idx, png in enumerate(keys):
        part = df[(df["pngname"] == png)]
        if idx == 0:
            valid_df = part
        else:
            valid_df = pd.concat([valid_df, part])
    valid_df = valid_df.drop("pngname", axis=1).reset_index()
    X = union.fit_transform(valid_df)
    y = np.concatenate(valid_df["label"].apply(lambda arr: arr.flatten()))
    X = scaler.fit_transform(X)
    return (X, y)
def validation_model(df, MODEL):
    """Grid-search on the non-validation rows, then score the held-out rows.

    Rows with valflag == 1 form the validation split.  ``MODEL`` is a key
    into the module-level ``clf_dict``.  Writes coefficient/importance
    CSVs and a prediction-vs-actual CSV under SUBMISSION.  Python 2 code
    (print statements).
    """
    print "... start validation"
    fu_obj = FeatureUnion(transformer_list=features.feature_list)
    # Training rows are everything not flagged for validation.
    train_df = df[(df["valflag"] != 1)]
    train_X = fu_obj.fit_transform(train_df)
    train_y = train_df["Sales"].as_matrix()
    # Note: "paramteters" is a (consistent) typo in clf_dict's keys.
    clf = GridSearchCV(estimator=clf_dict[MODEL]["clf"],
                       param_grid=clf_dict[MODEL]["paramteters"],
                       n_jobs=3, scoring=rmspe, cv=None)
    clf.fit(train_X, train_y)
    print clf.grid_scores_
    print clf.best_estimator_
    print clf.best_score_
    print clf.best_params_
    index_sr = pd.Series(get_split_feature_list(fu_obj), name="Feature")
    # Linear models expose coef_: dump per-feature coefficients.
    if hasattr(clf.best_estimator_, "coef_"):
        coef_sr = pd.Series(clf.best_estimator_.coef_, name="Coef")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "coef_%s_validation.csv" % MODEL
        coef_df.to_csv(coeffile)
    # Tree ensembles expose feature_importances_: dump those as well.
    if hasattr(clf.best_estimator_, "feature_importances_"):
        coef_sr = pd.Series(clf.best_estimator_.feature_importances_,
                            name="Importance")
        coef_df = pd.concat([index_sr, coef_sr], axis=1).set_index("Feature")
        coeffile = SUBMISSION + "importance_%s_validation.csv" % MODEL
        coef_df.to_csv(coeffile)
    # Score the held-out validation rows with the fitted feature union.
    val_df = df[(df["valflag"] == 1)]
    test_X = fu_obj.transform(val_df)
    test_y = val_df["Sales"].as_matrix()
    y_pred = clf.predict(test_X)
    pred_sr = pd.Series(y_pred, name="Sales_Pred")
    y_sr = pd.Series(test_y, name="Sales")
    # 1-based row index for the output CSV.
    res = pd.concat([pred_sr, y_sr], axis=1).rename(index=lambda x: x + 1)
    submissionfile = SUBMISSION + "submission_validation_%s.csv" % MODEL
    res.to_csv(submissionfile)
def get_features(Xdata, y=None, ncomp=2, kbest=0):
    """Feature selection using PCA or K-best variance selection.

    Parameters
    ----------
    Xdata : array-like of shape (n_samples, n_features)
        Input data.
    y : array-like, optional
        Target values, forwarded to the selectors' fit methods.
    ncomp : int
        Number of PCA components (used when > 0).
    kbest : int
        Number of features for SelectKBest (used when > 0); values larger
        than the feature count fall back to 'all'.

    Returns
    -------
    ndarray
        The selected/combined features.

    Raises
    ------
    ValueError
        If neither ``ncomp`` nor ``kbest`` is positive (previously this
        fell through and raised an obscure NameError on the unbound
        result variable).
    """
    if ncomp > 0 and kbest > 0:
        # Both requested: concatenate PCA and K-best outputs side by side.
        pca = PCA(n_components=ncomp)
        selection = SelectKBest(f_classif, k=(
            int(kbest) if int(kbest) < Xdata.shape[1] else 'all'))
        combined_features = FeatureUnion(
            [("pca", pca), ("univ_select", selection)])
        X_features = combined_features.fit_transform(Xdata, y)
    elif ncomp > 0:
        pca = PCA(n_components=ncomp)
        X_features = pca.fit_transform(Xdata, y)
    elif kbest > 0:
        selection = SelectKBest(k=int(kbest) if int(
            kbest) < Xdata.shape[1] else 'all')
        X_features = selection.fit_transform(Xdata, y)
    else:
        # Bug fix: without this branch X_features was unbound here.
        raise ValueError(
            "at least one of ncomp or kbest must be positive")
    return X_features
def __init__(self, env, n_components=500):
    """Sample observations, fit a scaler, and build an RBF featurizer."""
    obs = np.array([env.observation_space.sample() for _ in range(10000)])
    std_scaler = StandardScaler()
    std_scaler.fit(obs)
    # Several RBF bandwidths capture structure at different length scales
    # of the (scaled) observation space.
    union = FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=n_components)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=n_components)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=n_components)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=n_components)),
    ])
    sample_features = union.fit_transform(std_scaler.transform(obs))
    self.dimensions = sample_features.shape[1]
    self.scaler = std_scaler
    self.featurizer = union
def preprocessing_data(data):
    """Run numeric and categorical pipelines on ``data`` and concatenate."""
    numeric = Pipeline([
        ("numetic", DataFrameSelector(["Age", "SibSp", 'Parch', "Fare"])),
        ("imputer", SimpleImputer(strategy='median')),
    ])
    categorical = Pipeline([
        ("select_cat", DataFrameSelector(["Sex", "Pclass", "Embarked"])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", OneHotEncoder(sparse=False)),
    ])
    # Stack numeric and one-hot categorical features side by side.
    combined = FeatureUnion(transformer_list=[
        ("num_pipeline", numeric),
        ("cat_pipeline", categorical),
    ])
    return combined.fit_transform(data)
def __init__(self, env):
    """Build a scaler and RBF featurizer from synthetic state samples.

    Sampling env.observation_space directly gives poor coverage because
    velocities diverge, so states are drawn uniformly from [-1, 1]^4.
    """
    synthetic_states = np.random.random((20000, 4)) * 2 - 1
    state_scaler = StandardScaler()
    state_scaler.fit(synthetic_states)
    # RBF kernels at several bandwidths cover different parts of the space.
    widths = [("rbf1", 0.05), ("rbf2", 1.0), ("rbf3", 0.5), ("rbf4", 0.1)]
    rbf_map = FeatureUnion(
        [(tag, RBFSampler(gamma=g, n_components=1000)) for tag, g in widths])
    mapped = rbf_map.fit_transform(state_scaler.transform(synthetic_states))
    self.dimensions = mapped.shape[1]
    self.scaler = state_scaler
    self.featurizer = rbf_map
def test_sklearn_pipeline_works(lang):
    """The embedding composes with sklearn Pipeline and FeatureUnion."""
    texts = [
        "i really like this post",
        "thanks for that comment",
        "i enjoy this friendly forum",
        "this is a bad post",
        "this is a bad post",
        "i dislike this article",
        "this is not well written",
    ]
    labels = np.array([1, 1, 1, 0, 0, 0, 0])
    # As a Pipeline step feeding a classifier.
    pipe = Pipeline([("embed", lang), ("model", LogisticRegression())])
    pipe.fit(texts, labels)
    assert pipe.predict(texts).shape[0] == 7
    # As one branch of a FeatureUnion next to a sparse vectorizer.
    union = FeatureUnion([("dense", lang), ("sparse", CountVectorizer())])
    assert union.fit_transform(texts).shape[0] == 7
def test_explain_prediction_feature_union_sparse(newsgroups_train_binary):
    """explain_prediction handles a sparse FeatureUnion and highlights text."""
    docs, ys, target_names = newsgroups_train_binary
    vec = FeatureUnion([
        ('word', CountVectorizer(stop_words='english')),
        ('char', CountVectorizer(ngram_range=(3, 3))),
    ])
    clf = XGBClassifier(n_estimators=100, max_depth=2, missing=0)
    clf.fit(vec.fit_transform(docs), ys)
    res = explain_prediction(
        clf, 'computer graphics in space: a sign of atheism',
        vec=vec, target_names=target_names)
    format_as_all(res, clf)
    check_targets_scores(res)
    # The word-branch feature for "graphics" must appear among the
    # positive weights, and text spans must be highlighted.
    target_weights = res.targets[0].feature_weights
    assert 'word__graphics' in get_all_features(target_weights.pos)
    assert res.targets[0].weighted_spans
def pipeline(housing):
    """Build the full preprocessing pipeline and return the prepared matrix.

    Parameters
    ----------
    housing : pandas.DataFrame
        Raw housing data containing the text column "ocean_proximity".

    Returns
    -------
    ndarray
        Numeric + encoded-categorical feature matrix.
    """
    # Numeric attributes are every column except the text one.
    housing_num = housing.drop("ocean_proximity", axis=1)
    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]
    num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
    # NOTE(review): LabelBinarizer inside a Pipeline breaks on
    # scikit-learn >= 0.19 (its fit_transform accepts only y); consider
    # OneHotEncoder / OrdinalEncoder if this fails — confirm sklearn version.
    cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('label_binarizer', LabelBinarizer()),
    ])
    full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
    housing_prepared = full_pipeline.fit_transform(housing)
    # Bug fix: the prepared matrix was computed but never returned,
    # so callers always received None.
    return housing_prepared
def __init__(self, env):
    """Standardize sampled observations and fit an RBF feature union."""
    obs_samples = np.array(
        [env.observation_space.sample() for _ in range(10000)])
    # Zero-mean / unit-variance scaling of the raw observations.
    obs_scaler = StandardScaler()
    obs_scaler.fit(obs_samples)
    # RBF kernels with different variances cover different parts of the
    # state space; 500 components per kernel.
    bandwidths = [('rbf1', 5.0), ('rbf2', 2.0), ('rbf3', 1.0), ('rbf4', 0.5)]
    rbf_union = FeatureUnion(
        [(tag, RBFSampler(gamma=g, n_components=500))
         for tag, g in bandwidths])
    mapped = rbf_union.fit_transform(obs_scaler.transform(obs_samples))
    self.dimensions = mapped.shape[1]
    self.scaler = obs_scaler
    self.featurizer = rbf_union
def __init__(self, env):
    """Build scaler + RBF featurizer from synthetic state samples.

    Direct env sampling is avoided (velocities diverge), so states are
    drawn uniformly from [(-1, 1), (-1, 1), ...] instead.
    """
    synthetic_states = np.random.random((20000, 4)) * 2 - 1
    state_scaler = StandardScaler()
    state_scaler.fit(synthetic_states)
    widths = [("rbf1", 0.05), ("rbf2", 0.1), ("rbf3", 0.5), ("rbf4", 1.0)]
    rbf_map = FeatureUnion(
        [(tag, RBFSampler(gamma=g, n_components=1000)) for tag, g in widths])
    mapped = rbf_map.fit_transform(state_scaler.transform(synthetic_states))
    self.dimensions = mapped.shape[1]  # 4 samplers x 1000 components
    self.scaler = state_scaler
    self.featurizer = rbf_map
def __init__(self, env, n_components=500):
    """Fit an observation scaler and a concatenated RBF feature map."""
    raw_samples = np.array(
        [env.observation_space.sample() for _ in range(10000)],
        dtype=np.float64)
    sample_scaler = StandardScaler()
    sample_scaler.fit(raw_samples)
    # The union concatenates columns: output is (10000, n_components * 4).
    gammas = [("rbf1", 5.0), ("rbf2", 2.0), ("rbf3", 1.0), ("rbf4", 0.5)]
    feature_map = FeatureUnion(
        [(tag, RBFSampler(gamma=g, n_components=n_components))
         for tag, g in gammas])
    mapped = feature_map.fit_transform(sample_scaler.transform(raw_samples))
    # Note the singular attribute name used by this class.
    self.dimension = mapped.shape[1]
    self.scaler = sample_scaler
    self.featurizer = feature_map
def __init__(self, env):
    """Standardize sampled observations and expand them with RBF kernels."""
    samples = np.array(
        [env.observation_space.sample() for _ in range(10000)])

    scaler = StandardScaler()
    scaler.fit(samples)

    # Mixed kernel widths cover both coarse and fine structure.
    featurizer = FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=500)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=500)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=500)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=500)),
    ])
    transformed = featurizer.fit_transform(scaler.transform(samples))

    self.dimensions = transformed.shape[1]
    self.scaler = scaler
    self.featurizer = featurizer
def __init__(self, env):
    """Featurizer for a 4-d state sampled uniformly from [-1, 1)^4.

    env.observation_space.sample() is avoided here: sampled states are
    poor because velocities can run off to infinity.
    """
    state_samples = np.random.random((20000, 4)) * 2 - 1

    scaler = StandardScaler()
    scaler.fit(state_samples)

    # Convert a state to a featurized representation using RBF kernels
    # with different variances to cover different parts of the space.
    featurizer = FeatureUnion([
        ("rbf1", RBFSampler(gamma=0.05, n_components=1000)),
        ("rbf2", RBFSampler(gamma=1.0, n_components=1000)),
        ("rbf3", RBFSampler(gamma=0.5, n_components=1000)),
        ("rbf4", RBFSampler(gamma=0.1, n_components=1000)),
    ])
    transformed = featurizer.fit_transform(scaler.transform(state_samples))

    self.dimensions = transformed.shape[1]
    self.scaler = scaler
    self.featurizer = featurizer
def __init__(self, n_components=500):
    """Featurizer for a 1-d state roughly in [-8, 8]."""
    # Uniform samples over [-8, 8) shaped (20000, 1).
    samples = 8 * (np.random.random((20000, 1)) * 2 - 1)

    scaler = StandardScaler()
    scaler.fit(samples)

    # RBF kernels with different variances to cover different parts
    # of the space; outputs are concatenated column-wise.
    featurizer = FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=n_components)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=n_components)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=n_components)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=n_components)),
    ])
    transformed = featurizer.fit_transform(scaler.transform(samples))

    self.dimensions = transformed.shape[1]
    self.scaler = scaler
    self.featurizer = featurizer
class Featurizer:
    """Bundle of text feature extractors combined with a FeatureUnion.

    To add new features, append another pipeline to the union; each
    pipeline selects the 'text' field via ItemSelector first.
    """

    def __init__(self):
        self.all_features = FeatureUnion([
            # Raw text length.
            ('text_stats', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('text_length', TextLengthTransformer()),
            ])),
            # Sentence counts.
            ('sentence_stats', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('num_sentence', NumSentenceTransformer()),
            ])),
            # Pattern counts (caused a minor decrease locally but could
            # perform well on unseen data).
            ('pattern_stats', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('num_patterns', PatternTransformer()),
                ('vect', DictVectorizer()),
            ])),
            # TF-IDF over frequent terms only.
            ('frequency_stats', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('tfidf', TfidfVectorizer(min_df=50)),
            ])),
            # Uni- to tri-gram counts.
            ('bigrams_stats', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('bigrams', CountVectorizer(ngram_range=(1, 3))),
            ])),
        ])

    def train_feature(self, examples):
        """Fit the union on *examples* and return the feature matrix."""
        return self.all_features.fit_transform(examples)

    def test_feature(self, examples):
        """Transform *examples* with the already-fitted union."""
        return self.all_features.transform(examples)
def preprocess_features(features_to_process, test_set=False):
    """Preprocess a features DataFrame for modeling.

    Drops unused columns, standard-scales the numerical features and
    one-hot encodes the categorical ones (the latter only when
    ``parsed_args.different_values`` is falsy).

    Note: ``test_set`` is currently unused; kept for interface
    compatibility with existing callers.

    Returns a DataFrame of prepared features indexed from 1.
    """
    # BUG FIX: the original dropped columns from a global ``features``
    # object instead of the DataFrame actually passed in.
    features_to_process.drop(['Date', 'Team1_Ball_Possession(%)'],
                             axis=1, inplace=True)

    # Separate categorical columns from numerical columns.
    categorical_features_list = [
        'Location', 'Phase', 'Team1', 'Team2', 'Team1_Continent',
        'Team2_Continent', 'Normal_Time'
    ]
    numerical_features = features_to_process.drop(
        categorical_features_list, axis=1, inplace=False)
    categorical_features = features_to_process[
        categorical_features_list].copy()

    # Numerical branch: select, impute medians, standard-scale.
    numerical_pipeline = Pipeline([
        ('selector', DataFrameSelector(list(numerical_features))),
        ('imputer', Imputer(strategy='median')),
        ('std_scaler', StandardScaler())
    ])
    # Categorical branch: select, one-hot encode (dropping invariants).
    category_pipeline = Pipeline([
        ('selector', DataFrameSelector(list(categorical_features))),
        ('cat_encoder', cs.OneHotEncoder(drop_invariant=True))
    ])

    # ``parsed_args`` is a module-level namespace; when set, only the
    # numerical branch is used.
    if parsed_args.different_values:
        full_pipeline = FeatureUnion(transformer_list=[
            ('num_pipeline', numerical_pipeline),
        ])
    else:
        full_pipeline = FeatureUnion(transformer_list=[
            ('num_pipeline', numerical_pipeline),
            ('cat_pipeline', category_pipeline),
        ])

    prepared_features = pd.DataFrame(
        data=full_pipeline.fit_transform(features_to_process),
        index=np.arange(1, features_to_process.shape[0] + 1))
    return prepared_features
def get_relevent_questions(input_text):
    """Rank stored questions by relevance to *input_text*.

    Builds one training document per Question (question content plus all
    of its answers), vectorizes with TF-IDF + counts, fits a linear SVM
    over question ids, and returns the 6 question ids with the lowest
    predicted probability ordering for the input text.
    """
    questions = Question.objects.all()
    train_data = []
    train_labels = []
    # One label (the question id) and one concatenated document per question.
    for question in questions:
        train_labels = train_labels + [question.id]
        answers = Answers.objects.filter(question=question.id)
        text = question.content
        for obj in answers:
            text = text + " " + obj.answer
        train_data = train_data + [text]
    Stop = stopwords.words('english')
    # NOTE(review): this overwrites the concatenated documents built above
    # with cleaning_text(questions, Stop) -- presumably cleaning_text
    # rebuilds equivalent documents from the queryset; confirm, otherwise
    # the loop above is dead code.
    train_data = cleaning_text(questions, Stop)
    # Create feature vectors.
    ### ---- Put here all features
    dict_vect = HashingVectorizer()  # currently unused (see list_comb)
    vectorizer = TfidfVectorizer(min_df=1, max_df=0.9)
    count_vect = CountVectorizer()
    #list_comb = [('tf', vectorizer), ('cnt',count_vect), ('hs',dict_vect) ]
    list_comb = [('tf', vectorizer), ('cnt', count_vect)]
    combined_f = FeatureUnion(list_comb)
    X_train2 = combined_f.fit_transform(train_data)
    # ---------------
    # Linear SVM with probability estimates enabled for ranking.
    model_svm = svm.SVC(kernel='linear', probability=True)
    model_svm.fit(X_train2, train_labels)
    # -----
    X_test2 = combined_f.transform([input_text])
    proba = model_svm.predict_proba(X_test2)
    # Order class ids by predicted probability for the single input row.
    predict_proba = np.array(model_svm.classes_)
    proba = np.array(proba)
    inds = proba.argsort()
    predict_proba = predict_proba[inds].tolist()
    #print (classification_report(test_labels, prediction))
    # In[24]:
    return predict_proba[0][:6]
def __init__(self, env, n_components=100):
    """Sample the observation space, fit a scaler, and tune RBF samplers."""
    # 10k samples from the observation space to tune the RBFs.
    samples = np.array(
        [env.observation_space.sample() for _ in range(10000)])

    # Define the scaler.
    scaler = StandardScaler()
    scaler.fit(samples)

    # RBF samplers of several widths transform the observation space.
    featurizer = FeatureUnion([
        ('rbf1', RBFSampler(gamma=5.0, n_components=n_components)),
        ('rbf2', RBFSampler(gamma=2.0, n_components=n_components)),
        ('rbf3', RBFSampler(gamma=1.0, n_components=n_components)),
        ('rbf4', RBFSampler(gamma=0.5, n_components=n_components)),
    ])
    # Fit once on scaled samples; the output width gives the feature
    # dimensionality.
    transformed = featurizer.fit_transform(scaler.transform(samples))

    self.dimensions = transformed.shape[1]
    self.scaler = scaler
    self.featurizer = featurizer
class FeatureTransformer:
    """Scales observations and expands them with stacked RBF kernels."""

    def __init__(self, env, n_components=1000):
        samples = np.array(
            [env.observation_space.sample() for _ in range(10000)])

        self._sc = StandardScaler()
        self._sc.fit(samples)

        self._featurizer = FeatureUnion([
            ('rbf1', RBFSampler(gamma=5.0, n_components=n_components)),
            ('rbf2', RBFSampler(gamma=2.0, n_components=n_components)),
            ('rbf3', RBFSampler(gamma=1.0, n_components=n_components)),
            ('rbf4', RBFSampler(gamma=.5, n_components=n_components)),
        ])
        # Fit on the scaled samples and record the resulting width.
        fitted = self._featurizer.fit_transform(
            self._sc.transform(samples))
        self.dimensions = fitted.shape[1]

    def transform(self, observations):
        """Scale then featurize a batch of observations."""
        return self._featurizer.transform(self._sc.transform(observations))
def __init__(self, env: TimeLimit):
    """Featurizer for CartPole states sampled uniformly over their ranges."""
    # One uniform column per state dimension: position in [-2.4, 2.4),
    # velocity in [-2, 2), angle in [-0.4, 0.4), angular velocity in [-4, 4).
    columns = [
        np.random.random((20000, 1)) * 4.8 - 2.4,
        np.random.random((20000, 1)) * 4.0 - 2.0,
        np.random.random((20000, 1)) * 0.8 - 0.4,
        np.random.random((20000, 1)) * 8.0 - 4.0,
    ]
    samples = np.hstack(columns)

    scaler = StandardScaler()
    scaler.fit(samples)

    featurizer = FeatureUnion([
        ('rbf1', RBFSampler(gamma=0.05, n_components=1000)),
        ('rbf2', RBFSampler(gamma=1.0, n_components=1000)),
        ('rbf3', RBFSampler(gamma=0.5, n_components=1000)),
        ('rbf4', RBFSampler(gamma=0.1, n_components=1000)),
    ])
    transformed = featurizer.fit_transform(scaler.transform(samples))

    self.scaler = scaler
    self.featurizer = featurizer
    self.dim = transformed.shape[1]
def __init__(self, env):
    """Featurizer for CartPole: one RBF block per state component."""
    obs_examples = np.random.random((20000, 4))
    print(obs_examples.shape)

    scaler = StandardScaler()
    scaler.fit(obs_examples)

    # Convert a state to a featurized representation using RBF kernels
    # with different variances to cover different parts of the space.
    featurizer = FeatureUnion([
        ("cart_position", RBFSampler(gamma=0.02, n_components=500)),
        ("cart_velocity", RBFSampler(gamma=1.0, n_components=500)),
        ("pole_angle", RBFSampler(gamma=0.5, n_components=500)),
        ("pole_velocity", RBFSampler(gamma=0.1, n_components=500)),
    ])
    transformed = featurizer.fit_transform(scaler.transform(obs_examples))
    print(transformed.shape)

    self.dimensions = transformed.shape[1]
    self.scaler = scaler
    self.featurizer = featurizer
def refine_data(data=None, train_flag=True):
    """Prepare housing data for modeling.

    Stratified-samples by median income, splits off labels, then runs a
    numerical pipeline (select, impute medians, add derived attributes,
    scale) and a categorical pipeline (select, one-hot encode) combined
    with a FeatureUnion.

    Parameters:
        data: housing DataFrame; loaded via load_housing_data() when None.
        train_flag: forwarded to strat_data_by_median_housing.

    Returns:
        (prepared_feature_matrix, labels)
    """
    from sklearn.pipeline import Pipeline, FeatureUnion
    from sklearn.preprocessing import Imputer, StandardScaler
    from Encoder import DataFrameSelector, CategoricalEncoder

    # BUG FIX: the original default ``data=load_housing_data()`` was
    # evaluated once at function-definition time, loading the dataset on
    # import and sharing a single mutable DataFrame across all calls.
    # Load lazily per call instead.
    if data is None:
        data = load_housing_data()

    housing = make_sampling_by_median_income(housing=data)
    housing, housing_label = strat_data_by_median_housing(
        housing=housing, train_flag=train_flag)

    # ocean_proximity is the only non-numeric feature.
    housing_num = housing.drop("ocean_proximity", axis=1)
    num_attribs = list(housing_num)
    cat_attribs = ["ocean_proximity"]

    # Pipeline for numerical feature data.
    num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
    # Pipeline for categorical feature data.
    cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('cat_encoder', CategoricalEncoder(encoding="onehot-dense")),
    ])

    # Combine both pipelines column-wise.
    full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
    housing_prepared = full_pipeline.fit_transform(housing)
    return housing_prepared, housing_label
def prepare_data(data):
    """Prepare the data for a machine-learning algorithm.

    Runs feature engineering, selects the relevant numerical and
    categorical features, imputes numerical means, scales, and one-hot
    encodes the textual columns; the two branches are concatenated with
    a FeatureUnion.
    """
    num_attrs = [
        'BMI',
        'AGE_AT_ADMIT',
        'Gender',
        'Female',
        'PreOpNarcotic',
        'PreOpInsulin',
        'PreOpDMMeds',
        'PreOpBloodThinner',
        'degre_dx',
        'med_cond',
    ]
    cat_attrs = ['RawDx', 'Side']

    # Numerical branch: engineer, select, mean-impute, standard-scale.
    num_pipeline = Pipeline([
        ('feature_engineering', PreProcessing()),
        ('selector', DataFrameSelector(num_attrs)),
        ('imputer', Imputer(strategy="mean")),
        ('std_scaler', StandardScaler()),
    ])
    # Categorical branch: select, dense one-hot encode.
    cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attrs)),
        ('cat_encoder', CategoricalEncoder(encoding="onehot-dense")),
    ])

    # Union of both branches, fitted and applied in one step.
    full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
    return full_pipeline.fit_transform(data)
def convert_df_to_X_data(df, categorical_variables, numeric_variables):
    """Vectorize a DataFrame for sklearn.

    Categorical columns are stringified and one-hot encoded via
    DictVectorizer; numeric columns are standard-scaled. Returns
    (feature_matrix, fitted_pipeline).
    """

    class DataFrame_Col_Selector(BaseEstimator, TransformerMixin):
        # Selects a fixed subset of columns from the incoming DataFrame.
        def __init__(self, cols):
            self.cols = cols

        def fit(self, x, y=None):
            return self

        def transform(self, df):
            return df[self.cols]

    class DataFrame_Categorical_Converter(BaseEstimator, TransformerMixin):
        # Casts every column to str and emits records for DictVectorizer.
        def fit(self, x, y=None):
            return self

        def transform(self, df):
            converted = []
            for col in df.columns:
                as_str = df[col].astype(str)
                as_str.name = col + '_str'
                converted.append(as_str)
            frame = pandas.concat(converted, axis=1)
            # Convert data to dicts to pass to sklearn DictVectorizer.
            return frame.to_dict(orient='records')

    pipeline = FeatureUnion([
        ('categorical_pipeline', Pipeline([
            ('col_selector', DataFrame_Col_Selector(categorical_variables)),
            ('dict_converter', DataFrame_Categorical_Converter()),
            ('dict_vectorizer', DictVectorizer()),
        ])),
        ('numeric_pipeline', Pipeline([
            ('col_selector', DataFrame_Col_Selector(numeric_variables)),
            ('standard_scaler', StandardScaler(with_mean=True)),
        ])),
    ])
    return pipeline.fit_transform(df), pipeline
def helpfulModelingPipelineLR():
    """Grid-search a PCA + SelectKBest FeatureUnion feeding a logistic
    regression on pickled comment data, then pickle the best estimator.

    NOTE: Python 2 syntax (bare print statements).
    """
    # Load the feature frame and the discussion frame from pickles.
    print "Loading pickle..."
    #comments_discussion_df=pd.read_pickle('comments_discussion.p')
    X = pd.read_pickle('X_type.p')
    comments_discussion_df = pd.read_pickle('comments_type_discussion.p')
    # Target: the "code_numbers" column, kept as a 1-column frame, as int.
    y_actual = comments_discussion_df.iloc[:, comments_discussion_df.columns.get_loc("code_numbers"):comments_discussion_df.columns.get_loc("code_numbers") + 1].astype(int)
    # Scale the two count features in place before feature selection.
    X.loc[:, ['users_count', 'comments_count']] = preprocessing.scale(X.loc[:, ['users_count', 'comments_count']])
    X_train, X_test, y_actual_train, y_actual_test = train_test_split(X, y_actual, test_size=0.3, random_state=0)
    print y_actual_train.head()
    pca = PCA(n_components=2)
    selection = SelectKBest(k=1)
    # Build estimator from PCA and univariate selection.
    combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
    # Use combined features to transform the dataset.
    # NOTE(review): X_features is never used afterwards -- GridSearchCV
    # refits the union inside the pipeline anyway.
    X_features = combined_features.fit_transform(X_train, y_actual_train)
    lr = LogisticRegression(C=1e4)
    # Grid search over k, n_components and C.
    pipeline = Pipeline([("features", combined_features), ("lr", lr)])
    param_grid = dict(features__pca__n_components=[1, 2, 3, 4, 5, 6, 7],
                      features__univ_select__k=[1, 2, 3, 4, 5, 6, 7],
                      lr__C=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 500, 1000, 1e4, 1e5, 1e6])
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, scoring='accuracy', verbose=10)
    grid_search.fit(X_train, y_actual_train['code_numbers'].values)
    print(grid_search.best_estimator_)
    # Persist the labels and the best estimator for later use.
    y_actual.to_pickle('y_actual_type.p')
    pickle.dump(grid_search.best_estimator_, open("lr_best_estimator.p", "wb"))
def test_feature_union_parallel():
    # n_jobs must not change FeatureUnion results.
    X = JUNK_FOOD_DOCS

    def make_union(**kwargs):
        # Fresh vectorizer instances each time so unions don't share state.
        return FeatureUnion([
            ("words", CountVectorizer(analyzer='word')),
            ("chars", CountVectorizer(analyzer='char')),
        ], **kwargs)

    fs = make_union()
    fs_parallel = make_union(n_jobs=2)
    fs_parallel2 = make_union(n_jobs=2)

    fs.fit(X)
    X_transformed = fs.transform(X)
    assert_equal(X_transformed.shape[0], len(X))

    fs_parallel.fit(X)
    X_transformed_parallel = fs_parallel.transform(X)
    assert_equal(X_transformed.shape, X_transformed_parallel.shape)
    assert_array_equal(X_transformed.toarray(),
                       X_transformed_parallel.toarray())

    # fit_transform should behave the same.
    X_transformed_parallel2 = fs_parallel2.fit_transform(X)
    assert_array_equal(X_transformed.toarray(),
                       X_transformed_parallel2.toarray())

    # Transformers should stay fit after fit_transform.
    X_transformed_parallel2 = fs_parallel2.transform(X)
    assert_array_equal(X_transformed.toarray(),
                       X_transformed_parallel2.toarray())
def preprocess_and_extract(dataset):
    """Preprocess *dataset* and extract text/count features.

    Returns (X, y, fitted_vectorizer) where y = log1p(price).
    """
    y_dataset = np.log1p(dataset['price'])
    # FIX: pass axis by keyword; positional ``axis`` in DataFrame.drop is
    # deprecated in modern pandas.
    dataset = dataset.drop('price', axis=1)

    # Slight preprocessing.
    dataset = preprocess(dataset)

    # Feature extraction.
    default_preprocessor = CountVectorizer().build_preprocessor()

    def build_preprocessor(field):
        # The vectorizer receives raw ``.values`` rows, so map the column
        # name to its positional index once.
        field_idx = list(dataset.columns).index(field)
        # This string casting here is a bad idea, but for the time being
        # produces a working thing. Vectorizers need to be thought through
        # so the test set can be used with the train-fitted vectorizer.
        return lambda x: default_preprocessor(str(x[field_idx]))

    # FIX: raw strings for the token patterns -- '\d' in an ordinary
    # string literal is an invalid escape sequence (SyntaxWarning from
    # Python 3.12); the regex itself is unchanged.
    vectorizer = FeatureUnion([
        ('name', CountVectorizer(
            ngram_range=(1, 2),
            max_features=50000,
            preprocessor=build_preprocessor('name'))),
        ('category_name', CountVectorizer(
            token_pattern='.+',
            preprocessor=build_preprocessor('category_name'))),
        ('brand_name', CountVectorizer(
            token_pattern='.+',
            preprocessor=build_preprocessor('brand_name'))),
        ('shipping', CountVectorizer(
            token_pattern=r'\d+',
            preprocessor=build_preprocessor('shipping'))),
        ('item_condition_id', CountVectorizer(
            token_pattern=r'\d+',
            preprocessor=build_preprocessor('item_condition_id'))),
        ('item_description', TfidfVectorizer(
            ngram_range=(1, 3),
            max_features=100000,
            preprocessor=build_preprocessor('item_description'))),
    ])
    X_dataset = vectorizer.fit_transform(dataset.values)
    return X_dataset, y_dataset, vectorizer
def test_models(X_train, y_train, X_test, y_test, models):
    """Vectorize the text, fit each model, and report accuracy/timings.

    Returns (trained_models, fitted_vectorizer).
    """
    vectorizer = FeatureUnion([('tfidf_vect', TfidfVectorizer())])
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    trained_models = {}
    for model_name, model in models.items():
        fit_start = time.time()
        model.fit(X_train, y_train)
        fit_done = time.time()
        predicted_y = model.predict(X_test)
        predict_done = time.time()
        # Report accuracy along with fit and predict durations.
        calculate_accuracy(y_test, predicted_y, model_name,
                           fit_done - fit_start, predict_done - fit_done)
        trained_models[model_name] = model

    return (trained_models, vectorizer)
def __init__(self, env, n_components=500):
    """Fit a scaler and RBF featurizer from sampled observations."""
    samples = [env.observation_space.sample() for _ in range(10000)]

    scaler = StandardScaler()
    scaler.fit(samples)

    # Converts a state to a featurized representation: RBF kernels with
    # different variances cover different parts of the space.
    featurizer = FeatureUnion([
        ('rbf1', RBFSampler(gamma=5.0, n_components=n_components)),
        ('rbf2', RBFSampler(gamma=2.0, n_components=n_components)),
        ('rbf3', RBFSampler(gamma=1.0, n_components=n_components)),
        ('rbf4', RBFSampler(gamma=0.5, n_components=n_components)),
    ])
    featurized = featurizer.fit_transform(scaler.transform(samples))

    self.scaler = scaler
    self.featurizer = featurizer
    self.dimension = featurized.shape[1]
def my_transform(data, label, degree, FEATURES=FEATURES):
    """Scale and polynomial-expand FEATURES; append the raw *label* column.

    Parameters:
        data: DataFrame containing FEATURES and the label column.
        label: name of the column passed through untransformed.
        degree: polynomial expansion degree (no bias column).
        FEATURES: numeric feature column names (module default captured
            at definition time).
    """
    # Numeric branch: select, standard-scale, polynomial-expand.
    num_pipeline = Pipeline([
        ('selector', DataFrameSelector(FEATURES)),
        ('std_scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=degree, include_bias=False)),
    ])
    # Label branch: passed through selection only.
    cat_pipeline = Pipeline([
        ('selector', DataFrameSelector([label])),
    ])
    full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
    return full_pipeline.fit_transform(data)
def test_same_result(self):
    """Local and Spark FeatureUnions must agree on names and output."""
    X, Z = self.make_text_rdd(2)

    local_char = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    spark_char = SparkCountVectorizer(analyzer="char_wb", ngram_range=(3, 3))
    local_word = CountVectorizer(analyzer="word")
    spark_word = SparkCountVectorizer(analyzer="word")

    local_union = FeatureUnion([("chars", local_char),
                                ("words", local_word)])
    spark_union = SparkFeatureUnion([("chars", spark_char),
                                     ("words", spark_word)])

    # Same feature names after fitting.
    local_union.fit(X)
    spark_union.fit(Z)
    assert_equal(local_union.get_feature_names(),
                 spark_union.get_feature_names())

    # Same results via transform.
    X_transformed = local_union.transform(X)
    Z_transformed = sp.vstack(spark_union.transform(Z).collect())
    assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())

    # Same results via fit_transform.
    X_transformed = local_union.fit_transform(X)
    Z_transformed = sp.vstack(spark_union.fit_transform(Z).collect())
    assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())

    # Same results when running in parallel.
    local_union_par = FeatureUnion([("chars", local_char),
                                    ("words", local_word)], n_jobs=2)
    spark_union_par = SparkFeatureUnion([("chars", spark_char),
                                         ("words", spark_word)], n_jobs=2)
    local_union_par.fit(X)
    spark_union_par.fit(Z)
    X_transformed = local_union_par.transform(X)
    Z_transformed = sp.vstack(spark_union_par.transform(Z).collect())
    assert_array_equal(X_transformed.toarray(), Z_transformed.toarray())
def dump_train():
    """Build features for the training images, downsample, dump to CSV."""
    _, _, _, train_gray_data, test_gray_data, _, labels = i_p.load_data()

    train_df = f.make_data_df(train_gray_data, labels).reset_index()
    test_df = f.make_test_df(test_gray_data).reset_index()
    train_df.columns = ["pngname", "input", "label"]
    test_df.columns = ["pngname", "input"]

    fu = FeatureUnion(transformer_list=f.feature_transformer_rule)
    # Strip the "<transformer>__" prefix from each generated feature name,
    # then add the target column.
    feature_name_list = [name.split("__")[1]
                         for name in fu.get_feature_names()]
    feature_name_list.append("target")

    train_X = fu.fit_transform(train_df)
    # Flatten every per-image label array into one long target vector.
    train_y = np.concatenate(train_df["label"].apply(lambda x: x.flatten()))
    train_X, train_y = cl.downsampling_data(train_X, train_y, 0.2)

    train_dump = pd.DataFrame(np.c_[train_X, train_y],
                              columns=feature_name_list)
    dump_path = os.path.abspath(os.path.dirname(__file__)) +\
        "/../tmp/train_dump"
    train_dump.to_csv(dump_path + "/train_dump.csv", index=False)
def __init__(self, env):
    """Featurizer built from uniform samples.

    We are not sampling from the observation space but from a uniform
    distribution, as the state space extends to infinity.
    """
    samples = np.random.random((20000, 4)) * 2 - 2

    scalar = StandardScaler()
    scalar.fit(samples)

    featurizer = FeatureUnion([
        ('rbf1', RBFSampler(gamma=0.05, n_components=1000)),
        ('rbf2', RBFSampler(gamma=1.0, n_components=1000)),
        ('rbf3', RBFSampler(gamma=0.5, n_components=1000)),
        ('rbf4', RBFSampler(gamma=0.1, n_components=1000)),
    ])
    transformed = featurizer.fit_transform(scalar.transform(samples))

    self.dimension = transformed.shape[1]
    self.scalar = scalar
    self.featurizer = featurizer
class Featurizer:
    """Union of text feature extractors over the 'text' field.

    To add new features, append another pipeline to the union; each
    pipeline selects the plaintext via ItemSelector first.
    """

    def __init__(self):
        self.all_features = FeatureUnion([
            # Raw text length.
            ('text_stats', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('text_length', TextLengthTransformer()),
            ])),
            # N-gram features.
            ('ngrams', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('n_grmas', NGrams()),
            ])),
            # TF-IDF weighting.
            ('tfidf', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('tfidf', Tfidf()),
            ])),
            # Suffix features.
            ('suffix', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('suffix', Suffixes()),
            ])),
            # Positive-word counts.
            ('pos_words', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('positive_words', pos_words()),
            ])),
            # Negative-word counts.
            ('neg_words', Pipeline([
                ('selector', ItemSelector(key='text')),
                ('negative_words', neg_words()),
            ])),
        ])

    def train_feature(self, examples):
        """Fit the union on *examples* and return the feature matrix."""
        return self.all_features.fit_transform(examples)

    def test_feature(self, examples):
        """Transform *examples* with the already-fitted union."""
        return self.all_features.transform(examples)
def main():
    """Demo: route two iris columns through a FeatureUnion of pipelines."""
    raw_data = load_iris()
    data = pd.DataFrame(raw_data["data"], columns=raw_data["feature_names"])
    # Each branch selects one column; placeholders mark where further
    # transformations would go.
    pipeline = FeatureUnion([
        (
            "1",
            make_pipeline(
                FunctionTransformer(lambda X: X.loc[:, ["sepal length (cm)"]]),
                # other transformations
            )),
        (
            "2",
            make_pipeline(
                FunctionTransformer(lambda X: X.loc[:, ["sepal width (cm)"]]),
                # other transformations
            ))
    ])
    X = pipeline.fit_transform(data)
    # NOTE(review): FeatureUnion normally hstacks branch outputs into a
    # numpy array, in which case the string-column indexing below would
    # fail; this presumably relies on pandas-output transformers --
    # confirm against the sklearn version in use.
    print(X["sepal length (cm)"].mean())
    print(X["sepal width (cm)"].mean())
class FeatureTransformer:
    """Standard-scales 4-d observations and expands them with RBF features."""

    def __init__(self):
        # Normally env.observation_space.sample() would be used, if all
        # samples were equally likely; use uniform [-1, 1)^4 instead.
        observation_samples = np.random.random((20000, 4)) * 2 - 1

        self.sc = StandardScaler()
        self.sc.fit(observation_samples)

        self.feature_union = FeatureUnion([
            ("rbf0", RBFSampler(gamma=0.05, n_components=1000)),
            ("rbf1", RBFSampler(gamma=0.1, n_components=1000)),
            ("rbf2", RBFSampler(gamma=0.5, n_components=1000)),
            ("rbf3", RBFSampler(gamma=1, n_components=1000)),
        ])
        examples = self.feature_union.fit_transform(
            self.sc.transform(observation_samples))
        self.dimensions = examples.shape[1]
        print("feature example: ", examples.shape)

    def transform(self, observations):
        """Scale then featurize a batch of observations."""
        scaled = self.sc.transform(observations)
        return self.feature_union.transform(scaled)
def dataprepare(data):
    """Prepare housing features: impute/derive/scale the numeric columns
    and one-hot encode ocean_proximity, concatenated via FeatureUnion."""
    cat_attribs = ["ocean_proximity"]
    # Every remaining column is treated as numeric.
    num_attribs = list(data.drop(cat_attribs, axis=1))

    # Numeric branch: select, median-impute, add derived attributes, scale.
    num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
    # Categorical branch: select, one-hot encode.
    cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('cat_encoder', OneHotEncoder()),
    ])

    full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
    return full_pipeline.fit_transform(data)
('item_condition_id', CountVectorizer( token_pattern='\d+', preprocessor=build_preprocessor('item_condition_id'))), ('desc_len', CountVectorizer( token_pattern='\d+', preprocessor=build_preprocessor('desc_len'))), ('name_len', CountVectorizer( token_pattern='\d+', preprocessor=build_preprocessor('name_len'))), ('item_description', TfidfVectorizer( ngram_range=(1, 3), max_features=100000, preprocessor=build_preprocessor('item_description'))), ]) X = vectorizer.fit_transform(full_df.values) X_train = X[:n_trains] Y_train = train_df.target.values.reshape(-1, 1) X_dev = X[n_trains:n_trains+n_devs] Y_dev = dev_df.target.values.reshape(-1, 1) X_test = X[n_trains+n_devs:] print(X.shape, X_train.shape, X_dev.shape, X_test.shape) print("Fitting Ridge model on training examples...") ridge_model = Ridge( solver='auto', fit_intercept=True, alpha=1.0, max_iter=100, normalize=False, tol=0.05, random_state = 1, )