def test_sparse_to_dense():
    todense = DenseTransformer()
    tfidf = TfidfTransformer()
    X_t = tfidf.fit_transform([[1, 2, 3]])
    assert issparse(X_t)
    X_dense = todense.transform(X_t)
    expect = np.array([[0.26726124, 0.53452248, 0.80178373]])
    assert np.allclose(X_dense, expect)
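The test above shows the core job of DenseTransformer: TfidfTransformer emits a SciPy sparse matrix, and the transformer converts it into a plain NumPy array for estimators that cannot consume sparse input. A rough stand-in built from scikit-learn's FunctionTransformer (the same lambda trick appears commented out in Example #15 below); the helper name to_dense is my own:

import numpy as np
from scipy.sparse import issparse
from sklearn.preprocessing import FunctionTransformer

# Densify sparse input; pass dense input through untouched.
to_dense = FunctionTransformer(
    lambda X: X.toarray() if issparse(X) else np.asarray(X),
    accept_sparse=True)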
Example #3
def getProcessing(self, config):
    # Map a numeric config code to a named preprocessing step.
    mapping = {
        0: Normalizer(),
        1: OneHotEncoder(),
        2: DenseTransformer()
    }
    return ('pre_' + str(config), mapping[config])
Example #4
def test_pipeline():
    rf = RandomForestClassifier(n_estimators=10)
    param_grid = [{'randomforestclassifier__n_estimators': [1, 5, 10]}]
    pipe = make_pipeline(StandardScaler(), DenseTransformer(), rf)
    if Version(sklearn_version) < Version("0.24.1"):
        grid = GridSearchCV(pipe, param_grid, cv=3, n_jobs=1, iid=False)
    else:
        grid = GridSearchCV(pipe, param_grid, cv=3, n_jobs=1)
    grid.fit(X, y)
Example #5
def stacking_classifier(classifiers, random_state=42):
    sclf = StackingCVClassifier(classifiers=[c[1] for c in classifiers], 
                                meta_classifier=LogisticRegression(solver='lbfgs', multi_class='auto', random_state=random_state),
                                use_features_in_secondary=True)

    return Pipeline([
        ('vect', init_vectorizer()),
        ('denser', DenseTransformer()),  # StackingCVClassifier does not accept sparse matrices
        ('sclf', sclf)
    ])
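A hedged usage sketch; the classifier list below is an assumption standing in for whatever the surrounding project passes in, and init_vectorizer is assumed to turn raw text into features inside the pipeline:

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# Hypothetical call; each entry is a (name, estimator) pair, matching
# what the list comprehension above expects.
clfs = [('nb', MultinomialNB()), ('rf', RandomForestClassifier(n_estimators=100))]
model = stacking_classifier(clfs)
model.fit(X_train, y_train)  # X_train: raw text; vectorization happens in the pipeline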
Example #6
    def fit(self, dataset, train_data):

        y_train = dataset.labels_from(train_data)

        # feature_transformation = ColumnTransformer(transformers=[
        #     ('categorical_features', OneHotEncoder(handle_unknown='ignore'), dataset.categorical_columns),
        #     ('scaled_numeric', StandardScaler(), dataset.numerical_columns)
        # ], sparse_threshold=0)

        if len(dataset.textual_columns) > 1:
            raise Exception(
                'Can only handle one textual column at the moment.')

        sparse_threshold = 0.3
        textual_column = []
        if len(dataset.textual_columns) > 0:
            sparse_threshold = 0.0
            textual_column = dataset.textual_columns[0]

        feature_transformation = ColumnTransformer(
            transformers=[
                ('categorical_features',
                 OneHotEncoder(handle_unknown='ignore'),
                 dataset.categorical_columns),
                ('scaled_numeric', StandardScaler(),
                 dataset.numerical_columns),
                ('textual_features',
                 HashingVectorizer(ngram_range=(1, 3),
                                   n_features=10000), textual_column),
            ],
            sparse_threshold=sparse_threshold)

        make_keras_picklable()
        nn_model = keras.wrappers.scikit_learn.KerasClassifier(
            build_fn=self.create_model)

        pipeline = Pipeline([('features', feature_transformation),
                             ('todense', DenseTransformer()),
                             ('learner', nn_model)])

        param_grid = {
            'learner__epochs': [50],
            'learner__batch_size': [1024],
            'learner__size_1': [4, 8],
            'learner__size_2': [2, 4],
            'learner__verbose': [1]
        }

        model = GridSearchCV(pipeline,
                             param_grid,
                             scoring=self.scoring,
                             cv=5,
                             verbose=2).fit(train_data, y_train)

        return model
Example #7
def build_sentiment(classifier, transformer, name, with_proba = True, **pmml_options):
	pipeline = PMMLPipeline([
		("transformer", transformer),
		("densifier", DenseTransformer()),
		("selector", SelectKBest(f_classif, k = 500)),
		("classifier", classifier)
	])
	pipeline.fit(sentiment_X, sentiment_y)
	pipeline.configure(**pmml_options)
	store_pmml(pipeline, name)
	score = DataFrame(pipeline.predict(sentiment_X), columns = ["Score"])
	if with_proba:
		score_proba = DataFrame(pipeline.predict_proba(sentiment_X), columns = ["probability(0)", "probability(1)"])
		score = pandas.concat((score, score_proba), axis = 1)
	store_csv(score, name)
Example #8
def train_pipeline(X, y):
    """
    Builds and trains a machine learning pipeline
    """

    numerical_col = [
        'Num nights', 'Adults', 'Children', 'Session duration', 'Sessions',
        'Avg. session length (sec)', 'Avg. pageviews per session', 'Pageviews',
        'Hits', 'Created to arrival'
    ]
    categorical_col = [
        'Language', 'Website', 'Enquiry type', 'Enquiry status',
        'Client budget', 'Country code', 'GA source', 'GA medium', 'Device',
        'Created month'
    ]

    binary_col = [
        'Flights booked', 'User agent', 'User repeat', 'User referral'
    ]
    text_col = ['Click path', 'GA keyword']
    target = ['is booking']

    # Numerical pipeline

    numerical_pipeline = make_pipeline(ColumnSelector(cols=numerical_col),
                                       SimpleImputer(strategy="median"),
                                       StandardScaler())

    # Categorical pipeline

    categorical_pipeline = make_pipeline(
        ColumnSelector(cols=categorical_col),
        SimpleImputer(strategy="constant", fill_value='None'), OneHotEncoder())

    # Binary pipeline

    binary_pipeline = make_pipeline(ColumnSelector(cols=binary_col),
                                    SimpleImputer(strategy="most_frequent"),
                                    BinaryEncoder())

    # Text pipelines

    text_pipeline_1 = make_pipeline(
        ColumnSelector(cols=['Click path']),
        SimpleImputer(strategy='constant', fill_value=''),
        ReshapeTransformer(), HashingVectorizer(n_features=2**11),
        DenseTransformer())

    text_pipeline_2 = make_pipeline(
        ColumnSelector(cols=['GA keyword']),
        SimpleImputer(strategy='constant', fill_value=''),
        ReshapeTransformer(), TfidfVectorizer(), DenseTransformer())

    # Pipeline union

    processing_pipeline = make_union(numerical_pipeline, categorical_pipeline,
                                     binary_pipeline, text_pipeline_1,
                                     text_pipeline_2)

    estimator = BalancedRandomForestClassifier(bootstrap=False,
                                               class_weight=None,
                                               criterion='gini',
                                               max_depth=60,
                                               max_features='sqrt',
                                               max_leaf_nodes=None,
                                               min_impurity_decrease=0.0,
                                               min_samples_leaf=1,
                                               min_samples_split=5,
                                               min_weight_fraction_leaf=0.0,
                                               n_estimators=472,
                                               n_jobs=1,
                                               oob_score=False,
                                               random_state=None,
                                               replacement=False,
                                               sampling_strategy='auto',
                                               verbose=0,
                                               warm_start=False)

    predictive_pipeline = make_pipeline(processing_pipeline, estimator)

    predictive_pipeline.fit(X, y)

    return predictive_pipeline
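A hedged usage sketch; X, y, and X_new are hypothetical DataFrames with the columns and target listed above:

# Hypothetical usage, assuming a DataFrame with the columns named above
# and the binary 'is booking' target:
pipe = train_pipeline(X, y)
booking_proba = pipe.predict_proba(X_new)[:, 1]  # X_new: unseen enquiries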
Example #9
print("_________________________________")

X_train, X_test, y_train, y_test = train_test_split(data,
                                                    label,
                                                    test_size=0.1,
                                                    random_state=42)

# count_vect = CountVectorizer(max_features=5000, lowercase=True, ngram_range=(3, 3), analyzer="word")
count_vect = CountVectorizer(max_features=10000,
                             min_df=1,
                             tokenizer=nltk.word_tokenize)
# selectKBest = SelectKBest(chi2, k=1000)
# truncatedSVD = TruncatedSVD(n_components=5000, n_iter=15, random_state=42)
# combined_features = FeatureUnion([("chi2", truncatedSVD), ("univ_select", selectKBest)])
tfidf_transformer = TfidfTransformer()
dense_transformer = DenseTransformer()

clf_LG = Pipeline([
    ('count_v', count_vect),
    ('tfidf', tfidf_transformer),
    # ('features', combined_features),
    ('to_dens', DenseTransformer()),
    ('lgc', RandomForestClassifier(max_depth=100, random_state=0))
])

clf_NB = Pipeline([
    ('count_v', count_vect),
    ('tfidf', tfidf_transformer),
    # ('features', combined_features),
    ('to_dens', DenseTransformer()),
    ('lnb', GaussianNB())
])
Example #10
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed in 0.20
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from mlxtend.preprocessing import DenseTransformer

import re
import numpy as np

X_train = np.array(['abc def ghi', 'this is a test',
                    'this is a test', 'this is a test'])
y_train = np.array([0, 0, 1, 1])


pipe_1 = Pipeline([
  ('vect', CountVectorizer()),
  ('to_dense', DenseTransformer()),
  ('clf', RandomForestClassifier())
])

parameters_1 = dict(
  clf__n_estimators = [50, 100, 200],
  clf__max_features=['sqrt', 'log2', None]
)
grid_search_1 = GridSearchCV(pipe_1,
                             parameters_1,
                             n_jobs=1,
                             verbose=1,
                             scoring='accuracy',
                             cv=2)
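The grid object above is constructed but never run; fitting and inspecting it uses the standard GridSearchCV attributes:

grid_search_1.fit(X_train, y_train)
print(grid_search_1.best_score_)
print(grid_search_1.best_params_)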

Example #11
def test_pipeline():
    rf = RandomForestClassifier()
    param_grid = [{'randomforestclassifier__n_estimators': [1, 5, 10]}]
    pipe = make_pipeline(StandardScaler(), DenseTransformer(), rf)
    grid = GridSearchCV(pipe, param_grid, cv=3, n_jobs=1)
    grid.fit(X, y)
Example #12
ypred = knc.predict(xtest)
ypred1 = knc.predict(xtrain)
print(ypred)
print(list(le.inverse_transform(ypred)))
print(knc.predict_proba(xtest))
print(knc.score(xtrain,ytrain))
print(knc.kneighbors())
print(knc.kneighbors_graph())
print(r2_score(ytest,ypred))
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import NeighborhoodComponentsAnalysis
nca=NeighborhoodComponentsAnalysis(random_state=42)
from mlxtend.preprocessing import DenseTransformer
nca_pipe = make_pipeline(NeighborhoodComponentsAnalysis(), KNeighborsClassifier())
print(nca_pipe)
dense=DenseTransformer()
print(dense.fit(xtrain,ytrain))
##xtrain,ytrain=dense.transform(xtrain,ytrain)
##print(nca.fit(xtrain,ytrain))
##knc.fit(nca.transform(xtrain,ytrain))
##print(knc.score(nca.transform(xtest,ytest))
##print(nca_pipe.fit(xtrain,ytrain))
##print(nca_pipe.score(xtrain,ytrain))
print(classification_report(ytest,ypred))
print(accuracy_score(ytest,ypred))
print(accuracy_score(ytrain,ypred1))
confusionmatrix = confusion_matrix(ytest, ypred)  # (y_true, y_pred)
print(confusionmatrix)
rmse = math.sqrt(mean_squared_error(ytest, ypred))
print(rmse)
plt.plot(ypred)
Example #13
def test_dense_to_dense():
    todense = DenseTransformer(return_copy=False)
    np.testing.assert_array_equal(X, todense.transform(X))
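With return_copy=False the transformer hands already-dense input back unchanged, while the default (return_copy=True) returns a copy; a quick sanity check of that behavior as I understand mlxtend's implementation, using a stand-in for the module-level X above:

import numpy as np
from mlxtend.preprocessing import DenseTransformer

X = np.array([[1., 2.], [3., 4.]])  # stand-in for the module-level X above
assert DenseTransformer(return_copy=False).transform(X) is X   # no copy made
assert DenseTransformer().transform(X) is not X                # default copies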
Example #14
}

# evaluate each model
for model_name, parameters in model_parameters.items():
    model = models[model_name]
    # define steps
    steps = list()
    steps.append(
        ('c', OneHotEncoder(handle_unknown='ignore'), cat_columns_train))
    steps.append(('n', MinMaxScaler(), num_columns_train))
    # one hot encode categorical, normalize numerical
    ct = ColumnTransformer(steps)
    # wrap the model in a pipeline
    pipeline = Pipeline(steps=[('t', ct),
                               ('to_dense', DenseTransformer()),
                               (model_name, model)])
    # evaluate the model and store results
    grid_search_acc = evaluate_model_gridsearch(X_train,
                                                y_train.values.ravel(),
                                                pipeline,
                                                scorer=scoring_method_accuracy,
                                                parameters=parameters)
    acc_best_model = grid_search_acc.best_estimator_
    acc_best_score = grid_search_acc.best_score_
    acc_best_params = grid_search_acc.best_params_
    grid_accuracy_scores.append(acc_best_score)
    print(model_name)
    print("- acc_best_score =", acc_best_score)
    print("acc_best parameters:")
    for k, v in acc_best_params.items():
        print("-", k, "=", v)
Example #15
    def gridsearch_with_classifiers_baseline(self):
        class_report = []
        results = []
        for vec, n in zip(
            [CountVectorizer(), TfidfVectorizer()], ["Count", "Tfidf"]):
            print("loaded the vectorizer: {}\n\n\{}".format(n, vec))

            final_results = []  # collected across all classifiers for this vectorizer
            for name, classifier, params in zip(self.names, self.classifiers,
                                                self.parameters):

                logging.info("Starting gridsearch CV..")
                logging.info(
                    "Classifier name: {}\n classifier:{}\n params{}\n".format(
                        name, classifier, params))

                clf_pipe = Pipeline([
                    ('vect', vec),
                    ('to_dense', DenseTransformer()),
                    ('clf', classifier),
                ])

                #clf_pipe = make_pipeline(vec, FunctionTransformer(lambda x: x.todense(), accept_sparse=True), classifier)

                gs_clf = GridSearchCV(clf_pipe, param_grid=params, cv=2)
                clf = gs_clf.fit(self.X_train, self.y_train)
                score = clf.score(self.X_test, self.y_test)

                logging.info("{} score: {}".format(name, score))
                logging.info("{} are the best estimators".format(
                    clf.best_estimator_))

                results_to_dict = classification_report(
                    self.y_test,
                    clf.best_estimator_.predict(self.X_test),
                    output_dict=True)

                results_to_dict['classifier'] = name
                results_to_dict['parameters'] = clf.best_params_
                results_to_dict['vectorizer'] = n
                results_to_dict['model'] = "baseline"

                logging.info(
                    "Created dictionary with classification report: \n\n{}".
                    format(results_to_dict))
                class_report.append(results_to_dict)

                y_hats = clf.predict(self.X_test)

                final_results.append({
                    "predicted": y_hats,
                    "actual": self.y_test.values,
                    "classifier": name,
                    "vectorizer": n,
                    "model": "baseline"
                })

            results.append(final_results)

        return class_report, results
Example #16
    def fit(self, dataset, train_data):

        y_train = dataset.labels_from(train_data)

        # feature_transformation = ColumnTransformer(transformers=[
        #     ('categorical_features', OneHotEncoder(handle_unknown='ignore'), dataset.categorical_columns),
        #     ('scaled_numeric', StandardScaler(), dataset.numerical_columns)
        # ], sparse_threshold=0)

        if len(dataset.textual_columns) > 1:
            raise Exception(
                'Can only handle one textual column at the moment.')

        sparse_threshold = 0.3
        textual_column = []
        if len(dataset.textual_columns) > 0:
            sparse_threshold = 0.0
            textual_column = dataset.textual_columns[0]

        feature_transformation = ColumnTransformer(
            transformers=[
                ('categorical_features',
                 OneHotEncoder(handle_unknown='ignore'),
                 dataset.categorical_columns),
                ('scaled_numeric', StandardScaler(),
                 dataset.numerical_columns),
                ('textual_features',
                 HashingVectorizer(ngram_range=(1, 3),
                                   n_features=10000), textual_column),
            ],
            sparse_threshold=sparse_threshold)

        def create_model(size_1, size_2):
            nn = keras.Sequential([
                keras.layers.Dense(size_1, activation=tf.nn.relu),
                keras.layers.Dense(size_2, activation=tf.nn.relu),
                keras.layers.Dense(2, activation=tf.nn.softmax)
            ])

            nn.compile(optimizer='adam',
                       loss='sparse_categorical_crossentropy',
                       metrics=[
                           'accuracy'
                       ])  # TODO figure out how to use roc_auc here...
            return nn

        nn_model = keras.wrappers.scikit_learn.KerasClassifier(
            build_fn=create_model)

        pipeline = Pipeline([('features', feature_transformation),
                             ('todense', DenseTransformer()),
                             ('learner', nn_model)])

        param_grid = {
            'learner__epochs': [50],
            'learner__batch_size': [1024],
            'learner__size_1': [4, 8],
            'learner__size_2': [2, 4],
            'learner__verbose': [1]
        }

        model = GridSearchCV(pipeline,
                             param_grid,
                             scoring=self.scoring,
                             cv=5,
                             verbose=2).fit(train_data, y_train)

        return model
Example #17
if args.pred:  # do I even need the if statement if it is a required argument?
    text.out()

# report F1 score
print("F1 Score is:", text.report_f1)


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from mlxtend.preprocessing import DenseTransformer

pipeline = Pipeline([
  ("vectorizer", TfidfVectorizer()),
  ("densifier", DenseTransformer()),
  ("classifier", XGBClassifier(random_state = 13))
])
pipeline.fit(X_train, y_train)
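XGBClassifier accepts SciPy sparse matrices directly, so the densifier above is a convenience rather than a requirement; a sketch of the leaner variant under the same assumed X_train/y_train:

# Hypothetical alternative: XGBoost consumes the sparse tf-idf output as-is.
pipeline = Pipeline([
  ("vectorizer", TfidfVectorizer()),
  ("classifier", XGBClassifier(random_state=13))
])
pipeline.fit(X_train, y_train)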



tfidf = TfidfVectorizer(max_features=self.max_feat)


train['vector'] = vectorizer.fit_transform(train['item_name'])
train = train.drop('item_name', axis=1)
y = train.category_id
train = train.drop('category_id', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    train, y, test_size=0.10, stratify=y, random_state=42)
import xgboost as xgb
Example #19
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

from dataprep_1 import *
from dataprep_2 import cat_columns_train, num_columns_train

steps = list()
steps.append(('c', OneHotEncoder(handle_unknown='ignore'), cat_columns_train))
steps.append(('n', MinMaxScaler(), num_columns_train))

ct = ColumnTransformer(steps)

# TODO: decide which model and which parameters to use
final_clf = LogisticRegression(
    C=0.1, penalty='none', random_state=42)  # TODO: include tuned parameters

pipeline = Pipeline(steps=[('t', ct),
                           ('to_dense', DenseTransformer()),
                           ('insert-modelname', final_clf)])

pipeline.fit(X_train, y_train.values.ravel())

final_prediction = pipeline.predict(X_test)

prediction = np.array(
    final_prediction)  # TODO: replace this with your own prediction
pd.DataFrame(prediction).to_csv("GROUP_classes_problem_census.txt",
                                index=False,
                                header=False)
Example #20
data, label, class_names = data_set.get_train_data_set()

indexs = random.sample(range(len(data)), 50000)
data = data[indexs]
label = label[indexs]
X_train, X_test, y_train, y_test = train_test_split(data,
                                                    label,
                                                    test_size=0.33,
                                                    random_state=42)

count_vect = CountVectorizer()
selectKBest = SelectKBest(k=2000)
truncatedSVD = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
combined_features = FeatureUnion([("svd", truncatedSVD),
                                  ("univ_select", selectKBest)])
dense_transformer = DenseTransformer()
clf_NB = GaussianNB()

pipeline_NB = Pipeline([('count_v', CountVectorizer()),
                        ('features', combined_features),
                        ('to_dens', DenseTransformer()), ('clf', clf_NB)])

pipeline_NB = pipeline_NB.fit(X_train, y_train)
y_pred = pipeline_NB.predict(X_test)
print("F1 score - NB:",
      f1_score(y_test, pipeline_NB.predict(X_test), average='micro'))
print("Accuracy Score - NB:",
      accuracy_score(y_test, pipeline_NB.predict(X_test)))
cnf_matrix = confusion_matrix(y_test, y_pred)
plt.figure()
plt = plot_confusion_matrix(cnf_matrix,
Example #21
    cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=1, random_state=42)
    # evaluate model
    scores = cross_val_score(model, X, y, scoring=scorer, cv=cv, n_jobs=6)
    return scores


# evaluate each model
for name, model in models.items():
    # define steps
    steps = list()
    steps.append(('c', OneHotEncoder(handle_unknown='ignore'), cat_columns_train))
    steps.append(('n', MinMaxScaler(), num_columns_train))
    # one hot encode categorical, normalize numerical
    ct = ColumnTransformer(steps)
    # wrap the model in a pipeline
    pipeline = Pipeline(steps=[('t', ct), ('to_dense', DenseTransformer()), ('m', model)])
    # evaluate the model and store results
    acc_score = evaluate_model(X_train, y_train.values.ravel(), pipeline, scorer=scoring_method_accuracy)
    accuracy_scores.append(np.mean(acc_score))
    f1 = evaluate_model(X_train, y_train.values.ravel(), pipeline, scorer=scoring_method_f1)
    f1_scores.append(np.mean(f1))
    auc_sco = evaluate_model(X_train, y_train.values.ravel(), pipeline, scorer=scoring_method_roc_auc)
    roc_auc_scores.append(np.mean(auc_sco))
    model_names.append(name)
    # summarize performance
    print("acc score")
    print('>%s %.3f (%.3f)' % (name, mean(acc_score), std(acc_score)))
    print("f1 score")
    print('>%s %.3f (%.3f)' % (name, mean(f1), std(f1)))
    print("auc-roc score")
    print('>%s %.3f (%.3f)' % (name, mean(auc_sco), std(auc_sco)))