def get_prediction(row):
    if row is None:
        return
    row = int(row)
    X_train = df.drop(row, axis=0).drop('glaucoma', axis=1).values
    Y_train = df['glaucoma'].drop(row, axis=0).values
    X_test = df.loc[row, df.columns[~df.columns.isin(['glaucoma'])]]
    # psd and cdr_avgt
    pipe1 = make_pipeline(ColumnSelector(cols=(9, 10)), LogisticRegression())
    # ght and cdr_avgt
    pipe2 = make_pipeline(ColumnSelector(cols=(6, 10)), LogisticRegression())
    # ght_psd and cdr_avgt
    pipe3 = make_pipeline(ColumnSelector(cols=(11, 10)), LogisticRegression())
    # cdr psd
    pipe4 = make_pipeline(ColumnSelector(cols=(3, 9)), LogisticRegression())
    est = [('lr1', pipe1), ('lr2', pipe2), ('lr3', pipe3), ('lr4', pipe4)]
    model = VotingClassifier(estimators=est, voting='hard')
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test.values.reshape(1, -1))
    if int(y_pred[0]) == 0:
        return html.H2(dcc.Markdown(''' **Negative** '''),
                       style={'color': 'rgb(0,0,255)'})
    else:
        return html.H2(dcc.Markdown(''' **Positive** '''),
                       style={'color': 'rgb(255,0,0)'})
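For context, here is a minimal sketch of how a helper like this might be wired into a Dash app. The CSV path, the `df` global, and the component ids are illustrative assumptions, not from the original:

# Hypothetical wiring for get_prediction; `df`, the CSV path, and the
# component ids below are assumptions for illustration only.
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import pandas as pd

df = pd.read_csv('glaucoma.csv')  # assumed: features plus a 'glaucoma' column

app = dash.Dash(__name__)
app.layout = html.Div([
    dcc.Input(id='row-input', type='number'),
    html.Div(id='prediction-output'),
])

@app.callback(Output('prediction-output', 'children'),
              [Input('row-input', 'value')])
def show_prediction(row):
    return get_prediction(row)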
Example #2
    def build_pipeline(self):
        """
        Makes a pipeline based on data_config.
        This is needed because the AutoML backend does not perform automatic data encoding.
        """
        categorical_list = infer_categoricals(self.X)
        preprocessing_steps = []
        if self.data_config.get("text_columns"):
            print(f"Applying TFIDF to text columns: {self.data_config.get('text_columns')}")
            preprocessing_steps.append(make_pipeline(
                ColumnSelector(cols=self.data_config.get("text_columns"), drop_axis=True),
                TfidfVectorizer()
            ))
            categorical_list = [c for c in categorical_list
                                if c not in self.data_config["text_columns"]]
        if categorical_list:
            print(f"Applying One Hot Encoding to categorical columns: {categorical_list}")
            preprocessing_steps.append(make_pipeline(
                ColumnSelector(cols=categorical_list),
                OneHotEncoder(handle_unknown="impute")
            ))
        if preprocessing_steps:
            preprocessing_steps = make_union(*preprocessing_steps)
            preprocessing_steps = make_pipeline(preprocessing_steps, SimpleImputer())
        else:
            preprocessing_steps = SimpleImputer()
        if self.problem_type == "classification":
            automl = TPOTClassifier(**self.automl_settings)
        else:
            automl = TPOTRegressor(**self.automl_settings)
        automl_pipeline = make_pipeline(
            preprocessing_steps,
            automl
        )
        return automl_pipeline
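Why drop_axis=True matters in the text branch above: TfidfVectorizer expects a 1-D iterable of strings, while ColumnSelector returns 2-D output by default (see the drop_axis tests further down). A self-contained toy illustration, not from the original:

# Toy illustration (assumed data): drop_axis=True yields the 1-D input
# that TfidfVectorizer requires.
import pandas as pd
from mlxtend.feature_selection import ColumnSelector
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

toy = pd.DataFrame({'text': ['red fox', 'lazy dog'], 'n': [1, 2]})
text_pipe = make_pipeline(ColumnSelector(cols='text', drop_axis=True),
                          TfidfVectorizer())
print(text_pipe.fit_transform(toy).shape)  # (2, n_vocabulary_terms)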
Example #3
def stacking2():
    from sklearn.datasets import load_iris
    from mlxtend.classifier import StackingClassifier
    from mlxtend.feature_selection import ColumnSelector
    from sklearn.pipeline import make_pipeline
    from sklearn.linear_model import LogisticRegression
    from sklearn import model_selection

    iris = load_iris()
    X = iris.data
    y = iris.target

    pipe1 = make_pipeline(ColumnSelector(cols=(0, 2)), LogisticRegression())
    pipe2 = make_pipeline(ColumnSelector(cols=(1, 2, 3)), LogisticRegression())
    sclf = StackingClassifier(classifiers=[pipe1, pipe2],
                              meta_classifier=LogisticRegression(),
                              use_features_in_secondary=True,
                              store_train_meta_features=True)
    sclf.fit(X, y)
    scores = model_selection.cross_val_score(sclf,
                                             X,
                                             y,
                                             cv=5,
                                             scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
Example #4
def test_ColumnSelector_with_dataframe_and_int_columns():
    boston = datasets.load_boston()
    df_in = pd.DataFrame(boston.data, columns=boston.feature_names)
    df_out_str = ColumnSelector(cols=('INDUS', 'CHAS')).transform(df_in)
    df_out_int = ColumnSelector(cols=(2, 3)).transform(df_in)

    np.testing.assert_array_equal(df_out_str[:, 0], df_out_int[:, 0])
    np.testing.assert_array_equal(df_out_str[:, 1], df_out_int[:, 1])
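Note the NumPy-style [:, 0] indexing above: ColumnSelector returns a NumPy array even when given a DataFrame. A quick self-contained check:

# ColumnSelector converts DataFrame input to a NumPy array, which is why the
# assertions above index with [:, 0] rather than DataFrame syntax.
import pandas as pd
from mlxtend.feature_selection import ColumnSelector

toy_df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
out = ColumnSelector(cols=('a',)).transform(toy_df)
print(type(out).__name__, out.shape)  # ndarray (2, 1)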
Example #5
def Featurizer(categorical, numerical):
    featurizer = FeatureUnion([
        ('encode_categorical',
         Pipeline([('select_columns', ColumnSelector(categorical)),
                   ('one_hot_encode', OneHotEncoder())])),
        ('scale_numerical',
         Pipeline([('select_columns', ColumnSelector(numerical)),
                   ('standardize', StandardScaler())]))
    ])
    return featurizer
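A possible usage sketch for Featurizer; the toy frame and column names are assumptions, and the imports match what the function body requires:

# Hypothetical usage of Featurizer on a toy frame.
import pandas as pd
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler

toy = pd.DataFrame({'color': ['red', 'blue', 'red'], 'size': [1.0, 2.5, 3.0]})
featurizer = Featurizer(categorical=['color'], numerical=['size'])
X = featurizer.fit_transform(toy)  # one-hot columns followed by scaled numericals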
Example #6
    def test(self):
        df = pd.read_csv('MorganMACCS.csv')
        baseDf = df
        extractDf = df[df['CAS'].isin(ejectCAS)]
        df = df[~df['CAS'].isin(ejectCAS)]
        y = df['logTox']
        dropList = ['CAS', 'toxValue', 'logTox', 'HDonor', 'HAcceptors',
                    'AromaticHeterocycles', 'AromaticCarbocycles', 'FractionCSP3']
        # dropList = ['CAS', 'toxValue', 'logTox']
        X = df.drop(columns=dropList)
        # Normalize: drop all-zero fingerprint bit columns, standardize the rest
        for name in X.columns:
            if name.isdecimal():
                if X[name].sum() == 0:
                    print(name)
                    X = X.drop(columns=name)
            else:
                std = X[name].std()
                mean = X[name].mean()
                X[name] = (X[name] - mean) / std
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=2)

        # Only the last assignment takes effect; the first two are overridden
        cols = np.arange(1, 550, 1).tolist()
        cols = X.columns.tolist()
        cols = [1, 2, 3]
        # Initializing regressors
        reg1 = Ridge(random_state=1)
        # reg2 = ExtraTreesRegressor()
        reg2 = ExtraTreesRegressor(n_estimators=50, max_features=50,
                                   min_samples_split=5, max_depth=50,
                                   min_samples_leaf=5)
        reg3 = SVR(gamma='auto', kernel='linear')
        reg4 = LGBMRegressor(boosting_type='gbdt', num_leaves=60, learning_rate=0.06)
        pls = PLSRegression(n_components=3)
        pipe1 = make_pipeline(ColumnSelector(cols=cols), ExtraTreesRegressor(n_estimators=50))
        # linear = SGDRegressor(max_iter=1000)
        rgf = RGFRegressor(max_leaf=1000, algorithm="RGF", test_interval=100,
                           loss="LS", verbose=False, l2=1.0)
        nbrs = KNeighborsRegressor(2)
        pipe2 = make_pipeline(ColumnSelector(cols=cols), KNeighborsRegressor(31))

        meta = ExtraTreesRegressor(n_estimators=50, max_features=7,
                                   min_samples_split=5, max_depth=50,
                                   min_samples_leaf=5)

        stackReg = StackingRegressor(regressors=[reg1, reg2, reg3, pipe1, pls, nbrs, rgf],
                                     meta_regressor=meta, verbose=1)
        stackReg.fit(X_train, y_train)
        y_pred = stackReg.predict(X_train)
        y_val = stackReg.predict(X_test)
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred,y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val,y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred,y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val,y_test))

        reg4.fit(X_train, y_train)
        y_pred = reg4.predict(X_train)
        y_val = reg4.predict(X_test)
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred,y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val,y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred,y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val,y_test))
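# calcRMSE and calcCorr are not shown in this snippet; plausible minimal
# definitions consistent with how they are called above (an assumption,
# not the original helpers):
import numpy as np

def calcRMSE(y_pred, y_true):
    return float(np.sqrt(np.mean((np.asarray(y_pred) - np.asarray(y_true)) ** 2)))

def calcCorr(y_pred, y_true):
    return float(np.corrcoef(np.asarray(y_pred), np.asarray(y_true))[0, 1])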
def train3():
    iris = datasets.load_iris()
    x = iris.data
    y = iris.target

    pipe1 = make_pipeline(ColumnSelector(cols=(0, 2)), LogisticRegression())
    pipe2 = make_pipeline(ColumnSelector(cols=(1, 2, 3)), LogisticRegression())

    sclf = StackingClassifier(classifiers=[pipe1, pipe2], meta_classifier=LogisticRegression())

    sclf.fit(x, y)
Example #8
def test_ColumnSelector_drop_axis():
    X1_in = np.ones((4, 8))
    X1_out = ColumnSelector(cols=1, drop_axis=True).transform(X1_in)
    assert X1_out.shape == (4, )

    X1_out = ColumnSelector(cols=(1, ), drop_axis=True).transform(X1_in)
    assert X1_out.shape == (4, )

    X1_out = ColumnSelector(cols=1).transform(X1_in)
    assert X1_out.shape == (4, 1)

    X1_out = ColumnSelector(cols=(1, )).transform(X1_in)
    assert X1_out.shape == (4, 1)
Example #9
def test_ColumnSelector_with_dataframe_drop_axis():
    boston = datasets.load_boston()
    df_in = pd.DataFrame(boston.data, columns=boston.feature_names)
    X1_out = ColumnSelector(cols='ZN', drop_axis=True).transform(df_in)
    assert X1_out.shape == (506, )

    X1_out = ColumnSelector(cols=('ZN', ), drop_axis=True).transform(df_in)
    assert X1_out.shape == (506, )

    X1_out = ColumnSelector(cols='ZN').transform(df_in)
    assert X1_out.shape == (506, 1)

    X1_out = ColumnSelector(cols=('ZN', )).transform(df_in)
    assert X1_out.shape == (506, 1)
Example #10
def test_ColumnSelector_with_dataframe_in_gridsearch():
    boston = datasets.load_boston()
    X = pd.DataFrame(boston.data, columns=boston.feature_names)
    y = boston.target
    pipe = make_pipeline(ColumnSelector(), LinearRegression())
    grid = {
        'columnselector__cols': [['ZN', 'RM'], ['ZN', 'RM', 'AGE'], 'ZN',
                                 ['RM']],
        'linearregression__copy_X': [True, False],
        'linearregression__fit_intercept': [True, False]
    }

    if Version(sklearn_version) < Version("0.24.1"):
        gsearch1 = GridSearchCV(estimator=pipe,
                                param_grid=grid,
                                cv=5,
                                n_jobs=1,
                                iid=False,
                                scoring='neg_mean_squared_error',
                                refit=False)
    else:
        gsearch1 = GridSearchCV(estimator=pipe,
                                param_grid=grid,
                                cv=5,
                                n_jobs=1,
                                scoring='neg_mean_squared_error',
                                refit=False)

    gsearch1.fit(X, y)
    assert gsearch1.best_params_['columnselector__cols'] == ['ZN', 'RM', 'AGE']
Example #11
def set_pipe(clf, features, filename='Untitled'):
    piped_clf = make_pipeline(
        ColumnSelector(cols=features),
        SMOTE(),
        clf
    )
    piped_clf.fit(X_train, y_train)
    y_pred = piped_clf.predict(X_test)
    con_mat = confusion_matrix(y_test, y_pred)
    avg_f1 = (model_selection.cross_val_score(piped_clf, X_train, y_train, cv=5, scoring='f1')).mean()

    print("Cross Val acc score:         ", (model_selection.cross_val_score(piped_clf, X_train, y_train, cv=5)).mean())
    print("Cross Val f1  score:         ", avg_f1)
    print()
    print("Overall Acc score:           ", accuracy_score(y_true=y_test, y_pred=y_pred))
    print("Recall score (True Pos Rate):", recall_score(y_true=y_test, y_pred=y_pred))
    print("Precision score:             ", precision_score(y_true=y_test, y_pred=y_pred))
    # NPV = TN / (TN + FN); specificity = TN / (TN + FP)
    print("Neg Predictive Val:          ", con_mat[0][0] / (con_mat[1][0] + con_mat[0][0]))
    print("True Neg Rate (Specificity): ", con_mat[0][0] / (con_mat[0][1] + con_mat[0][0]))
    print("F1 score:                    ", f1_score(y_true=y_test, y_pred=y_pred))
    print("Auc score:                   ", roc_auc_score(y_true=y_test, y_score=y_pred))
    print(con_mat)
    print()
    (pd.DataFrame(y_pred)).to_csv(filename + 'y_pred_filt_avg.csv')
    return piped_clf, avg_f1
Example #12
def test_ColumnSelector_in_gridsearch():
    iris = datasets.load_iris()
    X, y = iris.data, iris.target
    pipe = make_pipeline(
        ColumnSelector(),
        LogisticRegression(multi_class='ovr', solver='liblinear'))
    grid = {
        'columnselector__cols': [[1, 2], [1, 2, 3], 0, [1]],
        'logisticregression__C': [0.1, 1.0, 10.0]
    }

    if Version(sklearn_version) < Version("0.24.1"):
        gsearch1 = GridSearchCV(estimator=pipe,
                                param_grid=grid,
                                iid=False,
                                cv=5,
                                n_jobs=1,
                                refit=False)
    else:
        gsearch1 = GridSearchCV(estimator=pipe,
                                param_grid=grid,
                                cv=5,
                                n_jobs=1,
                                refit=False)

    gsearch1.fit(X, y)
    assert gsearch1.best_params_['columnselector__cols'] == [1, 2, 3]
Example #13
def createStackingHeterogeneus_MLP_NB(selectFeatures):
    #
    # 10 MLP classifiers and 10 NB classifiers
    #
    for i in range(0, 10):
        if selectFeatures:
            cols = genareatedRandomFeatures()
            pipe = make_pipeline(ColumnSelector(cols=cols),
                                 buildMLPClassifier())
            list_base_classifiers.append(pipe)
        else:
            list_base_classifiers.append(buildMLPClassifier())
    for i in range(0, 10):
        if selectFeatures:
            cols = genareatedRandomFeatures()
            pipe = make_pipeline(ColumnSelector(cols=cols),
                                 buildComplementNBClassifier())
            list_base_classifiers.append(pipe)
        else:
            list_base_classifiers.append(buildComplementNBClassifier())
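The helpers used above are not defined in this snippet; plausible minimal versions (the hyperparameters and the feature-sampling scheme are assumptions) might look like:

# Assumed helper implementations, for illustration only.
import random
from sklearn.naive_bayes import ComplementNB
from sklearn.neural_network import MLPClassifier

list_base_classifiers = []  # module-level list the function appends to

def buildMLPClassifier():
    return MLPClassifier(hidden_layer_sizes=(50,), max_iter=500)

def buildComplementNBClassifier():
    return ComplementNB()

def genareatedRandomFeatures(n_features=20, subset_size=5):
    # random subset of column indices for a ColumnSelector
    return random.sample(range(n_features), subset_size)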
Example #14
def test_ColumnSelector_in_gridsearch():
    iris = load_iris()
    X, y = iris.data, iris.target
    pipe = make_pipeline(ColumnSelector(),
                         LogisticRegression())
    grid = {'columnselector__cols': [[1, 2], [1, 2, 3], 0, [1]],
            'logisticregression__C': [0.1, 1.0, 10.0]}

    gsearch1 = GridSearchCV(estimator=pipe,
                            param_grid=grid,
                            cv=5,
                            n_jobs=1,
                            refit=False)

    gsearch1.fit(X, y)
    assert gsearch1.best_params_['columnselector__cols'] == [1, 2, 3]
Example #15
    def __init__(self, models_dict):
        '''
        models_dict should be a list of tuples [(name, (Model, list_of_features))]
        '''
        from mlxtend.classifier import StackingClassifier
        from mlxtend.feature_selection import ColumnSelector
        from sklearn.pipeline import Pipeline
        from sklearn.linear_model import LogisticRegression
        from sklearn.model_selection import StratifiedKFold

        self.cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

        self.models_dict = models_dict
        models = [(i, Pipeline([('ColumnSelect', ColumnSelector(v[1])), ('Model', v[0].clf)]))
                  for i, v in models_dict]
        self.models = models
        # mlxtend's StackingClassifier takes a plain list of estimators, and
        # `name` belongs to the Model wrapper, not to StackingClassifier
        self.clf_stack = Model(clf=StackingClassifier(classifiers=[m for _, m in models],
                                                      meta_classifier=LogisticRegression()),
                               name='Stacked ensemble')
Example #16
def sm_col_clf_piper(X_train,
                     y_train,
                     parameters,
                     column,
                     clf,
                     scoring='f1',
                     n_jobs=6):
    pipe = make_pipeline(ColumnSelector(cols=column), SMOTE(), clf)
    print(pipe.get_params().keys())
    grid = GridSearchCV(estimator=pipe,
                        param_grid=parameters,
                        cv=5,
                        n_jobs=n_jobs,
                        verbose=50,
                        scoring=scoring)
    grid.fit(X_train, y_train)
    return grid.cv_results_['mean_test_score'], grid.best_params_
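# Note: make_pipeline here must be imblearn.pipeline.make_pipeline, since
# SMOTE implements fit_resample rather than transform. A hedged sketch of a
# matching parameter grid, keyed on the auto-generated lowercase step names
# (the values are illustrative placeholders, not from the original):
from sklearn.linear_model import LogisticRegression

example_parameters = {
    'smote__k_neighbors': [3, 5],
    'logisticregression__C': [0.1, 1.0, 10.0],
}
# scores, best = sm_col_clf_piper(X_train, y_train, example_parameters,
#                                 column=[0, 1, 2], clf=LogisticRegression())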
def runExperiment(base_classifiers, experimentName, featureSelection=False):
    metric = 'precision'
    meta_classifier = GaussianNB()
    input = x
    for i in [10, 15, 20]:
        list_classifiers = []
        n = i // len(base_classifiers)
        for j in range(len(base_classifiers)):
            for k in range(n):
                if base_classifiers[j] == 'NB':
                    if featureSelection:
                        c = GaussianNB()
                    else:
                        c = generateNaive()
                elif base_classifiers[j] == 'MLP':
                    c = generateMLP()
                elif base_classifiers[j] == 'DT':
                    c = generateDT()
                else:
                    raise ValueError(
                        'Base classifier not identified: {}'.format(
                            base_classifiers[j]))
                if featureSelection:
                    cols = getRandomFeatures()
                    pipe = make_pipeline(ColumnSelector(cols=cols), c)
                    list_classifiers.append(pipe)
                else:
                    list_classifiers.append(c)
        test_scores = {'score': [], 'diversity': []}
        experiment = '\n*** Stacking - {} - {} base classifiers ***'.format(
            experimentName, len(list_classifiers))
        for j in range(10):
            ensemble = StackingClassifier(classifiers=list_classifiers,
                                          meta_classifier=meta_classifier)
            cv_scores = cross_validate(ensemble,
                                       input,
                                       y,
                                       scoring={
                                           'score': metric,
                                           'diversity': diversity
                                       },
                                       cv=KFold(n_splits=10))
            test_scores['score'].append(cv_scores['test_score'].mean())
            test_scores['diversity'].append(cv_scores['test_diversity'].mean())
        report(experiment, test_scores)
def runExperiment(bases, experimentName):
    metric = 'precision'
    meta_classifier = GaussianNB()
    input = x
    for i in [10, 15, 20]:
        experiment = '\n*** Stacking - {} - {} base classifiers ***'.format(experimentName, i)
        base_classifiers = []
        while len(base_classifiers) < i:
            for b in bases:
                pipe = make_pipeline(ColumnSelector(cols=getRandomCols()), b)
                base_classifiers.append(pipe)
        test_scores = {'score': [], 'diversity': []}
        for j in range(10):
            ensemble = StackingClassifier(classifiers=base_classifiers,
                                          meta_classifier=meta_classifier)
            cv_scores = cross_validate(ensemble, input, y,
                                       scoring={'score': metric, 'diversity': diversity},
                                       cv=KFold(n_splits=10))
            test_scores['score'].append(cv_scores['test_score'].mean())
            test_scores['diversity'].append(cv_scores['test_diversity'].mean())
        report(experiment, test_scores)
Example #19
    def make_model_pipeline(self, model):
        """
        Creates a scikit-learn pipeline.
        :param model: either "kNN" or "Ridge" only for now.
        :return: scikit-learn pipeline.
        """
        inner_cv = KFold(5, shuffle=True, random_state=1673)
        if model == 'kNN':
            parameters = {'n_neighbors': range(1, 18, 2)}
            estimator = KNeighborsRegressor(weights='distance')
            cols = self.spatial_features_indices
        elif model == 'Ridge':
            parameters = {"alpha": [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
            estimator = Ridge()
            cols = self.rs_features_indices
        else:
            raise ValueError("model must be 'kNN' or 'Ridge'")
        gridsearch = GridSearchCV(estimator=estimator,
                                  param_grid=parameters,
                                  cv=inner_cv,
                                  scoring=self.scoring)
        pipeline = make_pipeline(ColumnSelector(cols=cols), gridsearch)
        return pipeline
Example #20
    def _make_estimators(self):
        """
        Create a stacking regessor made of pipelines.
        The pipelines are contains the column selector transformer

        """
        pipes = list()
        if self.columns_selection is None:
            for estimator in self.my_esitimators:
                pipes.append(make_pipeline(estimator))
        else:
            for idx, estimator in enumerate(self.my_esitimators):
                pipes.append(
                    make_pipeline(
                        ColumnSelector(cols=self.columns_selection[idx]),
                        estimator))

        if self.meta_regressor is None:
            self.meta_regressor = MeanRegressor()

        stregr = StackingRegressor(regressors=pipes,
                                   meta_regressor=self.meta_regressor)

        return stregr
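MeanRegressor is not shown; a plausible minimal fallback meta-regressor (an assumption, not the original class) would simply average the base regressors' predictions:

# Hypothetical MeanRegressor: averages the stacked base predictions.
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin

class MeanRegressor(BaseEstimator, RegressorMixin):
    def fit(self, X, y=None):
        return self

    def predict(self, X):
        return np.asarray(X).mean(axis=1)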
Example #21
def test_ColumnSelector_with_dataframe():
    boston = datasets.load_boston()
    df_in = pd.DataFrame(boston.data, columns=boston.feature_names)
    df_out = ColumnSelector(cols=('ZN', 'CRIM')).transform(df_in)
    assert df_out.shape == (506, 2)
Example #22
def test_ColumnSelector():
    X1_in = np.ones((4, 8))
    X1_out = ColumnSelector(cols=(1, 3)).transform(X1_in)
    assert X1_out.shape == (4, 2)
Example #23
def train_pipeline(X, y):
    """
    Builds and trains a machine learning pipeline
    """

    numerical_col = [
        'Num nights', 'Adults', 'Children', 'Session duration', 'Sessions',
        'Avg. session length (sec)', 'Avg. pageviews per session', 'Pageviews',
        'Hits', 'Created to arrival'
    ]
    categorical_col = [
        'Language', 'Website', 'Enquiry type', 'Enquiry status',
        'Client budget', 'Country code', 'GA source', 'GA medium', 'Device',
        'Created month'
    ]

    binary_col = [
        'Flights booked', 'User agent', 'User repeat', 'User referral'
    ]
    text_col = ['Click path', 'GA keyword']
    target = ['is booking']

    # Numerical pipeline

    numerical_pipeline = make_pipeline(ColumnSelector(cols=numerical_col),
                                       SimpleImputer(strategy="median"),
                                       StandardScaler())

    # Categorical pipeline

    categorical_pipeline = make_pipeline(
        ColumnSelector(cols=categorical_col),
        SimpleImputer(strategy="constant", fill_value='None'), OneHotEncoder())

    # Binary pipeline

    binary_pipeline = make_pipeline(ColumnSelector(cols=binary_col),
                                    SimpleImputer(strategy="most_frequent"),
                                    BinaryEncoder())

    # Text pipelines

    text_pipeline_1 = make_pipeline(
        ColumnSelector(cols=['Click path']),
        SimpleImputer(strategy='constant', fill_value=''),
        ReshapeTransformer(), HashingVectorizer(n_features=2**11),
        DenseTransformer())

    text_pipeline_2 = make_pipeline(
        ColumnSelector(cols=['GA keyword']),
        SimpleImputer(strategy='constant', fill_value=''),
        ReshapeTransformer(), TfidfVectorizer(), DenseTransformer())

    # Pipeline union

    processing_pipeline = make_union(numerical_pipeline, categorical_pipeline,
                                     binary_pipeline, text_pipeline_1,
                                     text_pipeline_2)

    estimator = BalancedRandomForestClassifier(bootstrap=False,
                                               class_weight=None,
                                               criterion='gini',
                                               max_depth=60,
                                               max_features='sqrt',
                                               max_leaf_nodes=None,
                                               min_impurity_decrease=0.0,
                                               min_samples_leaf=1,
                                               min_samples_split=5,
                                               min_weight_fraction_leaf=0.0,
                                               n_estimators=472,
                                               n_jobs=1,
                                               oob_score=False,
                                               random_state=None,
                                               replacement=False,
                                               sampling_strategy='auto',
                                               verbose=0,
                                               warm_start=False)

    predictive_pipeline = make_pipeline(processing_pipeline, estimator)

    predictive_pipeline.fit(X, y)

    return predictive_pipeline
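DenseTransformer is available in mlxtend.preprocessing, but ReshapeTransformer is not a library class; a plausible minimal version (an assumption) flattens the selected single column so the vectorizers receive a 1-D iterable of strings:

# Assumed implementation of ReshapeTransformer, for illustration only.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class ReshapeTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # (n_samples, 1) -> (n_samples,) for HashingVectorizer/TfidfVectorizer
        return np.asarray(X).ravel()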
Example #24
def main(argv):
    topic = argv[0]

    flag = "1"
    # flag = "0"

    lang = "de"
    # lang = "es"
    # lang = "fr"

    # topic = "uni"
    # topic = "movie"
    # topic = "title"

    embedding = "fasttext"
    # embedding = "babylon"
    # embedding = "fbmuse"
    # embedding = "multicca"
    # embedding = "multiskip"
    # embedding = "multicluster"
    # embedding = "translationInvariance"
    # embedding = "bilbowa"

    if topic == "uni":
        path = "/home/oyku/datasets/University/"

    elif topic == "movie":
        path = "/home/oyku/datasets/Movie/"

    elif topic == "title":
        path = "/home/oyku/datasets/Article/"

    else:
        print(
            "Wrong dataset is given. It should be either uni, movie or title.")
        return

    labeled = path + topic + "_" + lang + "_blocked_original.csv"
    labeled = pd.read_csv(labeled)
    print(labeled.shape)

    date = datetime.datetime.today().strftime('%Y-%m-%d')

    if flag == "1":
        method = embedding + "_oov"
        result_path = "/home/oyku/datasets/newexperiments/classifiers/" + topic + "_oov_" + lang + ".csv"
    else:
        method = embedding
        result_path = "/home/oyku/datasets/newexperiments/classifiers/" + topic + "_" + lang + ".csv"

    embedding_features = [
        'crosslang_mean_sim',  # 0
        'crosslang_tfidf_mean_sim',  # 1
        'crosslang_max_sim',  # 2
        'crosslang_tfidf_max_sim',  # 3
        'crosslang_tfidf_max_weight',  # 4
        'crosslang_vector_composition',  # 5
        'crosslang_greedy_aligned_words',  # 6
        'crosslang_weighted_greedy_aligned_words',  # 7
        'crosslang_optimal_alignment',  # 8
        'crosslang_sif'
    ]  # 9

    hybrid_features = [
        'crosslang_aligned_words_senses_jaccard',  # 10
        'crosslang_weighted_aligned_words_senses_jaccard',  # 11
        'crosslang_aligned_words_senses_path_sim',  # 12
        'crosslang_weighted_aligned_words_senses_path_sim'
    ]  # 13

    uwn_features = [
        'crosslang_uwn_common_sense_weights',  # 14
        'crosslang_uwn_sense_similarity_path',  # 15
        'crosslang_uwn_sense_similarity_lch',  # 16
        'crosslang_uwn_sense_similarity_wup',  # 17
        'crosslang_uwn_sense_similarity_resnik',  # 18
        'crosslang_uwn_sense_similarity_jcn',  # 19
        'crosslang_uwn_sense_similarity_lin'
    ]  # 20

    oov_features = [
        'crosslang_sim_oov',  # 21
        'crosslang_number_difference'
    ]  # 22

    extra_features = embedding_features + hybrid_features + uwn_features
    features_index = [[el for el in range(0, len(extra_features))]]

    if flag == "1":
        print("Adding extra OOV treatment for numbers and still-OOV words")
        new_index = len(extra_features)
        features_index = [
            version + [new_index, new_index + 1] for version in features_index
        ]
        extra_features = extra_features + oov_features

    magellan_fts_path = path + "features/" + topic + "_" + lang + "_magellan_features.csv"
    wordembed_fts_path = path + "features/" + topic + "_" + lang + "_" + method + "_features.csv"
    uwn_fts_path = path + "features/" + topic + "_" + lang + "_uwn_features.csv"

    magellan_features = pd.read_csv(magellan_fts_path)
    wordembed_features = pd.read_csv(wordembed_fts_path)
    uwn_features = pd.read_csv(uwn_fts_path)

    train_features = pd.concat([magellan_features, wordembed_features], axis=1)
    train_features = pd.concat([train_features, uwn_features], axis=1)

    print("Running classifiers experiment on " + topic + " dataset!")
    print("Training features:  " + str(len(list(train_features))))

    exclude = ["_id", "ltable_id", "rtable_id"]
    cols = [col for col in list(train_features) if col not in exclude]
    train_features = train_features[cols]

    ## Getting the feature names for each version
    feature_names_version = []
    for features in features_index:
        temp = []
        for index in features:
            temp.append(extra_features[index])
        feature_names_version.append(temp)

    feature_version = {}
    version_explanation = {}
    base_features = [
        col for col in list(train_features) if "crosslang" not in col
    ]
    # features = train_features[base_features]
    # feature_version["Version 0"] = features
    # version_explanation["Version 0"] = "basic"
    for ind, feats in enumerate(feature_names_version):
        version = "Version " + str(ind + 1)
        version_explanation[version] = feats

        cols = [
            col for col in list(train_features) if any(
                col.endswith(feat) for feat in feats)
        ]
        cols = base_features + cols
        feature_version[version] = train_features[cols]

    gold = pd.DataFrame(labeled["Label"])

    cols = ['Version', 'Classifier', 'F1', 'Recall', 'Precision']
    df = pd.DataFrame(columns=cols)

    for i in range(0, len(feature_version)):
        version = "Version " + str(i + 1)
        features = feature_version[version]

        ### This part is necessary for stacking ###
        list_columns_with_features, list_indexes_columns_with_features = split_cols_into_set_features(
            features)
        pipelines = []
        for lst in list_indexes_columns_with_features:
            pipe = make_pipeline(
                ColumnSelector(cols=lst),
                XGBClassifier(random_state=7, n_estimators=350))
            pipelines.append(pipe)
        ####################################################

        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        scale = StandardScaler()
        imp.fit(features)
        # pd.np is deprecated; use numpy directly (assumes `import numpy as np`)
        imp.statistics_[np.isnan(imp.statistics_)] = 0
        features = scale.fit_transform(imp.transform(features))

        models = []
        models.append(('XGB', XGBClassifier(random_state=7, n_estimators=350)))
        models.append(('LR', LogisticRegression(random_state=7)))
        models.append(('DT', DecisionTreeClassifier(random_state=7)))
        models.append(('RF', RandomForestClassifier(random_state=7)))
        models.append(
            ('Ada', AdaBoostClassifier(random_state=7, n_estimators=350)))
        models.append(
            ('LGBM', lgbm.LGBMClassifier(objective='binary', random_state=7)))
        models.append(('SVM', SVC(random_state=7, C=10, gamma=0.001)))
        # use_probas and average_probas are StackingClassifier parameters,
        # not XGBClassifier parameters
        models.append(('Stacking_probs',
                       StackingClassifier(classifiers=pipelines,
                                          meta_classifier=XGBClassifier(random_state=7),
                                          use_probas=True,
                                          average_probas=False)))

        print(version)

        for name, model in models:
            kfold = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
            scoring = ['f1', 'recall', 'precision']
            scores = model_selection.cross_validate(model,
                                                    features,
                                                    gold.values.ravel(),
                                                    cv=kfold,
                                                    scoring=scoring)
            f1 = "%.3f (%.3f)" % (scores['test_f1'].mean() * 100,
                                  scores['test_f1'].std() * 100)
            recall = "%.3f (%.3f)" % (scores['test_recall'].mean() * 100,
                                      scores['test_recall'].std() * 100)
            precision = "%.3f (%.3f)" % (scores['test_precision'].mean() * 100,
                                         scores['test_precision'].std() * 100)

            print(
                "Classifier: %s --- F1: %s     Recall: %s      Precision: %s" %
                (name, f1, recall, precision))
            version_results = [version, name, f1, recall, precision]
            df.loc[len(df)] = version_results

    df.to_csv(result_path, index=False)
cv_keys = ('mean_test_score', 'std_test_score', 'params')
for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    print("%0.3f +/- %0.2f %r"
          % (grid.cv_results_[cv_keys[0]][r],
             grid.cv_results_[cv_keys[1]][r] / 2.0,
             grid.cv_results_[cv_keys[2]][r]))

# In[25]:

grid.best_params_

# In[26]:

eclf = eclf.set_params(**grid.best_params_)
eclf.fit(X, y).predict(X[[1, 51, 149]])

# In[27]:

from mlxtend.feature_selection import ColumnSelector

col_sel = ColumnSelector(cols=[0, 2])

clf1_pipe = Pipeline([('sel', col_sel), ('logreg', clf1)])

eclf = EnsembleVoteClassifier(clfs=[clf1_pipe, clf2, clf3], voting='soft')
eclf.fit(X, y).predict(X[[1, 51, 149]])

# In[28]:

sfs1 = SequentialFeatureSelector(clf1,
                                 k_features=2,
                                 forward=True,
                                 floating=False,
                                 scoring='accuracy',
                                 verbose=1,
                                 cv=0)
Example #26
    def stacklearning(self):
        class extAll(BaseEstimator, TransformerMixin):
            def __init__(self):
                pass

            def fit(self, X, y=None):
                return self

            def transform(self, X):
                # identity: pass all feature tables through unchanged
                return X

            def predict(self, X):
                return X

        class extMorgan(BaseEstimator, TransformerMixin):
            def __init__(self):
                pass

            def fit(self, X, y=None):
                return self

            def transform(self, X):
                _,morgan,_=sepTables(X)
                return morgan
        class extMACCS(BaseEstimator, TransformerMixin):
            def __init__(self):
                pass

            def fit(self, X, y=None):
                return self

            def transform(self, X):
                maccs,morgan,_=sepTables(X)
                maccs = pd.concat([morgan,maccs],axis=1)

                return maccs

        class extDescriptor(BaseEstimator, TransformerMixin):
            def __init__(self):
                pass

            def fit(self, X, y=None):
                return self

            def transform(self, X):
                maccs,morgan,descriptor=sepTables(X)
                descriptor = pd.concat([morgan,descriptor],axis=1)
                descriptor = pd.concat([maccs,descriptor],axis=1)
                return descriptor

        class extPCA(BaseEstimator, TransformerMixin):
            def __init__(self):
                pass

            def fit(self, X, y=None):
                return self

            def transform(self, X):
                model = PCA(n_components=64)
                _,morgan,_=sepTables(X)
                morgan = morgan.reset_index().drop('index', axis=1)
                W = pd.DataFrame(model.fit_transform(X))
                W = pd.concat([morgan,W],axis=1)
                return W

        lgbm = LGBMRegressor(boosting_type='gbdt', num_leaves= 60,learning_rate=0.06)
        rgf = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
        rgf1 = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
        rgf2 = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
        rgf3 = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
        rgf4 = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)

        pipe1 = make_pipeline(extMACCS(), rgf)
        pipe2 = make_pipeline(extMorgan(), rgf1)
        pipe3 = make_pipeline(extDescriptor(), rgf2)
        pipe4 = make_pipeline(extPCA(), rgf3)
        pipe7 =make_pipeline(extDescriptor(), rgf4)
        pipe8 =make_pipeline(extDescriptor(), rgf4)

        xgb = xgboost.XGBRegressor()
        nbrs = KNeighborsRegressor(2)
        svr = SVR(gamma='auto',kernel='linear')
        sgd = SGDRegressor(max_iter=1000)
        pls = PLSRegression(n_components=3)
        ext = ExtraTreesRegressor(n_estimators=30,max_features= 20,min_samples_split= 5,max_depth= 50, min_samples_leaf= 5)

        pipe5 = make_pipeline(extMorgan(), nbrs)
        pipe6 = make_pipeline(extMACCS(), rgf)
        alldata = make_pipeline(extAll())

        meta = RandomForestRegressor(max_depth=20, random_state=0, n_estimators=400)

        stack1 = StackingRegressor(regressors=[pipe1, pipe2, pipe3], meta_regressor=rgf, verbose=1)
        #stack2 = StackingRegressor(regressors=[stack1,nbrs, svr,pls,rgf], meta_regressor=lgbm, verbose=1)
        stack2 = StackingRegressor(regressors=[stack1,pipe5,pipe7,pipe1], meta_regressor=rgf,verbose=1)

        scores = cross_val_score(stack2, X, y, cv=10)
        print("R^2 Score: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), 'stacking'))
        stack1_score = cross_val_score(stack1,X,y, cv=10)
        rgf_score = cross_val_score(rgf,X,y,cv=10)

        stack2.fit(X_train, y_train)
        y_pred = stack2.predict(X_train)
        y_val = stack2.predict(X_test)
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

        rgf.fit(X_train, y_train)
        y_pred = rgf.predict(X_train)
        y_val = rgf.predict(X_test)
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))

        pipe1.fit(X_train, y_train)
        y_pred = pipe1.predict(X_train)
        y_val = pipe1.predict(X_test)
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))


        cols = np.arange(1,550,1).tolist()
        cols = X.columns.tolist()
        cols = [1,2,3]
        # Initializing regressors
        reg1 = Ridge(random_state=1)
        #reg2 = ExtraTreesRegressor()
        reg2 = ExtraTreesRegressor(n_estimators=50,max_features= 50,min_samples_split= 5,max_depth= 50, min_samples_leaf= 5)
        reg3 = SVR(gamma='auto',kernel='linear')
        reg4 = LGBMRegressor(boosting_type='gbdt', num_leaves= 60,learning_rate=0.06)
        pls = PLSRegression(n_components=3)
        pipe1 = make_pipeline(ColumnSelector(cols=cols), ExtraTreesRegressor(n_estimators=50))
        #linear =SGDRegressor(max_iter=1000)
        rgf = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
        nbrs = KNeighborsRegressor(2)
        pipe2 = make_pipeline(ColumnSelector(cols=cols), KNeighborsRegressor(31))

        meta = ExtraTreesRegressor(n_estimators=50,max_features= 7,min_samples_split= 5,max_depth= 50, min_samples_leaf= 5)

        stackReg = StackingRegressor(regressors=[reg1,reg2, reg3,pipe1,pls,nbrs,rgf], meta_regressor=meta,verbose=1)
        stackReg.fit(X_train, y_train)
        y_pred = stackReg.predict(X_train)
        y_val = stackReg.predict(X_test)
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred,y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val,y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred,y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val,y_test))

        reg4.fit(X_train, y_train)
        y_pred = reg4.predict(X_train)
        y_val = reg4.predict(X_test)
        print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred,y_train))
        print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val,y_test))
        print('Correlation Coefficient train: %.4f' % calcCorr(y_pred,y_train))
        print('Correlation Coefficient test: %.4f' % calcCorr(y_val,y_test))
Example #27
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

################## load packages #####################
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import StackingClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline

################## load data #####################
iris = datasets.load_iris()
x, y = iris.data, iris.target

################## define classifier #####################

pipe1 = make_pipeline(ColumnSelector(cols=(0, 1)), LogisticRegression())
pipe2 = make_pipeline(ColumnSelector(cols=(2, 3)), LogisticRegression())

sclf = StackingClassifier(classifiers=[pipe1, pipe2],
                          meta_classifier=LogisticRegression())

################## fit and predict #####################
sclf.fit(x, y)

print(sclf.predict(x))

########### predict class probability ###########
print(sclf.predict_proba(x))

# 4. Stacking classifiers that operate on different feature subsets
## The different level-1 classifiers can be fit to different feature subsets of the training dataset. The following example illustrates how this can be done using scikit-learn pipelines and the ColumnSelector:
from sklearn.datasets import load_iris
from mlxtend.classifier import StackingCVClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression

iris = load_iris()
X = iris.data
y = iris.target

pipe1 = make_pipeline(
    ColumnSelector(cols=(0, 2)),  # select columns 0 and 2
    LogisticRegression())
pipe2 = make_pipeline(
    ColumnSelector(cols=(1, 2, 3)),  # select columns 1, 2, and 3
    LogisticRegression())

sclf = StackingCVClassifier(classifiers=[pipe1, pipe2],
                            meta_classifier=LogisticRegression(),
                            random_state=42)

sclf.fit(X, y)

Example #29
# -*- coding: utf-8 -*-
# @Time    : 2018/5/21
# @Author  : yangguofeng
# @File    : transfer.py
# @Software: Sublime Text 3
import os 
import pandas as pd 
import numpy as np 
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import joblib
from mlxtend.classifier import StackingClassifier
from mlxtend.feature_selection import ColumnSelector
import matplotlib
import matplotlib.pyplot as plt 
import tensorflow as tf
import keras 
from keras.applications import VGG19, InceptionV3, Xception, ResNet50
from keras.applications.imagenet_utils import preprocess_input as preprocess_type1
from keras.applications.inception_v3 import preprocess_input as preprocess_type2
from cub_util import CUB200
NUM_CLASSES = 200
DATA_DIR = os.path.expanduser(os.path.join("/home/guofeng/yangguofeng/Transfer_Learning", "CUB_200_2011"))
CUB_DIR = os.path.join(DATA_DIR, "CUB_200_2011", "images")
FEATURES_DIR = os.path.join(DATA_DIR, "CUB_200_2011", "features")
assert os.path.exists(CUB_DIR)
Example #30
# Another approach operates on the feature dimensions of the training set: instead of giving
# every base classifier the full set of features, each base classifier is assigned a different
# subset, e.g. base classifier 1 trains on the first half of the features and base classifier 2
# on the second half (this can be implemented with sklearn pipelines). The results are then
# combined via StackingClassifier.
# encoding:utf-8
# @author:zee
from sklearn.datasets import load_iris
from mlxtend.classifier import StackingClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
iris = load_iris()
X = iris.data
y = iris.target
pipe1 = make_pipeline(ColumnSelector(cols=(0, 2)),
                      LogisticRegression())
pipe2 = make_pipeline(ColumnSelector(cols=(1, 2, 3)),
                      LogisticRegression())
sclf = StackingClassifier(classifiers=[pipe1, pipe2],
                          meta_classifier=LogisticRegression())
sclf.fit(X, y)
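A short evaluation one might append to the example above (not part of the original):

# Follow-up sketch: cross-validated accuracy of the stacked ensemble.
from sklearn.model_selection import cross_val_score
print('CV accuracy: %.2f' % cross_val_score(sclf, X, y, cv=5).mean())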