train_reader = csv.reader(train_csv)
    cnt = 0
    for tweet in train_reader:
        attr = tweet[CURRENT_ATTRIBUTE + 4]
        train_attrs.append(attr)
        cnt += 1
    del train_attrs[0]

    # get y_train from train_attrs
    y_train = [[float(attr)] for attr in train_attrs]
    # chi-2 select features
    print "start feature selection"
    if (SELECTOR == 0):
        selector = SelectKBest(chi2, k=K_FOR_BEST)
    else:
        selector = SelectPercentile(score_func=chi2,
                                    percentile=SELECT_PERCENTILE)
    selector.fit(x_train, y_train)
    new_x_train = selector.transform(x_train)
    new_x_test = selector.transform(x_test)
    print "feature selection done"
    # convert y_train to right dimension
    y_train = [attr[0] for attr in y_train]

    # regression
    print "start regression"
    clf = LinearRegression()
    clf = clf.fit(new_x_train, y_train)
    result = clf.predict(new_x_test)
    print "regression done"

    for item in result:
        pass  # loop body not shown in this snippet
# Some noisy data not correlated
E = np.random.uniform(0, 0.1, size=(len(iris.data), 20))

# Add the noisy data to the informative features
X = np.hstack((iris.data, E))
y = iris.target

plt.figure(1)
plt.clf()

X_indices = np.arange(X.shape[-1])

# #############################################################################
# Univariate feature selection with F-test for feature scoring
# We use the default selection function: the 10% most significant features
selector = SelectPercentile(f_classif, percentile=10)
selector.fit(X, y)
scores = -np.log10(selector.pvalues_)
scores /= scores.max()
plt.bar(X_indices - .45,
        scores,
        width=.2,
        label=r'Univariate score ($-Log(p_{value})$)',
        color='darkorange',
        edgecolor='black')

# #############################################################################
# Compare to the weights of an SVM
clf = svm.SVC(kernel='linear')
clf.fit(X, y)
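
# The snippet stops after fitting the SVM; a hedged sketch of how the comparison
# usually continues (modelled on scikit-learn's univariate feature selection
# example, so an assumption here): normalise the absolute SVM coefficients and
# plot them next to the univariate scores.
svm_weights = np.abs(clf.coef_).sum(axis=0)
svm_weights /= svm_weights.max()
plt.bar(X_indices - .25,
        svm_weights,
        width=.2,
        label='SVM weight',
        color='navy',
        edgecolor='black')
plt.title("Comparing feature selection")
plt.xlabel('Feature number')
plt.yticks(())
plt.axis('tight')
plt.legend(loc='upper right')
plt.show()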
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator
from xgboost import XGBRegressor

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -3.4429522712567806
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_regression, percentile=92),
    StackingEstimator(
        estimator=KNeighborsRegressor(n_neighbors=48, p=1, weights="uniform")),
    StackingEstimator(estimator=XGBRegressor(learning_rate=0.001,
                                             max_depth=1,
                                             min_child_weight=3,
                                             n_estimators=100,
                                             n_jobs=1,
                                             objective="reg:squarederror",
                                             subsample=0.9500000000000001,
                                             verbosity=0)), MinMaxScaler(),
    StackingEstimator(estimator=SGDRegressor(alpha=0.01,
                                             eta0=0.01,
                                             fit_intercept=False,
                                             l1_ratio=0.0,
                                             learning_rate="constant",
Example #4
def multilabel2(weather,weatherTest):
    X = weather.frase
    y = weather[["clase1", "clase2"]]
    y = y.replace(np.nan, '', regex=True)
   
    X_train = weather.iloc [:, [0]] 
    y_train = weather.iloc [:, [1,2]] 
    y_train =y_train.replace(np.nan, '', regex=True)
    
    X_Test = weatherTest.iloc [:, [0]] 
    y_test = weatherTest.iloc [:, [1,2]] 
    y_test =y_test.replace(np.nan, '', regex=True)
   
    
    X_train = np.array(X_train)
    y_train = np.array(y_train)
    X_test = np.array(X_Test)
    y_test = np.array(y_test)

   
    pipeline = Pipeline([
        ('vectorize', CountVectorizer()),
        ('tf_idf', TfidfTransformer(norm='l2')),
        # play with the parameters and check the model size (see the sketch after this function)
        ('select', SelectPercentile(chi2, percentile=50)),
        ('clf', OneVsRestClassifier(SGDClassifier(loss='modified_huber')))
    ])
    
    '''
    scores = []
    kf = KFold(n_splits=10, random_state=0, shuffle=True)
    for train, test in kf.split(total):

        X_train = X[train]
        X_test = X[test]
        y_train = y[train]
        y_test = y[test]
        
        print(X_test)
        pipeline.fit(X_train, y_train)
        predicted = pipeline.predict(X_test)
        scores.append(evaluacion(y_test, predicted))
    '''
    mlb = MultiLabelBinarizer()
    y_train = mlb.fit_transform(y_train)
    y_test = mlb.transform(y_test)
    

    
    # flatten the one-column arrays into plain lists of strings
    X_test = [test[0] for test in X_test]
    X_train = [train[0] for train in X_train]
    
    #Name: frase, dtype: object
    print(len(X))
    print(y_train.shape)
    #print(X)
   
    print(X_train)
    pipeline.fit(X_train,  y_train)
   
    #print(X_test)
    predicted = pipeline.predict(X_test)
    print("predicte")
   # print(predicted)
   
    
    recall = metrics.recall_score(y_test, predicted, average='macro')
    print("Recall: %f" % recall)
    precision = metrics.precision_score(y_test, predicted, average='macro')
    print("Precision: %f" % precision)
    f1_score = metrics.f1_score(y_test, predicted, average='macro')
    print("F1-score: %f" % f1_score)
    accuracy = metrics.accuracy_score(y_test, predicted)
    print("accuracy: %f" % accuracy)
    return recall, precision, f1_score, accuracy
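
# A minimal sketch (not part of the original) of the "play with the parameters
# and check the model size" suggestion inside multilabel2: refit the same
# pipeline with several SelectPercentile settings and compare the size of the
# pickled model. `compare_model_sizes` is a hypothetical helper; the classes it
# uses are the ones already assumed by the snippet above.
import pickle

def compare_model_sizes(X_train, y_train, percentiles=(10, 25, 50, 75, 100)):
    for p in percentiles:
        pipe = Pipeline([
            ('vectorize', CountVectorizer()),
            ('tf_idf', TfidfTransformer(norm='l2')),
            ('select', SelectPercentile(chi2, percentile=p)),
            ('clf', OneVsRestClassifier(SGDClassifier(loss='modified_huber')))
        ])
        pipe.fit(X_train, y_train)
        n_kept = pipe.named_steps['select'].get_support().sum()
        size_kb = len(pickle.dumps(pipe)) / 1024
        print(f'percentile={p}: {n_kept} features kept, ~{size_kb:.0f} KB pickled')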
Example #5
##svc=SVC(kernel='linear')
#feature_selection=SelectPercentile(f_classif,percentile=10)
#parameters_for_svm={'kernel':('linear','rbf','sigmoid'),'C':[1,10,100]}
#
#svr=SVC(cache_size=800)
#clf=GridSearchCV(svr,parameters_for_svm)
#pipe_lr=Pipeline([("normalization",normalization),
#                 ("feature_selection",feature_selection),
#                ("clf",clf)])
#pipe_lr.fit(X_train,y_train)
#score=pipe_lr.score(X_test,y_test)
"""
kfold test
"""
normalization = StandardScaler()
feature_selection = SelectPercentile(f_classif, percentile=10)
classify = SVC(C=1, cache_size=800, kernel='linear')
pipeline = Pipeline([("normalization", normalization),
                     ("feature_selecation", feature_selection),
                     ("classify", classify)])
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
scores = []
fpr = dict()
tpr = dict()
feature_selected = dict()
support_vector = dict()
roc_auc = dict()
for k, (train, test) in enumerate(kfold.split(X_train, y_train)):

    pipeline.fit(X_train[train], y_train[train])
    score = pipeline.score(X_train[test], y_train[test])
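    # The loop is truncated here in the source; a plausible continuation (an
    # assumption, for a binary target) records the fold score and the ROC data
    # that the dictionaries above were set up for. roc_curve and auc are assumed
    # to be imported from sklearn.metrics.
    scores.append(score)
    decision = pipeline.decision_function(X_train[test])
    fpr[k], tpr[k], _ = roc_curve(y_train[test], decision)
    roc_auc[k] = auc(fpr[k], tpr[k])
    feature_selected[k] = pipeline.named_steps["feature_selection"].get_support()
    support_vector[k] = pipeline.named_steps["classify"].n_support_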
def predictNConPR():
    #feature selection
    X, y, vectorizer = get_X_y()
    #selector = SelectKBest(f_classif,500)
    selector = SelectPercentile(f_classif, percentile=100)
    selector.fit(X, y)
    best_indices = selector.get_support(indices=True)
    best_features = np.array(vectorizer.get_feature_names())[best_indices]
    X = selector.transform(X)

    #use cross validation to choose the best parameter
    lr = LogisticRegression(penalty="l2",
                            fit_intercept=True,
                            class_weight='balanced')
    kf = StratifiedKFold(n_splits=5, shuffle=True)
    parameters = {"C": [1.0, .1, .01, .001, 0.0001]}
    clf0 = GridSearchCV(lr, parameters, scoring='roc_auc', cv=kf)
    print "fitting model..."
    clf0.fit(X, y)
    print "best auc score is: ", clf0.best_score_
    print "done."

    fs, aucs, prec, rec = [], [], [], []
    fold = 0
    complete_X = X.tocsr()
    clf = LogisticRegression(penalty="l2",
                             fit_intercept=True,
                             class_weight='balanced',
                             C=clf0.best_estimator_.C)
    for train, test in kf.split(X, y):
        clf.fit(complete_X[train, :].tocoo(), y[train])
        probs = clf.predict_proba(complete_X[test, :])[:, 1]
        #average_precision_score(y[test],probs)
        precision, recall, threshold = precision_recall_curve(y[test], probs)

        accuracy = clf.score(complete_X[test, :], y[test])

        predLabel = clf.predict(X[test, :])
        rec.append(recall_score(y[test], predLabel))
        prec.append(precision_score(y[test], predLabel))
        #aucs.append(sklearn.metrics.roc_auc_score(y[test], probs))
        cur_auc = roc_auc_score(y[test], probs)
        aucs.append(cur_auc)
        #preds = clf.predict(complete_X[test])
        #fs.append(f1_score(y[test], preds))
        '''
        if fold == 0:
            plt.clf()
            plt.plot(precision,recall)
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.ylim([0.0,1.05])
            plt.xlim([0.0,1.0])
            plt.title('Precision-Recall curve for news coverage prediction conditioned on press release with vocabulary size %d' %len(best_features))
            plt.show()
        fold += 1
        '''
        if fold == 0:
            fpr, tpr, thresholds = roc_curve(y[test], probs)
            pylab.clf()
            fout = "NConPR/roc"

            pylab.plot(
                fpr,
                tpr,
                label=
                "ROC curve for news coverage prediction conditioned on press release(area = %0.2f)"
                % cur_auc)
            pylab.plot([0, 1], [0, 1], 'k--')
            pylab.xlim((-0.025, 1.025))
            pylab.ylim((-0.025, 1.025))
            pylab.xlabel("false positive rate")
            pylab.ylabel("true positive rate")
            pylab.title(
                "ROC curve for news coverage prediction conditioned on press release(area = %0.2f)"
                % cur_auc)
            pylab.tight_layout()
            pylab.savefig(fout)
            pylab.show()
        fold += 1

    #print "average auc: %s" % (sum(aucs)/float(len(aucs)))
    #print "average fs: %s" % (sum(fs)/float(len(fs)))
    print "average recall: %s" % (sum(rec) / float(len(rec)))
    print "average precision: %s" % (sum(prec) / float(len(prec)))
    texify_most_informative_features(best_features, vectorizer, clf0)
    return clf0
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.8833333333333334
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_classif, percentile=36),
    LogisticRegression(C=5.0, dual=False, penalty="l1"))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #8
def newcolname_DoS(columns, X_DoS, Y_DoS):
    # X_DoS / Y_DoS are added here as an assumption so the selector can be
    # fitted; get_support() only returns the mask after fit.
    selector = SelectPercentile(f_classif, percentile=10)
    selector.fit(X_DoS, Y_DoS)
    true = selector.get_support()
    newcolindex_DoS = [i for i, x in enumerate(true) if x]
    newcolnames_DoS = list(columns[i] for i in newcolindex_DoS)
    return newcolnames_DoS
Example #9
def selectKBestUsingData(X_train, y_train, x_test):
    # y_train is added here as an assumption: SelectPercentile.fit() needs the target
    select = SelectPercentile(percentile=50)
    select.fit(X_train, y_train)
    X_train_selected = select.transform(X_train)
    x_test_selected = select.transform(x_test)
    return X_train_selected, x_test_selected
Example #10
imp = SimpleImputer(strategy="most_frequent")
data_frame = pandas.DataFrame(imp.fit_transform(data_frame))

# standard_scale = StandardScaler()
# data_frame = pandas.DataFrame(standard_scale.fit_transform(data_frame.to_numpy()))

# print(data_frame)

# Train/Test Split
x_train, x_test, y_train, y_test = train_test_split(data_frame, target, random_state=0)
print(f'\nTrain data shape: {x_train.shape}')
print(f'Test data shape: {x_test.shape}')
print(f'Target shape: {y_test.shape}')

# Feature Selection
selection = SelectPercentile(percentile=25)
selection.fit(x_train, y_train)
x_train_compressed = selection.transform(x_train)
print(f'\nTrain shape after selection: {x_train_compressed.shape}')
selection_status = list(selection.get_support())
print(f'Selection Status: {selection_status} Length: {len(selection_status)}')
x_test_compressed = selection.transform(x_test)

# Printing Selected Column Names
i = 0
selected_columns = []
for status in selection_status:
    if status:
        selected_columns.append(data_column_names[i])
    i += 1
print(f'Columns After Feature Selection: {selected_columns} Length: {len(selected_columns)}')
Example #11
def X_newDoS(X_DoS, Y_DoS):
    np.seterr(divide='ignore', invalid='ignore')
    selector = SelectPercentile(f_classif, percentile=10)
    X_newDoS = selector.fit_transform(X_DoS, Y_DoS)
    return X_newDoS
svc_tuned_params = [
    {
        'kernel': ['rbf'],
        'C': [1, 10, 100, 1000],
        'gamma': [0.001, 0.0001]
    },
]
if __name__ == '__main__':
    rd = OldHamshahriReader(root=config.CORPORA_ROOT)
    docs, labels = rd.sklearn_docs(config.TOT_DOCS)
    #vectorizer = CountVectorizer(docs)
    vectorizer = TfidfVectorizer(lowercase=False, max_df=0.8)

    fs = vectorizer.fit_transform(docs)
    #vectorizer.build_preprocessor()
    selector = SelectPercentile(chi2, percentile=10)
    selector.fit(fs, labels)
    fs = selector.transform(fs)
    fs_train, fs_test, labels_train, labels_test = train_test_split(
        fs, labels, test_size=0.4, random_state=0)

    clf = None
    pred = None
    grid_search = False
    if config.CLASSIFIER == 'NaiveBayes':
        clf = BernoulliNB()
    elif config.CLASSIFIER == 'LinearSVC':
        if config.SELF_TRAINING:
            clf = LinearSVC(C=1)
        else:
            clf = GridSearchCV(LinearSVC(),
Example #13
X=pd.DataFrame(X)



#calculate mi
mi = mutual_info_classif(X, y)

mi = pd.Series(mi)#One-dimensional ndarray with axis labels 
mi.index = X.columns

mi.sort_values(ascending=False, inplace = True)

mi.plot.bar(figsize = (16,5))

sel = SelectPercentile(mutual_info_classif, percentile=50).fit(X, y)
X.columns[sel.get_support()]
X.shape
X_mi = sel.transform(X)#Reduce X to the selected features.
X_mi.shape
#Build the model to compare the performance
def run_randomForest(X,y):
    #meta estimator that fits a number of decision tree
    clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    clf.fit(X, y)
    y_pred = clf.predict(X)
    print('Accuracy on mi set: ')
    print(accuracy_score(y, y_pred))
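
# A short usage sketch (not in the original): the comparison the comment above
# asks for is to train once on all features and once on the mutual-information
# selection, then compare the reported accuracies.
run_randomForest(X, y)     # all features
run_randomForest(X_mi, y)  # features kept by SelectPercentile(mutual_info_classif)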
    

# Designate distributions to sample hyperparameters from 
C_range = np.power(2, np.arange(-10, 11, dtype=float))
gamma_range = np.power(2, np.arange(-10, 11, dtype=float))
n_features_to_test = [0.85, 0.9, 0.95]




#SVM
steps = [('scaler', StandardScaler()), ('red_dim', PCA()), ('clf', SVC(kernel='rbf', probability=True))]

pipeline = Pipeline(steps)


parameteres = [{'scaler':scalers_to_test, 'red_dim':[PCA(random_state=42)], 'red_dim__n_components':list(n_features_to_test),
              'clf__C': list(C_range), 'clf__gamma':['auto', 'scale']+list(gamma_range)},
              {'scaler':scalers_to_test, 'red_dim':[SelectPercentile(f_classif, percentile=10)],
              'clf__C': list(C_range), 'clf__gamma':['auto', 'scale']+list(gamma_range)},
              {'scaler':scalers_to_test, 'red_dim':[SelectPercentile(mutual_info_classif, percentile=10)],
              'clf__C': list(C_range), 'clf__gamma':['auto', 'scale']+list(gamma_range)},
              {'scaler':scalers_to_test, 'red_dim':[None],
              'clf__C': list(C_range), 'clf__gamma':['auto', 'scale']+list(gamma_range)}]

for j in range(1,2):
    results, best_estimators_dict = nested_cv.function_nested_cv(public_data, public_labels, pipeline, parameteres, j*2)

    #create folder and save

    save_output.function_save_output(results, name, j*2)
Example #15
print('Base model mean squared error: ' +
      str(mean_squared_error(y_test, preds)))
print('Base model explained variance: ' +
      str(explained_variance_score(y_test, preds)))

# Create Lasso Linear model with different Alpha values
alpha = [0.1, 0.2, 0.25, 0.5, 1.0, 2.5, 5.0]
for a in alpha:
    lasso_mod = linear_model.Lasso(alpha=a, normalize=True, fit_intercept=True)
    lasso_mod.fit(x_train, y_train)
    preds = lasso_mod.predict(x_test)
    print('R2 Lasso model with alpha = ' + str(a) + ': ' +
          str(r2_score(y_test, preds)))

# Create linear model based on F-scores, Percentile based Model
selector_f = SelectPercentile(f_regression, percentile=50)
selector_f.fit(x_train, y_train)
xt_train, xt_test = selector_f.transform(x_train), selector_f.transform(x_test)
f_model = linear_model.LinearRegression()
f_model.fit(xt_train, y_train)
preds = f_model.predict(xt_test)
print('R2 Score Percentile Based model: ' + str(r2_score(y_test, preds)))

#My own model that I created
custom_mod = linear_model.LinearRegression()
x_train, x_test, y_train, y_test = train_test_split(data_x_custom,
                                                    data_y,
                                                    test_size=0.2,
                                                    random_state=4)
custom_mod.fit(x_train, y_train)
preds = custom_mod.predict(x_test)
Example #16
#SVM
steps = [('scaler', MinMaxScaler()), ('red_dim', PCA()),
         ('clf', SVC(kernel='linear', probability=True))]

pipeline = Pipeline(steps)

parameteres = [{
    'scaler': [MinMaxScaler()],
    'red_dim': [PCA(random_state=42)],
    'red_dim__n_components': list(n_features_to_test),
    'clf__C': list(C_range),
    'clf__class_weight': [None, 'balanced']
}, {
    'scaler': [MinMaxScaler()],
    'red_dim': [SelectPercentile(f_classif, percentile=10)],
    'clf__C': list(C_range),
    'clf__class_weight': [None, 'balanced']
}, {
    'scaler': [MinMaxScaler()],
    'red_dim': [SelectPercentile(mutual_info_classif, percentile=10)],
    'clf__C':
    list(C_range),
    'clf__class_weight': [None, 'balanced']
}, {
    'scaler': [MinMaxScaler()],
    'red_dim': [None],
    'clf__C': list(C_range),
    'clf__class_weight': [None, 'balanced']
}]
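
# A hedged sketch (assumption, not in the original): parameter grids like the one
# above are typically handed to GridSearchCV together with the pipeline defined
# earlier; X_train / y_train are assumed to exist in the full script.
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(pipeline, parameteres, cv=5, n_jobs=-1)
# grid.fit(X_train, y_train)
# print(grid.best_params_, grid.best_score_)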
Example #17
#Apply transform only for continuous data
X_train1 = numpy.concatenate((X_train_temp, X_train.iloc[:, size:]), axis=1)
#Concatenate scaled continuous data and categorical
X_test1 = numpy.concatenate((X_test_temp, X_test.iloc[:, size:]), axis=1)
scaled_features_test_df = pd.DataFrame(data=X_test1,
                                       index=X_test.index,
                                       columns=X_test.columns)
scaled_features_train_df = pd.DataFrame(data=X_train1,
                                        index=X_train.index,
                                        columns=X_train.columns)

# --------------
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif
# Write your solution here:
skb = SelectPercentile(score_func=f_classif, percentile=90)
predictors = skb.fit_transform(X_train1, Y_train)
scores = skb.scores_
Features = scaled_features_train_df.columns
dataframe = pd.DataFrame({'Features': Features, 'scores': scores})
dataframe = dataframe.sort_values(by='scores', ascending=False)
top_k_predictors = list(dataframe['Features'][:predictors.shape[1]])
print(top_k_predictors)

# --------------
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
clf = LogisticRegression()
clf1 = OneVsRestClassifier(clf)
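
# A hedged continuation (assumption): fit the one-vs-rest model on the
# top-scoring features only; a held-out Y_test matching scaled_features_test_df
# is assumed to exist in the full exercise.
clf1.fit(scaled_features_train_df[top_k_predictors], Y_train)
# y_pred = clf1.predict(scaled_features_test_df[top_k_predictors])
# print(accuracy_score(Y_test, y_pred))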
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -3.6306355316962473
exported_pipeline = make_pipeline(
    make_union(FunctionTransformer(copy), FunctionTransformer(copy)),
    SelectPercentile(score_func=f_regression, percentile=89), MaxAbsScaler(),
    StackingEstimator(estimator=LinearSVR(C=0.0001,
                                          dual=False,
                                          epsilon=0.1,
                                          loss="squared_epsilon_insensitive",
                                          tol=1e-05)),
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    LinearSVR(C=1.0,
              dual=True,
              epsilon=1.0,
              loss="epsilon_insensitive",
              tol=1e-05))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #19
def RegressionScoring(data, target):
    selector = SelectPercentile(f_regression, percentile=25)
    selector.fit(data, target)
    headers = data.dtypes.index
    for n, s in zip(headers, selector.scores_):
        print("F score:", s, "for feature", n)
Example #20
# Get a deterministic random number generator
rng = np.random.RandomState(42)
noise = rng.normal(size=(len(cancer.data), 50))

# Add noise features to the data
X_w_noise = np.hstack([cancer.data, noise])

X_train, X_test, y_train, y_test = train_test_split(X_w_noise,
                                                    cancer.target,
                                                    random_state=0,
                                                    test_size=.5)

# Use SelectPercentile to select 50% of the features
select = SelectPercentile(percentile=50)
select.fit(X_train, y_train)

# Transform the training set
X_train_selected = select.transform(X_train)

print('X_train.shape: {}'.format(X_train.shape))
print('X_train_selected.shape:{}'.format(X_train_selected.shape))

mask = select.get_support()
print(mask)

# plt.matshow(mask.reshape(1, -1), cmap='gray_r')
# plt.xlabel('Sample index')
# plt.show()
Example #21
def feature_select(x, y):
    ch2 = SelectPercentile(chi2, percentile=90)
    ch2.fit(x, y)
    train_x = ch2.transform(x)
    return train_x, ch2
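
# A short usage sketch (not in the original): feature_select returns both the
# reduced matrix and the fitted selector, so the same selector can transform a
# held-out set. The synthetic non-negative data below is purely illustrative.
import numpy as np

_rng = np.random.RandomState(0)
_x = _rng.randint(0, 5, size=(100, 20))
_y = _rng.randint(0, 2, size=100)
_train_x, _ch2 = feature_select(_x, _y)
_x_holdout = _rng.randint(0, 5, size=(10, 20))
print(_train_x.shape, _ch2.transform(_x_holdout).shape)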
Example #22
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import PolynomialFeatures
from tpot.builtins import StackingEstimator
from xgboost import XGBRegressor

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=12345)

# Average CV score on the training set was:-6648.872981389077
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_regression, percentile=17),
    SelectPercentile(score_func=f_regression, percentile=2),
    StackingEstimator(estimator=XGBRegressor(learning_rate=0.001,
                                             max_depth=9,
                                             min_child_weight=17,
                                             n_estimators=100,
                                             nthread=1,
                                             subsample=0.5)),
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    ElasticNetCV(l1_ratio=1.0, tol=0.1))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator
from xgboost import XGBRegressor

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -3.3546954457321903
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_regression, percentile=92),
    StackingEstimator(
        estimator=KNeighborsRegressor(n_neighbors=48, p=1, weights="uniform")),
    StackingEstimator(estimator=XGBRegressor(learning_rate=0.001,
                                             max_depth=1,
                                             min_child_weight=3,
                                             n_estimators=100,
                                             n_jobs=1,
                                             objective="reg:squarederror",
                                             subsample=0.9500000000000001,
                                             verbosity=0)), MinMaxScaler(),
    StackingEstimator(estimator=SGDRegressor(alpha=0.01,
                                             eta0=0.01,
                                             fit_intercept=False,
                                             l1_ratio=0.0,
                                             learning_rate="constant",
Example #24
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVC
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.816716439759918
exported_pipeline = make_pipeline(
    make_union(SelectPercentile(score_func=f_classif, percentile=46),
               FunctionTransformer(copy)),
    PCA(iterated_power=8, svd_solver="randomized"),
    PCA(iterated_power=8, svd_solver="randomized"),
    LinearSVC(C=0.001,
              dual=False,
              loss="squared_hinge",
              penalty="l2",
              tol=1e-05))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #25
# For reproducibility
np.random.seed(1000)

if __name__ == '__main__':
    # Load Boston data
    regr_data = load_boston()
    print('Boston data shape')
    print(regr_data.data.shape)

    # Select the best k features with regression test
    kb_regr = SelectKBest(f_regression)
    X_b = kb_regr.fit_transform(regr_data.data, regr_data.target)
    print('K-Best-filtered Boston dataset shape')
    print(X_b.shape)
    print('K-Best scores')
    print(kb_regr.scores_)

    # Load iris data
    class_data = load_iris()
    print('Iris dataset shape')
    print(class_data.data.shape)

    # Select the best k features using Chi^2 classification test
    perc_class = SelectPercentile(chi2, percentile=15)
    X_p = perc_class.fit_transform(class_data.data, class_data.target)
    print('Chi2-filtered Iris dataset shape')
    print(X_p.shape)
    print('Chi2 scores')
    print(perc_class.scores_)

Example #26
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -3.60771084700637
exported_pipeline = make_pipeline(
    make_union(
        make_union(
            FunctionTransformer(copy),
            FunctionTransformer(copy)
        ),
        FunctionTransformer(copy)
    ),
    SelectPercentile(score_func=f_regression, percentile=89),
    VarianceThreshold(threshold=0.1),
    MaxAbsScaler(),
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    MaxAbsScaler(),
    MaxAbsScaler(),
    LinearSVR(C=0.5, dual=True, epsilon=1.0, loss="epsilon_insensitive", tol=1e-05)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #27
print('Time elapsed = ', (fim - inicio))


# ## Performing feature selection

# In[22]:


from sklearn.feature_selection import SelectPercentile, f_regression

for i in range(2,12,2):
    
    print(str(i*10)+'%')
    print('-------------')
    x_new = SelectPercentile(f_regression,percentile=i*10).fit_transform(x,y)
    
    x_train,x_test,y_train,y_test = train_test_split(x_new,y,test_size = 0.3, random_state = 42)
    
    from lightgbm import LGBMRegressor

    lgbm = LGBMRegressor(random_state=42)
    lgbm_model = lgbm.fit(x_train,y_train)
    lgbm_pred = lgbm_model.predict(x_test)
    lgbm_pred_train = lgbm_model.predict(x_train)


    # Checking the model's performance

    print('train')
    print('Model score on the training set is: ', lgbm_model.score(x_train, y_train))
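    # The loop is truncated here in the source; a plausible continuation
    # (assumption) also reports the held-out score.
    print('test')
    print('Model score on the test set is: ', lgbm_model.score(x_test, y_test))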
Example #28
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -3.7519859512210725
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        make_union(
            make_union(
                make_union(
                    FunctionTransformer(copy),
                    make_union(
                        FunctionTransformer(copy),
                        make_union(
                            make_union(FunctionTransformer(copy),
                                       FunctionTransformer(copy)),
                            FunctionTransformer(copy)))),
                FunctionTransformer(copy)),
            make_union(FunctionTransformer(copy), FunctionTransformer(copy)))),
    MinMaxScaler(), SelectPercentile(score_func=f_regression, percentile=87),
    LinearSVR(C=1.0,
              dual=True,
              epsilon=1.0,
              loss="epsilon_insensitive",
              tol=0.001))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #29
X_test1 = numpy.concatenate((X_test_temp, X_test.iloc[:, 10:col - 1]), axis=1)

scaled_features_train_df = pd.DataFrame(X_train1,
                                        index=X_train.index,
                                        columns=X_train.columns)
scaled_features_test_df = pd.DataFrame(X_test1,
                                       index=X_test.index,
                                       columns=X_test.columns)

# --------------
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_classif
import numpy as np

# Write your solution here:
sbk = SelectPercentile(score_func=f_classif, percentile=20)
predictors = sbk.fit_transform(X_train1, Y_train)
scores = sbk.scores_
Features = X_train.columns
data = {"Features": Features, "scores": scores}
dataframe = pd.DataFrame(data)
dataframe = dataframe.sort_values(ascending=False, by='scores')
ranges = np.percentile(dataframe['scores'], 80)
top_k_predictors = list(dataframe[dataframe['scores'] >= ranges]['Features'])
print(top_k_predictors)

# --------------
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score
df = pd.DataFrame()

# Designate distributions to sample hyperparameters from 
n_features_to_test = [0.85, 0.9, 0.95]
k = np.arange(1,11)


#KNeighborsClassifier
steps = [('scaler', MinMaxScaler()), ('red_dim', PCA()), ('clf', KNeighborsClassifier())]

pipeline = Pipeline(steps)

parameteres = [{'scaler':scalers_to_test, 'red_dim':[PCA(random_state=42)], 'red_dim__n_components':n_features_to_test, 'clf__n_neighbors':k, 
                'clf__weights':['uniform', 'distance'], 'clf__algorithm':['auto', 'ball_tree', 'kd_tree', 'brute']},
                {'scaler':scalers_to_test, 'red_dim':[SelectPercentile(f_classif, percentile=10)], 'clf__n_neighbors':k, 
                'clf__weights':['uniform', 'distance'], 'clf__algorithm':['auto', 'ball_tree', 'kd_tree', 'brute']},
                {'scaler':scalers_to_test, 'red_dim':[SelectPercentile(mutual_info_classif, percentile=10)], 'clf__n_neighbors':k, 
                'clf__weights':['uniform', 'distance'], 'clf__algorithm':['auto', 'ball_tree', 'kd_tree', 'brute']},
               {'scaler':scalers_to_test, 'red_dim':[None], 'clf__n_neighbors':k, 
                'clf__weights':['uniform', 'distance'], 'clf__algorithm':['auto', 'ball_tree', 'kd_tree', 'brute']}]



results = nested_cv_3_classes.function_nested_cv_3_classes(data, labels, pipeline, parameteres)

#create folder and save

save_output.function_save_output(results, name_clf)