def class_prediction_error(
    model,
    X,
    y=None,
    ax=None,
    classes=None,
    test_size=0.2,
    random_state=None,
    **kwargs):
    """Quick method:
    Divides the dataset X and y into train and test splits, fits the model on
    the train split, then scores the model on the test split. The visualizer
    displays the support for each class in the fitted classification model as a
    stacked bar plot. Each bar is segmented to show the distribution of
    predicted classes for each class.

    This helper function is a quick wrapper to utilize the ClassPredictionError
    ScoreVisualizer for one-off analysis.

    Parameters
    ----------
    model : the Scikit-Learn estimator (should be a classifier)

    X  : ndarray or DataFrame of shape n x m
        A matrix of n instances with m features.

    y  : ndarray or Series of length n
        An array or series of target or class values.

    ax : matplotlib axes
        The axes to plot the figure on.

    classes : list of strings
        The names of the classes in the target

    test_size : float, default=0.2
        The percentage of the data to reserve as test data.

    random_state : int or None, default=None
        The value to seed the random number generator for shuffling data.

    Returns
    -------
    ax : matplotlib axes
        Returns the axes that the class prediction error plot was drawn on.
    """
    # Instantiate the visualizer
    visualizer = ClassPredictionError(model, ax, classes, **kwargs)

    # Create the train and test splits
    X_train, X_test, y_train, y_test = tts(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit and transform the visualizer (calls draw)
    visualizer.fit(X_train, y_train, **kwargs)
    visualizer.score(X_test, y_test)

    # Return the axes object on the visualizer
    return visualizer.ax
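A minimal usage sketch for this quick method (the dataset and classifier here are illustrative; ClassPredictionError and tts are assumed to be imported as in the module above):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# Toy multi-class data; any feature matrix and classifier would do
X, y = make_classification(
    n_samples=500, n_classes=3, n_informative=4, random_state=42
)

# Fits on a train split, scores on a test split, and returns the matplotlib axes
ax = class_prediction_error(
    RandomForestClassifier(random_state=42), X, y,
    test_size=0.2, random_state=42,
)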
    def test_residuals_plot_pandas(self):
        """
        Test Pandas real world dataset with image similarity on Lasso
        """
        _, ax = plt.subplots()

        # Load the energy dataset from fixtures
        data = self.load_data('energy')
        target = 'heating_load'
        features = [
            "relative_compactness", "surface_area", "wall_area", "roof_area",
            "overall_height", "orientation", "glazing_area",
            "glazing_area_distribution"
        ]

        # Create instances and target
        X = pd.DataFrame(data[features])
        y = pd.Series(data[target].astype(float))

        # Create train/test splits
        splits = tts(X, y, test_size=0.2, random_state=231)
        X_train, X_test, y_train, y_test = splits

        visualizer = ResidualsPlot(Lasso(random_state=44), ax=ax)
        visualizer.fit(X_train, y_train)
        visualizer.score(X_test, y_test)
        visualizer.finalize()

        self.assert_images_similar(visualizer, tol=1, remove_legend=True)
def make_fixture(binary=False, balanced=False, split=False):
    """
    Make a dataset for testing ClassBalance based on the specified params.
    """
    kwargs = {
        "n_samples":100, "n_features":20, "n_informative":8, "n_redundant":2,
        "n_clusters_per_class":1, "random_state":89092,
    }

    if binary:
        kwargs['n_classes'] = 2
        kwargs['weights'] = None if balanced else [0.3, 0.7]
    else:
        kwargs['n_classes'] = 5
        kwargs['weights'] = None if balanced else [0.1, 0.2, 0.4, 0.2, .01]

    X, y = make_classification(**kwargs)

    if split:
        X_train, X_test, y_train, y_test = tts(
            X, y, test_size=0.2, random_state=101
        )
        return Dataset(Split(X_train, X_test), Split(y_train, y_test))

    return Dataset(X, y)
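The Dataset and Split containers returned above are simple named-tuple style holders; a minimal sketch of how they might be defined (the actual test suite may differ):

from collections import namedtuple

# Hypothetical definitions: access patterns are dataset.X, dataset.y and,
# when split=True, dataset.X.train / dataset.X.test (same for y)
Dataset = namedtuple("Dataset", "X,y")
Split = namedtuple("Split", "train,test")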
def make_fruit_dataset():
    X, y = make_classification(
        n_samples=1000, n_classes=5, n_informative=3, n_clusters_per_class=1
    )

    classes = ['apple', 'kiwi', 'pear', 'banana', 'orange']
    return tts(X, y, test_size=0.20, random_state=42), classes
def make_dataset():
    data = pd.read_csv("../../../examples/data/occupancy/occupancy.csv")

    X = data[["temperature", "relative humidity", "light", "C02", "humidity"]]
    y = data.occupancy

    return tts(X, y, test_size=0.2)
    def test_pandas_integration(self):
        """
        Test with Pandas DataFrame and Series input
        """
        _, ax = plt.subplots()

        # Load the occupancy dataset from fixtures
        data = self.load_data('occupancy')
        target = 'occupancy'
        features = [
            "temperature", "relative_humidity", "light", "C02", "humidity"
        ]

        # Create instances and target
        X = pd.DataFrame(data[features])
        y = pd.Series(data[target].astype(int))

        # Create train/test splits
        splits = tts(X, y, test_size=0.2, random_state=8873)
        X_train, X_test, y_train, y_test = splits

        # Create confusion matrix
        model = GaussianNB()
        cm = ConfusionMatrix(model, ax=ax, classes=None)
        cm.fit(X_train, y_train)
        cm.score(X_test, y_test)

        tol = 0.1 if six.PY3 else 40
        self.assert_images_similar(cm, tol=tol)

        # Ensure correct confusion matrix under the hood
        npt.assert_array_equal(cm.confusion_matrix_, np.array([
            [3012,  114],
            [   1,  985]
        ]))
Example #7
def linearSVR(data):
    X = data.drop(["id", "date", "price","long","lat", "zipcode","yr_renovated", "sqft_above", "sqft_basement"], axis=1)
    y = data["price"]
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.10, random_state=42)
    svr = LinearSVR(random_state=42)
    svr.fit(X_train, y_train)
    y_predict = svr.predict(X_test)
    print "r2-score for LinearSVR: %f" % r2_score(y_test, y_predict)
Example #8
def ridgeRegression(data):
    from sklearn.linear_model import Ridge
    X = data.drop(["id", "date", "price","long","lat", "zipcode","yr_renovated","sqft_above","sqft_basement"], axis=1)
    y = data["price"]
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.10, random_state=42)
    ridge = Ridge(random_state=42)
    ridge.fit(X_train, y_train)
    y_predict = ridge.predict(X_test)
    print "r2-score for Ridge Regression: %f" % r2_score(y_test, y_predict)
def load_credit_dataset():
    data = pd.read_csv("../../../examples/data/credit/credit.csv")
    target = "default"
    features = list(data.columns)
    features.remove(target)

    X = data[features]
    y = data[target]

    classes = ["default", "current"]
    return tts(X, y, test_size=0.2, random_state=53), classes
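A hedged usage sketch for this loader, mirroring the visualizer pattern used elsewhere in these examples (it assumes the relative credit.csv path resolves and that yellowbrick is installed):

from sklearn.naive_bayes import GaussianNB
from yellowbrick.classifier import ClassificationReport

(X_train, X_test, y_train, y_test), classes = load_credit_dataset()

viz = ClassificationReport(GaussianNB(), classes=classes)
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.poof()  # viz.show() on newer yellowbrick releases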
def digits(request):
    """
    Creates a fixture of train and test splits for the sklearn digits dataset
    For ease of use returns a Dataset named tuple composed of two Split tuples.
    """
    data = load_digits()
    X_train, X_test, y_train, y_test = tts(
        data.data, data.target, test_size=0.2, random_state=11
    )

    # Set a class attribute for digits
    request.cls.digits = Dataset(
        Split(X_train, X_test), Split(y_train, y_test)
    )
Example #11
def data(request):
    """
    Creates a fixture of train and test splits for the sklearn digits dataset
    For ease of use returns a Dataset named tuple composed of two Split tuples.
    """
    X, y = make_regression(
        n_samples=500, n_features=22, n_informative=8, random_state=42,
        noise=0.2, bias=0.2,
    )

    X_train, X_test, y_train, y_test = tts(
        X, y, test_size=0.2, random_state=11
    )

    # Set a class attribute for digits
    request.cls.data = Dataset(
        Split(X_train, X_test), Split(y_train, y_test)
    )
    def test_pandas_integration(self):
        """
        Test with Pandas DataFrame and Series input
        """
        _, ax = plt.subplots()

        # Load the occupancy dataset from fixtures
        data = self.load_data('occupancy')
        target = 'occupancy'
        features = [
            "temperature", "relative_humidity", "light", "C02", "humidity"
        ]

        # Create instances and target
        X = pd.DataFrame(data[features])
        y = pd.Series(data[target].astype(int))

        # Create train/test splits
        splits = tts(X, y, test_size=0.2, random_state=4512)
        X_train, X_test, y_train, y_test = splits

        classes = ['unoccupied', 'occupied']

        # Create classification report
        model = GaussianNB()
        viz = ClassificationReport(model, ax=ax, classes=classes)
        viz.fit(X_train, y_train)
        viz.score(X_test, y_test)

        self.assert_images_similar(viz, tol=43.0)

        # Ensure correct classification scores under the hood!
        assert viz.scores_ == {
            'precision': {
                'unoccupied': 0.999347471451876,
                'occupied': 0.8825214899713467
            }, 'recall': {
                'unoccupied': 0.9613935969868174,
                'occupied': 0.9978401727861771
            }, 'f1': {
                'unoccupied': 0.9800031994880819,
                'occupied': 0.9366447034972124
            }}
Example #13
def cluster_regressors(data):

    X = data.drop(["id", "date", "price", "sqft_above", "sqft_basement"], axis=1)
    y = data["price"]

    ## split into train and test set
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.10, random_state=42)
    X_train_clustered = cluster_data(X_train)
    describe_cluster(features=["waterfront", "view", "condition", "grade", "yr_built", "yr_renovated"],
                     X=X_train_clustered,
                     n_clusters=5)
    describe_cluster(features=["bedrooms", "bathrooms", "floors"], X=X_train_clustered, n_clusters=4)

    ## Train optimized regressors
    regressors = train_optimized_regressors(X_train_clustered, y_train)

    y_predict = predict_optimized(regressors=regressors, X=X_test)

    r2_optimized = r2_score(y_test, y_predict)
    print "r2-score for Clustered Regressors: %.4f" % r2_optimized
    def test_pandas_occupancy_compare(self):
        """
        Test pandas data frame with string target in compare mode
        """
        data = self.load_data("occupancy")
        features = [
            "temperature", "relative_humidity", "light", "C02", "humidity"
        ]

        X = pd.DataFrame(data[features])
        y = pd.Series([
            "occupied" if yi else "unoccupied" for yi in data['occupancy']
        ])

        _, _, y_train, y_test = tts(X, y, test_size=0.4, random_state=2242)

        # Create and fit the visualizer
        oz = ClassBalance()
        assert oz.fit(y_train, y_test) is oz

        #oz.finalize()
        self.assert_images_similar(oz)
    def test_score_returns_score(self):
        """
        Test that ConfusionMatrix score() returns a score between 0 and 1
        """
        data = self.load_data("occupancy")
        X = data[[
            "temperature", "relative_humidity", "light", "C02", "humidity"
        ]]

        y = data['occupancy']

        # Convert X to an ndarray
        X = X.copy().view((float, len(X.dtype.names)))

        X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=42)
        # Create and fit the visualizer
        visualizer = ConfusionMatrix(LogisticRegression())
        visualizer.fit(X_train, y_train)

        # Score the visualizer
        s = visualizer.score(X_test, y_test)

        assert 0 <= s <= 1
Example #16
tmp = []
for each in train['Sex']:
    if each == 'female':
        tmp.append(1)
    elif each == 'male':
        tmp.append(0)
    else:
        tmp.append(np.nan)

train['Sex'] = tmp

y = train.iloc[:, [0]].values
x = train.iloc[:, [1, 3, 4, 5, 6, 8]].values

x_train, x_test, y_train, y_test = tts(x, y, train_size=0.8, random_state=66)

#1. Data preparation

#2) Model

model = XGBClassifier(n_estimators=1000)
#3) Training

model.fit(x_train, y_train, eval_metric="rmse")

# model.feature_importance_
thresholds = np.sort(model.feature_importances_)
score_acc = model.score(x_test, y_test)

max = -1
# visualize how they look
num_classes = len(np.unique(y))
## plot each class
for ind, val in enumerate (np.unique(y)):
    plt.scatter (x[y==val,0], x[y==val,1],
                 marker = marker_list_all[ind],
                 c = color_list_all[ind],
                 label='Class '+str(val))
plt.legend (loc = 0)
plt.xlim (x[:,0].min(), x[:,0].max())
plt.ylim (x[:,1].min(), x[:,1].max())
plt.tight_layout ()
pic1 = 'scatter-show.pdf'
plt.savefig (pic1)
plt.show ()

# separating data set
xtr, xte, ytr, yte = tts (x, y, test_size = 0.3)
# standardizing the data
sc0 = SC ()
sc0.fit (xtr)
xtr_std = sc0.transform (xtr)
xte_std = sc0.transform (xte)
# The following is for classifying
dtc =  DTC()
dtc.fit (xtr_std, ytr)
ypd = dtc.predict (xte_std)
print ("accuracy: ", dtc.score (xte_std, yte))

pdb (x, y, classifier=dtc, standardizer=sc0)
Example #18
    #     # ('bayes', MultinomialNB())
    # ])
    # model.fit(docs, labels)
    # model.predict(gensim_docs)

    normal = TextNormalizer()
    norm_docs = list(normal.fit_transform(docs))
    # documents = norm_docs
    # id2word = gensim.corpora.Dictionary(documents)
    # taggeddoc = [ TaggedDocument(words, ['d{}'.format(idx)]) for idx, words in enumerate(documents) ]
    # model = Doc2Vec(taggeddoc, vector_size=5, window=2, min_count=1, workers=4)
    # docvecs = model.docvecs.vectors_docs
    gensim = GensimTfidfVectorizer(type='tfidf')
    gensim_docs = gensim.fit_transform(norm_docs)

    X_train, X_test, y_train, y_test = tts(gensim_docs, y, test_size=0.2)

    clf = LogisticRegressionCV()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    f1_score(y_test, y_pred)
    accuracy_score(y_test, y_pred)

    from gensim.models import Word2Vec
    from gensim.models.phrases import Phraser, Phrases

    common_terms = ["of", "with", "without", "and", "or", "the", "a"]

    # Create the relevant phrases from the list of sentences:
    phrases = Phrases(norm_docs, common_terms=common_terms)
    bigram = Phraser(phrases)
Example #19
np.save('./data/dacon/comp3/train_target.npy', arr=y)

#1-5. Slice the data, excluding the id column
x = train[:, 1:]
x_pred = test[:, 1:]

#1-6. Reshape after scaling
scaler = StandardScaler()
x = scaler.fit_transform(x)
x_pred = scaler.fit_transform(x_pred)

x = x.reshape(2800, 375, 4, 1)
x_pred = x_pred.reshape(700, 375, 4, 1)

#1-7. train_test_split
x_train, x_test, y_train, y_test = tts(x, y, random_state=88, test_size=0.2)

#2. Build the model
input1 = Input(shape=(375, 4, 1))
dense1 = Conv2D(175, (3, 3), padding='same')(input1)
dense1 = Conv2D(75, (2, 2), padding='same')(dense1)
dense1 = MaxPooling2D(pool_size=2, padding='same')(dense1)
dense1 = Dropout(0.2)(dense1)
dense1 = Conv2D(75, (2, 2), padding='same')(dense1)
dense1 = Conv2D(15, (2, 2), padding='same')(dense1)
dense1 = Flatten()(dense1)
output1 = Dense(4)(dense1)

model = Model(inputs=input1, outputs=output1)

#3. Compile and train
Example #20
import numpy as np

dataset = load_boston()
# print(type(load_boston())) #<class 'sklearn.utils.Bunch'>
# print(print(dataset.keys()))

x= dataset.data
y= dataset.target

print(type(x)) #<class 'numpy.ndarray'>

# print(x.shape)

from sklearn.model_selection import train_test_split as tts

x_train,x_test,y_train,y_test  = tts(x,y,train_size=0.8)

n_estimators = 100
learning_rate = 0.01
colsample_bytree = 0.9  # winning models used 0.6~0.9
colsample_bylevel = 0.9  # winning models used 0.6~0.9
max_depth = 5
n_jobs=-1

parameters = {
    "n_estimators" : np.arange(100,301,100),
    "learning_rate" : np.arange(0.01,0.03,0.01),
    "colsample_bytree":np.arange(0.6,1,0.1),
    "colsample_bylevel":np.arange(0.6,1,0.1),
    "max_depth" : [4,5,6]
}
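The parameter grid above is presumably meant to drive a hyperparameter search; a hedged sketch of wiring it to GridSearchCV with an XGBoost regressor (the estimator choice is an assumption, it is not shown in the original):

from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Sketch only: 3-fold search over the grid defined above
search = GridSearchCV(XGBRegressor(n_jobs=n_jobs), parameters, cv=3)
search.fit(x_train, y_train)
print(search.best_params_, search.best_score_)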
import numpy as np
from flask import Flask, request, jsonify, render_template
import pickle
from sklearn.preprocessing import RobustScaler as RS
import pandas as pd
scaler = RS()
n = 0
app = Flask(__name__)
model = pickle.load(open('modelfin.pkl', 'rb'))
data = pd.read_csv(r'C:\Users\Sarvesh\pop.csv')
X = data.drop(['popularity', 'ratio', 'categoryId', 'Unnamed: 0'], axis=1)
Y = data['popularity']
from sklearn.model_selection import train_test_split as tts
X_train, X_test, Y_train, Y_test = tts(X, Y, test_size=0.20, shuffle=True)
scaler.fit(X_train)


@app.route('/')
def home():
    return render_template('yt.htm')


@app.route('/predict', methods=['POST'])
def predict():
    features = [float(x) for x in request.form.values()]
    final_features = [np.array(features)]
    f = scaler.transform(final_features)
    prediction = model.predict(f)
    if (prediction[0] == 0):
        output = "High"
    if (prediction[0] == 2):
Example #22
    weights=[0.3, 0.7],
    n_informative=3,
    n_redundant=1,
    flip_y=0,
    n_features=5,
    n_clusters_per_class=1,
    n_samples=5000,
    random_state=10,
)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)

# Create the samplers
enn = EditedNearestNeighbours()
renn = RepeatedEditedNearestNeighbours()

# Create the classifier
knn = KNN(1)

# Make the splits
X_train, X_test, y_train, y_test = tts(X, y, random_state=42)

# Add one transformer and two samplers to the pipeline object
pipeline = make_pipeline(pca, enn, renn, knn)

pipeline.fit(X_train, y_train)
y_hat = pipeline.predict(X_test)

print(classification_report(y_test, y_hat))
from sklearn.preprocessing import Imputer
#creating function variable using Imputer
imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
#attaching the variable imputer to our matrix y
imputer = imputer.fit(y[:, :])
#now we apply our imputer variable on matrix to fill in the
#missing values will be filled with the strategy we picked
#fit() used to apply changes on a temp var in memory
#transform() used to commit the changes to the said variable
#fit_transform() for doing both together
y = imputer.transform(y)
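Note that sklearn.preprocessing.Imputer has been removed from newer scikit-learn releases; a roughly equivalent sketch using the current SimpleImputer API (shown for reference, not part of the original script):

from sklearn.impute import SimpleImputer

# SimpleImputer treats NaN as the missing value by default
imputer = SimpleImputer(strategy='mean')
y = imputer.fit_transform(y)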

#%%
#Splitting dataset to training and test set
'''
Training set - the data the model learns from.
Test set - the data used to check how well the model generalizes.
'''

from sklearn.model_selection import train_test_split as tts
# x (the feature matrix) is assumed to be defined earlier in the script
x_train, x_test, y_train, y_test = tts(x, y, test_size=0.2, random_state=0)

#%%
#Feature Scaling- it scales the entries so that all columns are comparable to
#same scale
from sklearn.preprocessing import StandardScaler as ss
sc_x = ss()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)
Example #24
# data interpretation
data.info()
print(data.describe())

# create arrays
# x: all independent data
# y: Outcome (dependent data)
x = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# split the full dataset (train:test)
# library: sklearn
# module: model_selection
# class: train_test_split
from sklearn.model_selection import train_test_split as tts
x_train, x_test, y_train, y_test = tts(x, y, test_size=0.3, random_state=3)

# algorithm selection
# linear regression
# library: sklearn
# module: linear_model
# class: LinearRegression
from sklearn.linear_model import LinearRegression as linreg
model_linreg = linreg()

# train the model
model_linreg.fit(x_train, y_train)

# Test the model
# predicting output
y_pred = model_linreg.predict(x_test)
Example #25
        with open(email, encoding="iso8859_1") as f:
            words = f.read().split(' ')
            for entry in dictionary:
                data.append(words.count(entry[0]))
            feature_set.append(data)

            if "ham" in email:
                labels.append(0)
            if "spam" in email:
                labels.append(1)
    return feature_set, labels


d = make_dict()
features, labels = make_dataset(d)
x_train, x_test, y_train, y_test = tts(features, labels,
                                       test_size=0.2)  # 80% for data training
# classifier
clf = MultinomialNB()
clf.fit(x_train, y_train)

preds = clf.predict(x_test)
print(accuracy_score(y_test, preds))
save(clf, "text-classifier.mdl")

while True:
    features = []
    inp = input(">").split()
    if inp[0] == "exit":
        break
    for word in d:
        features.append(inp.count(word[0]))
def build_and_evaluate(balanced,
                       X,
                       y,
                       classifier=LogisticRegression,
                       outpath=None,
                       verbose=True):
    def build(balanced, classifier, X, y=None):
        """
        Inner build function that builds a single model.
        """
        if isinstance(classifier, type):
            # classifier = classifier()
            if balanced == True:
                class_weight = 'balanced'
                # neg_count = 0
                # neu_count = 0
                # pos_count = 0
                # for label in y:
                #     if label == 0:
                #         neg_count += 1
                #     elif label == 1:
                #         neu_count += 1
                #     elif label == 2:
                #         pos_count += 1
                #
                # if(len(set(y))) == 3:
                #     minimum = min(neg_count, neu_count, pos_count)
                #     class_weight = {0: minimum/neg_count, 1: minimum/neu_count, 2: minimum/pos_count}
                # elif (len(set(y))) == 2:
                #     pos_count = neu_count
                #     minimum = min(neg_count, pos_count)
                #     class_weight = {0: minimum/neg_count, 1: minimum/pos_count }
                # print('0:', neg_count, '1:', neu_count, '2:', pos_count)
                # print(class_weight)
            else:
                class_weight = None
            classifier = classifier(multi_class='multinomial',
                                    solver='saga',
                                    class_weight=class_weight)
            # classifier = classifier(max_iter=1000, class_weight = class_weight)
            # classifier = classifier(class_weight=class_weight, C=1)
            # classifier = classifier(class_weight = class_weight)

        model = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            # ('vectorizer', CountVectorizer(tokenizer=identity,preprocessor=None,lowercase=None,ngram_range =(1,2))),
            ('vectorizer',
             TfidfVectorizer(tokenizer=identity,
                             preprocessor=None,
                             lowercase=None,
                             ngram_range=(1, 2))),
            # ('feature_selection', SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42), threshold='median')),
            # ('feature_selection', SelectPercentile(percentile=50)),
            ('feature_selection',
             SelectPercentile(score_func=chi2, percentile=90)),
            # ('to_dense', DenseTransformer()),
            # ('standardization', StandardScaler(with_mean=False)),
            # ('feature_selection', VarianceThreshold(threshold=(.8 * (1 - .8)))),
            ('classifier', classifier),
        ])
        # parameters = {
        # # 'vectorizer__max_features': [85000,100000,125000,150000]
        # # 'vectorizer__max_df': [0.5,0.6,0.7,0.8]
        # # 'classifier__loss': ['log', 'modified_huber', 'squared_hinge', 'perceptron']
        # 'classifier__multi_class': ['multinomial', 'ovr'],
        # 'classifier__solver': ['newton-cg', 'sag', 'saga', 'lbfgs']
        # }
        # grid = GridSearchCV(model,param_grid=parameters)
        # grid.fit(X,y)
        #
        # print("Best: %f using %s" % (grid.best_score_,
        # grid.best_params_))
        # means = grid.cv_results_['mean_test_score']
        # stds = grid.cv_results_['std_test_score']
        # params = grid.cv_results_['params']
        # for mean, stdev, param in zip(means, stds, params):
        #     print("%f (%f) with: %r" % (mean, stdev, param))
        # return grid
        model.fit(X, y)
        return model

    # Label encode the targets
    labels = LabelEncoder()
    y = labels.fit_transform(y)

    # Begin evaluation
    if verbose:
        print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=0)
    model = build(balanced, classifier, X_train, y_train)

    y_pred = model.predict(X_test)
    y_actual = pd.Series(y_test, name='Actual')
    y_predicted = pd.Series(y_pred, name='Predicted')
    df_confusion = pd.crosstab(y_actual,
                               y_predicted,
                               rownames=['Actual'],
                               colnames=['Predicted'],
                               margins=True)

    if verbose:
        print("Confusion Matrix:\n")
    print(df_confusion)

    if verbose:
        print("Classification Report:\n")
    print(clsr(y_test, y_pred, target_names=labels.classes_, digits=4))
    print(
        accuracy_score(y_test, y_pred, normalize=True, sample_weight=None) *
        100)
    # seed = 7
    # kfold = StratifiedKFold(n_splits=5)
    # scores = cross_val_score(model, X_train, y_train, cv=kfold)
    # print(scores)
    # print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

    if verbose:
        print("Building complete model and saving ...")
    model = build(balanced, classifier, X, y)
    model.labels_ = labels

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)

        print("Model written out to {}".format(outpath))

    return model
def train_test_split(features, labels, random_state):
    global features_train, features_test, labels_train, labels_test

    features_train, features_test, labels_train, labels_test = tts(
        features, labels, random_state=random_state)
Example #28
from sklearn import linear_model
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as tts

# In[2]:

data = np.genfromtxt('linear.csv', delimiter=',')
x_data = data[1:, 0, np.newaxis]
y_data = data[1:, 1, np.newaxis]

plt.scatter(x_data, y_data)
plt.show()

# In[3]:

x_train, x_test, y_train, y_test = tts(x_data, y_data, test_size=0.2)

# In[4]:

model = linear_model.LinearRegression()
model.fit(x_train, y_train)

# In[5]:

plt.scatter(x_data, y_data)
plt.plot(x_data, model.predict(x_data), c='r')
plt.show()

# In[6]:

model.score(x_test, y_test)
Example #29
def split_dataset(df):
    X = df.drop('G3', axis=1)
    y = df['G3']
    x_train, x_test, y_train, y_test = tts(X, y, test_size=0.2)
    return x_train, x_test, y_train, y_test
Example #30
def _make_dataset(X, y, split=False):
    if split:
        X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
        return Dataset(Split(X_train, X_test), Split(y_train, y_test))
    return Dataset(X, y)
Example #31
# In[73]:

X = df[['OverallQual', 'TotalBsmtSF', 'GrLivArea', 'GarageArea', '1stFlrSF']]

# In[74]:

y = df['SalePrice']

# In[75]:

from sklearn.model_selection import train_test_split as tts

# In[76]:

X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3, random_state=42)

# In[77]:

from sklearn.linear_model import LinearRegression

# In[78]:

reg = LinearRegression()

# In[79]:

reg.fit(X_train, y_train)

y_pred = reg.predict(X_test)

# In[80]:
    def __buildTrainingAndEvalDatasets(self):
        ###################################################################################
        # This method builds the train and eval datasets from the given pickled data file.
        ###################################################################################
        global HEADER_COLS, EMPTY_BUSINESS_ITEM_MSG
        try:
            # Check if the train file exists
            if os.path.exists(self.trainFile) is False:
                log.error(f"Pickle file '{self.trainFile}' does not exist!")
                return False

            # Check if the file has been read successfully
            dfPckl = pandas.read_pickle(self.trainFile)
            if not dfPckl is None:
                totalInitRows = len(dfPckl)
                log.debug(dfPckl[HEADER_COLS].head(50))
                time.sleep(60)

                # Remove rows corresponding to the sectors with the following labels:
                #    "cosmetics & fragrance inc", "herman inc", "ruger & company", "inc", "inc.",
                #    "inc (formerly acxiom)", "inc. (staten island", "incorporated", "ltd." and "na".
                log.debug(f"Removing rows where '{HEADER_COLS[1]} != {', '.join(SECTOR_LABELS)}'")
                dfPckl = dfPckl[dfPckl.eval(HEADER_COLS[1]).isin(SECTOR_LABELS)]
                log.debug(dfPckl[HEADER_COLS].head(50))
                time.sleep(60)

                # Remove rows corresponding to the value: "No business text found" in the column "text".
                log.debug(f"Removing rows where '{HEADER_COLS[0]} == {EMPTY_BUSINESS_ITEM_MSG}'..")
                dfPckl = dfPckl[dfPckl.eval(HEADER_COLS[0]).str.lower() != EMPTY_BUSINESS_ITEM_MSG.lower()]
                log.debug(dfPckl[HEADER_COLS].head(50))
                time.sleep(60)

                log.debug(f"Total records in the dataframe: {totalInitRows}.")
                log.debug(f"Total records in the dataframe that were removed: {totalInitRows - len(dfPckl)}.")
                log.debug(f"Total records in the dataframe for training and evaluation: {len(dfPckl)}.")

                # Apply pre-processing on the "text" column on multiple processors
                log.debug(f"Applying pre-processing on the '{HEADER_COLS[0]}' column..")
                with mproc.Pool(NUM_CPUs) as p:
                    dfPckl[HEADER_COLS[0]] = p.map(preprocessSequenceWithoutBreakingSentence, [text for text in dfPckl[HEADER_COLS[0]]])
                log.debug(dfPckl[HEADER_COLS].head(50))
                time.sleep(60)

                # Save the pre-processed dataframe to a pickle file
                try:
                    preProcPckl = os.path.join(os.path.split(self.trainFile)[0], os.path.split(self.trainFile)[1].split(".")[0] + ".preproc.pkl")
                    dfPckl.to_pickle(preProcPckl)
                    log.info(f"Successfully saved the pre-processed training file to '{preProcPckl}'.")
                except:
                    log.error("Error saving the pre-processed dataset to file.")

                # Split into train and eval datasets
                self.trainDataset, self.evalDataset = tts(dfPckl, test_size=0.33, shuffle=True, random_state=42)
                if not self.trainDataset is None and not self.evalDataset is None:
                    log.debug(f"Total records in the dataframe for training are '{len(self.trainDataset)}'.")
                    log.debug(f"Total records in the dataframe for evaluation are '{len(self.evalDataset)}'.")
                    log.info(f"Successfully generated train and eval datasets.")
                    return True
                else:
                    log.error(f"Error generating train and eval datasets. Cannot continue with finetuning.")
                    return False
            else:
                log.error(f"Error reading the pickle file '{self.trainFile}'. Cannot continue with finetuning.")
                return False
        except:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            err = f"** ERROR ** Error occurred while building training and eval datasets from the pickled file '{self.trainFile}'. Error is: {str(exc_type)}; {str(exc_value)}."
            raise Exception(err)
Example #33
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split as tts
import pickle

dataIris = load_iris()
df = pd.DataFrame(dataIris["data"], columns=["SL", "SW", "PL", "PW"])
df["target"] = dataIris["target"]
df["species"] = df["target"].apply(lambda x: dataIris["target_names"][x])

xtr, xts, ytr, yts = tts(df[["SL", "SW", "PL", "PW"]],
                         df["species"],
                         test_size=.1)

model = LogisticRegression(solver="lbfgs", multi_class="auto", max_iter=100000)
model.fit(xtr, ytr)

with open("modelPickle", "wb") as modPkl:
    pickle.dump(model, modPkl)
os.chdir("D:/GreyAtom/Datasets")
df = pd.read_csv("wbc.csv")


# In[22]:


X = df.drop(["diagnosis", "Unnamed: 32"], axis = 1)
y = df["diagnosis"]


# In[23]:


X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3, stratify=y, random_state=1)


# In[24]:


dtc = DecisionTreeClassifier(max_depth=4, min_samples_leaf=0.16, random_state=1)


# In[25]:


bc = BaggingClassifier(base_estimator=dtc, n_estimators=300, n_jobs=1)
bc.fit(X_train, y_train)

Example #35
    text = ' '.join(words)
    return text


print('Cleaning of the data taking place....')

data['Text'] = data['Text'].map(cleaning)

data['Score'] = data['Score'].replace({'positive':0,'negative':1})

x = data['Text'].values
y = data['Score'].values


# splitting the data
xtrain, xtest, ytrain, ytest = tts(x,y,test_size=0.2,stratify=y)

# converting it to categorical variable
ytrain = to_categorical(ytrain)
ytest = to_categorical(ytest)

# converting to text to sequences
tokenizer = Tokenizer(25000,lower=True,oov_token='UNK')
tokenizer.fit_on_texts(xtrain)
xtrain = tokenizer.texts_to_sequences(xtrain)
xtest = tokenizer.texts_to_sequences(xtest)

xtrain = pad_sequences(xtrain,maxlen=100,padding='post')
xtest = pad_sequences(xtest,maxlen=100,padding='post')

print("Data preprocessing is over....")
Example #36
def ensemble_tsfresh(forecast_in, forecast_out, season, perd):

    #### Create rolled time series for ts feature extraction

    def tsfresh_run(forecast, season, insample=True, forecast_out=None):
        df_roll_prep = forecast.reset_index()
        if insample:
            df_roll_prep = df_roll_prep.drop(["Target", "Date"], axis=1)
            df_roll_prep["id"] = 1
            target = forecast["Target"]
        else:
            df_roll_prep = df_roll_prep.drop(["index"], axis=1)
            df_roll_prep["id"] = 1

        df_roll = roll_time_series(df_roll_prep,
                                   column_id="id",
                                   column_sort=None,
                                   column_kind=None,
                                   rolling_direction=1,
                                   max_timeshift=season - 1)
        counts = df_roll['id'].value_counts()
        df_roll_cut = df_roll[df_roll['id'].isin(
            counts[counts >= season].index)]

        ### TS feature extraction
        concat_df = pd.DataFrame()
        #rap = 4 ## Change this to suit your memory capacity, the lower the more memory
        concat_df = extract_features(df_roll_cut.ffill(),
                                     column_id="id",
                                     column_sort="sort",
                                     n_jobs=season,
                                     show_warnings=False,
                                     disable_progressbar=True)

        if insample:

            concat_df = concat_df.dropna(axis=1, how="all")
            concat_df.index = target[df_roll_cut['id'].value_counts().
                                     index].sort_index().to_frame().index
            concat_df = pd.merge(target[df_roll_cut['id'].value_counts().
                                        index].sort_index().to_frame(),
                                 concat_df,
                                 left_index=True,
                                 right_index=True,
                                 how="left")
            concat_df_list = constant_feature_detect(data=concat_df,
                                                     threshold=0.95)
            concat_df = concat_df.drop(concat_df_list, axis=1)
        else:
            forecast_out.index.name = "Date"
            concat_df.index = forecast_out.index

        concat_df = impute(concat_df)

        return concat_df

    print("LightGBM ensemble have been successfully built")

    concat_df_drop_in = tsfresh_run(forecast_in, season, insample=True)

    extracted_n_selected = select_features(
        concat_df_drop_in.drop("Target", axis=1),
        concat_df_drop_in["Target"],
        fdr_level=0.01,
        n_jobs=12)  # fdr is the significance level.

    forecast_out_add = pd.concat((forecast_in.iloc[-season + 1:, :].drop(
        ["Target"], axis=1), forecast_out),
                                 axis=0)
    concat_df_drop_out = tsfresh_run(forecast_out_add,
                                     season,
                                     insample=False,
                                     forecast_out=forecast_out)
    extracted_n_selected_out = concat_df_drop_out[extracted_n_selected.columns]

    ## Reduce the dimensions of generated time series features

    pca2 = PCA(n_components=8)
    pca2.fit(extracted_n_selected)
    pca2_results_in = pca2.transform(extracted_n_selected)
    pca2_results_out = pca2.transform(extracted_n_selected_out)

    cols = 0
    for i in range(pca2_results_in.shape[1]):
        cols = cols + 1
        extracted_n_selected["pca_" + str(i)] = pca2_results_in[:, i]
        extracted_n_selected_out["pca_" + str(i)] = pca2_results_out[:, i]

    df = forecast_in.iloc[season - 1:, :].copy()
    df = time_feature(df, perd)
    df["mean"] = df.drop(["Target"], axis=1).mean(axis=1)

    df_new = pd.concat(
        (df.reset_index(),
         extracted_n_selected.iloc[:, -cols:].reset_index(drop=True)),
        axis=1)
    df_new = df_new.set_index("Date")
    forecast_train, forecast_test = tts(df_new,
                                        train_size=0.5,
                                        shuffle=False,
                                        stratify=None)
    target = "Target"
    d_train = lgb.Dataset(forecast_train.drop(columns=[target]),
                          label=forecast_train[target])
    #d_valid = lgb.Dataset(forecast_test.drop(columns=[target]), label=forecast_test[target])
    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmsle',
        'max_depth': 6,
        'learning_rate': 0.1,
        'verbose': 0,
        'num_threads': 16
    }

    model = lgb.train(params, d_train, 100, verbose_eval=1)

    ensemble_ts = pd.DataFrame(index=forecast_test.index)

    ensemble_ts["ensemble_ts"] = model.predict(
        forecast_test.drop(columns=[target]))

    df_out = forecast_out.copy()
    df_out = time_feature(df_out, perd)
    df_out["mean"] = df_out.mean(axis=1)

    ensemble_ts_out = pd.DataFrame(index=df_out.index)
    ensemble_ts_out["ensemble_ts"] = model.predict(df_out)

    print("LightGBM ensemble have been successfully built")

    return ensemble_ts, ensemble_ts_out
Example #37
SVMAccuracycuracy = 0
accuracy_score = 0
logAccuracy = 0
gnbAccuracy = 0
ncAccuracy = 0

myAccuracy = 0

iterations = 5
print(" Iterating cross validation : ", end="")
for i in range(iterations):

    print(i)

    X_train, X_test, y_train, y_test = tts(
        data1, trainlabels, test_size=0.3)

    newRows = len(X_train)
    newCols = len(X_train[0])
    newRowst = len(X_test)
    newColst = len(X_test[0])

    newRowsL = len(y_train)


    PearFeatures = PearsonCorrtin(X_train, y_train, features)

    allFeatures.append(PearFeatures)
    argument = copy.deepcopy(PearFeatures)

    data_fea = dataCreation(argument, X_train)
from imblearn.under_sampling import (EditedNearestNeighbours,
                                     RepeatedEditedNearestNeighbours)

print(__doc__)

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=1.25, weights=[0.3, 0.7],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=5, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)

# Create the samplers
enn = EditedNearestNeighbours()
renn = RepeatedEditedNearestNeighbours()

# Create the classifier
knn = KNN(1)

# Make the splits
X_train, X_test, y_train, y_test = tts(X, y, random_state=42)

# Add one transformer and two samplers to the pipeline object
pipeline = make_pipeline(pca, enn, renn, knn)

pipeline.fit(X_train, y_train)
y_hat = pipeline.predict(X_test)

print(classification_report(y_test, y_hat))
Example #39
@author: KARIS
"""
#import data
import pandas as pd
df = pd.read_csv('Bahubali2_vs_Dangal.csv')


#splitting dependent and independent values
features= df.iloc[:,:1].values
lab_bahu = df.iloc[: ,1:2].values
lab_dang = df.iloc[:,2:3].values


#splitting test and train
from sklearn.model_selection import train_test_split as tts
features_train, features_test, lab_bahu_train, lab_bahu_test,lab_dang_train , lab_dang_test = tts(
        features , lab_bahu, lab_dang , test_size = 0.2 ,random_state = 0)

#fitting data on bahubali's train
from sklearn.linear_model import LinearRegression
reg_bahu = LinearRegression()
reg_bahu.fit(features_train,lab_bahu_train)


#fitting data on dangal's train
reg_dang = LinearRegression()
reg_dang.fit(features_train,lab_dang_train)

#predicting income on the 10th day (predict expects a 2D array)
lab_pred_bahu = reg_bahu.predict([[10]])
lab_pred_dang = reg_dang.predict([[10]])
# data set separation
from sklearn.model_selection import train_test_split as tts
from decisionboundary import plot_decision_boundary as pdb
from decisionboundary import marker_list_all, color_list_all
xxor = np.random.randn(350,2)
yxor = np.logical_xor(xxor[:,0]>0, xxor[:,1]>0)
# visualize how they look
## class False
plt.scatter (xxor[yxor==False, 0], xxor[yxor==False, 1],
             marker=marker_list_all[0], c=color_list_all[0], label='Class False')
## class 1
plt.scatter (xxor[yxor==True, 0], xxor[yxor==True, 1],
             marker=marker_list_all[1], c=color_list_all[1], label='Class True')
plt.legend (loc = 0)
plt.xlim (xxor[:,0].min(), xxor[:,0].max())
plt.ylim (xxor[:,1].min(), xxor[:,1].max())
plt.tight_layout ()
pic1 = 'scatter-show.pdf'
plt.savefig (pic1)
plt.show ()

# separating data set
xtr, xte, ytr, yte = tts (xxor, yxor, test_size = 0.3)
# The following is for classifying
svc0 = SVC (C=100.0, kernel='rbf')
svc0.fit (xtr, ytr)
ypd = svc0.predict (xte)
print ("accuracy: ", svc0.score (xte, yte))

pdb (xxor, yxor, classifier=svc0)
## plot each class
for ind, val in enumerate (np.unique(y)):
    plt.scatter (x[y==val,0], x[y==val,1],
                 marker = marker_list_all[ind],
                 c = color_list_all2[ind],
                 label='Class '+str(val))
plt.legend (loc = 0)
plt.xlim (x[:,0].min(), x[:,0].max())
plt.ylim (x[:,1].min(), x[:,1].max())
plt.tight_layout ()
pic1 = 'random-forest-scat.pdf'
plt.savefig (pic1)
plt.show ()

# separating data set
xtr, xte, ytr, yte = tts (x, y, test_size = 0.3, random_state=0)
# standardizing the data
sc0 = SC ()
sc0.fit (xtr)
xtr_std = sc0.transform (xtr)
xte_std = sc0.transform (xte)
# The following is for classifying
rfc =  RFC(criterion="entropy",
           n_estimators=50,
           random_state=1)
rfc.fit (xtr_std, ytr)
ypd = rfc.predict (xte_std)
print ("accuracy: ", rfc.score (xte_std, yte))

pdb (x, y, classifier=rfc, standardizer=sc0)
Example #42
d = match_data.drop('winner', axis=1).values

for i in d:
    home = i[1]
    away = i[0]
    great_match_arrays.append(np.hstack([i, team_standings.loc[(team_standings['team.ID'] == home)].values.flatten(), team_standings.loc[(team_standings['team.ID'] == away)].values.flatten()]))
final_data = np.stack(great_match_arrays)

print(final_data.shape)

target = match_data['winner'].values
pd.DataFrame(target).to_csv('target.csv')

target = match_data['winner'].values

X_train, X_test, y_train, y_test = tts(final_data, target, train_size=0.75, test_size=0.25, random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# NOTE: Make sure that the class is labeled 'target' in the data file

# Score on the training set was:0.6928853754940711
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=RandomForestClassifier(bootstrap=True, criterion="gini", max_features=0.5, min_samples_leaf=20, min_samples_split=5, n_estimators=100)),
    DecisionTreeClassifier(criterion="gini", max_depth=6, min_samples_leaf=16, min_samples_split=15)
)

exported_pipeline.fit(X_train, y_train)
results = exported_pipeline.predict(final_data)

def predict_game(away, home):
Example #43
def _make_dataset(X, y, split=False):
    if split:
        X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
        return Dataset(Split(X_train, X_test), Split(y_train, y_test))
    return Dataset(X, y)
Example #44
def load_data(filepath):
    data = pd.read_csv(filepath)
    labels = data.iloc[:, 0]
    data = data.iloc[:, 1:]

    cat_cols_idx = sorted([data.columns.get_loc(c) for c in categorical_cols])

    d_train, d_test, y_train, y_test = tts(data, labels,
                                           test_size=0.3,
                                           random_state=42)
    test_idx = list(d_test.index.values)

    print('Generating oversampled datasets...')
    # SMOTE-NC Before
    d_train_b, y_train_b = SMOTENC(
        categorical_features=cat_cols_idx,
        k_neighbors=5, random_state=42).fit_resample(data, labels)

    d_train_b = np.delete(d_train_b, test_idx, axis=0)
    y_train_b = np.delete(y_train_b, test_idx, axis=0)

    d_test_b = deepcopy(d_test)

    # SMOTE-NC After
    d_train_a, y_train_a = SMOTENC(
        categorical_features=cat_cols_idx,
        k_neighbors=5, random_state=42).fit_resample(d_train, y_train)

    d_test_a = deepcopy(d_test)

    # Scale numeric features only
    print('Scaling numeric features...')
    scaler = StandardScaler()
    for i in range(d_train.shape[1]):
        col = data.columns[i]
        if col in categorical_cols:
            continue

        # Original
        d_train[[col]] = scaler.fit_transform(d_train[[col]])
        d_test[[col]] = scaler.transform(d_test[[col]])

        # SMOTE-NC Before
        d_train_b[:, i] = np.ravel(scaler.fit_transform(
            d_train_b[:, i].reshape(-1, 1)))

        d_test_b[[col]] = scaler.transform(d_test_b[[col]])

        # SMOTE-NC After
        d_train_a[:, i] = np.ravel(scaler.fit_transform(
            d_train_a[:, i].reshape(-1, 1)))

        d_test_a[[col]] = scaler.transform(d_test_a[[col]])

    # Original
    train_ldr = td.DataLoader(utils.ClockDrawingDataset(d_train, y_train),
                              batch_size=10,
                              shuffle=True,
                              num_workers=0)
    test_ldr = td.DataLoader(utils.ClockDrawingDataset(d_test, y_test),
                             batch_size=10,
                             shuffle=False,
                             num_workers=0)

    # SMOTE-NC Before
    train_ldr_b = td.DataLoader(utils.ClockDrawingDataset(d_train_b,
                                                            y_train_b),
                                batch_size=10,
                                shuffle=True,
                                num_workers=0)
    test_ldr_b = td.DataLoader(utils.ClockDrawingDataset(d_test_b,
                                                           y_test),
                               batch_size=10,
                               shuffle=False,
                               num_workers=0)

    # SMOTE-NC After
    train_ldr_a = td.DataLoader(utils.ClockDrawingDataset(d_train_a,
                                                            y_train_a),
                                batch_size=10,
                                shuffle=True,
                                num_workers=0)
    test_ldr_a = td.DataLoader(utils.ClockDrawingDataset(d_test_a, y_test),
                               batch_size=10,
                               shuffle=False,
                               num_workers=0)

    return [train_ldr, train_ldr_b, train_ldr_a], \
           [test_ldr, test_ldr_b, test_ldr_a]
from sklearn.datasets import load_digits, load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split as tts

from yellowbrick.classifier import ConfusionMatrix


if __name__ == '__main__':
    digits = load_digits()
    digit_X = digits.data
    digit_y = digits.target
    d_X_train, d_X_test, d_y_train, d_y_test = tts(
        digit_X, digit_y, test_size=0.2
    )
    model = LogisticRegression()
    digit_cm = ConfusionMatrix(model, classes=[0,1,2,3,4,5,6,7,8,9])
    digit_cm.fit(d_X_train, d_y_train)
    digit_cm.score(d_X_test, d_y_test)
    d = digit_cm.poof(outpath="images/confusion_matrix_digits.png")


    iris = load_iris()
    iris_X = iris.data
    iris_y = iris.target
    iris_classes = iris.target_names
    i_X_train, i_X_test, i_y_train, i_y_test = tts(
        iris_X, iris_y, test_size=0.2
    )
    model = LogisticRegression()
    iris_cm = ConfusionMatrix(
        model, classes=iris_classes,
Example #46
def build_and_evaluateSVM(X,
                          y,
                          n=None,
                          classifier=svm.SVC,
                          outpath=None,
                          verbose=True):
    """
    Builds a classifier for the given list of documents and targets in two
    stages: the first does a train/test split and prints a classifier report,
    the second rebuilds the model on the entire corpus and returns it for
    operationalization.
    X: a list or iterable of raw strings, each representing a document.
    y: a list or iterable of labels, which will be label encoded.
    Can specify the classifier to build with: if a class is specified then
    this will build the model with the Scikit-Learn defaults, if an instance
    is given, then it will be used directly in the build pipeline.
    If outpath is given, this function will write the model as a pickle.
    If verbose, this function will print out information to the command line.
    """
    @timeit
    def build(classifier, X, y=None):
        """
        Inner build function that builds a single model.
        """
        if isinstance(classifier, type):
            classifier = classifier(kernel='rbf')

        gridsearch_pipe = Pipeline([
            # ('preprocessor', TextNormalizer_lemmatize()),
            ('vectorizer',
             TfidfVectorizer(tokenizer=identity,
                             preprocessor=None,
                             lowercase=False,
                             ngram_range=(1, 2))),
            ('classifier', classifier),
        ])

        # maxdf = [0.85, 0.90, 0.95]
        # mindf = (4, 3, 2)
        # nfeat = [12000, 12500, 13000]
        # ngrams = [(1, 1), (1, 2), (1,3)]
        # # Cs = [0.001, 0.01, 0.1, 1, 10]
        # # gammas = [0.001, 0.01, 0.1, 1]
        # param_grid = {
        #     # 'classifier__C': Cs, 'classifier__gamma' : gammas,
        #     'vectorizer__max_df':maxdf, 'vectorizer__min_df':mindf, 'vectorizer__ngram_range':ngrams, 'vectorizer__max_features':nfeat
        #     }
        # grid_search = GridSearchCV(gridsearch_pipe, param_grid, cv=10)
        # grid_search.fit(X, y)
        # best_param = grid_search.best_params_
        # print(best_param)

        # vectorizer = TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False,
        # max_df=best_param['vectorizer__max_df'], min_df=best_param['vectorizer__min_df'],
        # ngram_range=best_param['vectorizer__ngram_range'], max_features=best_param['vectorizer__max_features'])
        # classifier = svm.SVC(kernel='rbf', C=best_param['classifier__C'], gamma=best_param['classifier__gamma'])

        vectorizer = TfidfVectorizer(tokenizer=identity,
                                     preprocessor=None,
                                     lowercase=False,
                                     ngram_range=(1, 2),
                                     max_features=12000,
                                     max_df=0.85,
                                     min_df=4)
        classifier = svm.SVC(kernel='rbf', C=10, gamma=1)

        model = Pipeline([
            # ('preprocessor', TextNormalizer_lemmatize()),
            ('vectorizer', vectorizer),
            ('classifier', classifier),
        ])
        model.fit(X, y)

        return model

    # Label encode the targets
    labels = LabelEncoder()
    y = labels.fit_transform(y)

    # Begin evaluation
    if n:
        if verbose: print("splitting test and test set by: " + str(n))
        n_samples = len(y)
        indicies = np.arange(n_samples)
        X_train, X_test, y_train, y_test, idx_train, idx_test = tts(
            X, y, indicies, test_size=n, stratify=y)
        # X_train, X_test, y_train, y_test = X[:n], X[n:], y[:n], y[n:]
        print(len(X_train), len(X_test))
        from collections import Counter
        print(Counter(y_train))

        model, secs = build(classifier, X_train, y_train)
        model.labels_ = labels

        if verbose:
            print("Evaluation model fit in {:0.3f} seconds".format(secs))
        y_pred = model.predict(X_test)

        if verbose: print("Classification Report:\n")
        print(clsr(y_test, y_pred, target_names=labels.classes_))
        print(cm(y_test, y_pred))
        print('acc', accuracy_score(y_test, y_pred))
        print('f1', f1_score(y_test, y_pred, average='weighted'))

    else:
        if verbose: print("Building for evaluation with full set")
        model, secs = build(classifier, X, y)
        model.labels_ = labels

        if verbose:
            print("Evaluation model fit in {:0.3f} seconds".format(secs))
        y_pred = model.predict(X)

        if verbose: print("Classification Report:\n")
        print(clsr(y, y_pred, target_names=labels.classes_))
        print(cm(y, y_pred))
        print(accuracy_score(y, y_pred))

    if verbose: print("Evaluation of naive prediction ...")
    y_naive = [0] * len(y_test)
    print(type(y_test))
    print('acc naive', accuracy_score(y_test, y_naive))

    if verbose: print("Complete model fit in {:0.3f} seconds".format(secs))

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)

        print("Model written out to {}".format(outpath))

    return model, y_pred, idx_test
Example #47
tfv = TfidfVectorizer(min_df=1, stop_words='english')

data = pd.read_csv('BankFAQs.csv')
questions = data['Question'].values

X = []
for question in questions:
    X.append(cleanup(question))

tfv.fit(X)
le.fit(data['Class'])

X = tfv.transform(X)
y = le.transform(data['Class'])

trainx, testx, trainy, testy = tts(X, y, test_size=.25, random_state=42)

model = SVC(kernel='linear')
model.fit(trainx, trainy)
print("SVC:", model.score(testx, testy))


def get_max5(arr):
    ixarr = []
    for ix, el in enumerate(arr):
        ixarr.append((el, ix))
    ixarr.sort()

    ixs = []
    for i in ixarr[-5:]:
        ixs.append(i[1])
Example #48
    # Merge the data
    if i_month == para.month_in_sample[0]:    # first month
        data_in_sample = data_curr_month    # create the in-sample DataFrame and start filling it from the first month
    else:
        data_in_sample = data_in_sample.append(data_curr_month)    # for every month after the first, append to the existing data

#%% Data preprocessing
# Split the in-sample set into a training set and a cross-validation set, then use PCA
# to reduce dimensionality and remove collinearity between factors.

# Take the in-sample data
X_in_sample = data_in_sample.loc[:, 'EP':'bias']    # slice: all rows, all 70 factor columns (## what if column names are duplicated?)
Y_in_sample = data_in_sample.loc[:, 'return_bin']    # slice: all rows, the label column

# Randomly split the in-sample data into training and cross-validation sets
X_train, X_cv, Y_train, y_cv = tts(X_in_sample, Y_in_sample, test_size=para.percent_cv, random_state=para.seed)

# PCA
pca = decomposition.PCA(n_components=0.95)    # a float in (0, 1) keeps enough components to explain that fraction of variance; an integer > 1 keeps that many components
pca.fit(X_train)    # fit PCA on the training set
X_train = pca.transform(X_train)    # transform the training set with the fitted PCA model
X_cv = pca.transform(X_cv)    # transform the cross-validation set with the fitted PCA model

# Standardize the data
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_cv = scaler.transform(X_cv)

#%% Core model setup

if para.method == 'SVM':
Пример #49
0
from build import models, reader
from build import labels as categories
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import classification_report

docs = reader.fileids(categories=categories)
labels = [reader.categories(fileids=[fid])[0] for fid in docs]

train_docs, test_docs, train_labels, test_labels = tts(docs, labels, test_size=0.2)

def get_docs(fids):
    for fid in fids:
        yield list(reader.docs(fileids=[fid]))

sgd = models[3]
nby = models[4]


sgd.fit(get_docs(train_docs), train_labels)
y_pred = sgd.predict(get_docs(test_docs))

print(classification_report(test_labels, y_pred, labels=categories))


import nltk

def preprocess(text):
    return [
        [
            list(nltk.pos_tag(nltk.word_tokenize(sent)))
            for sent in nltk.sent_tokenize(para)
n_users = len(ratings_df_sample['userId'].unique())
n_movies = len(ratings_df_sample['movieId'].unique())
(n_users, n_movies)

movie_ids = ratings_df_sample['movieId'].unique()


def scale_movie_id(movie_id):
    scaled = np.where(movie_ids == movie_id)[0][0] + 1
    return scaled

ratings_df_sample['movieId'] = ratings_df_sample['movieId'].apply(scale_movie_id)
ratings_df_sample.head()

train_data, test_data = tts(ratings_df_sample, test_size=0.2)

print('Train shape: {}'.format(train_data.shape))
print('Test shape: {}'.format(test_data.shape))


def rmse(prediction, ground_truth):
    prediction = np.nan_to_num(prediction)[ground_truth.nonzero()].flatten()
    ground_truth = np.nan_to_num(ground_truth)[ground_truth.nonzero()].flatten()
    
    mse = mean_squared_error(prediction, ground_truth)
    return sqrt(mse)
train_data_matrix = np.zeros((n_users, n_movies))
for line in train_data.itertuples():
    train_data_matrix[line[1] - 1, line[2] - 1] = line[3]
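
# Hedged sketch, not part of the original snippet: build the test matrix the
# same way and score a simple user-based collaborative-filtering prediction
# with the rmse helper defined above.
from sklearn.metrics.pairwise import cosine_similarity

test_data_matrix = np.zeros((n_users, n_movies))
for line in test_data.itertuples():
    test_data_matrix[line[1] - 1, line[2] - 1] = line[3]

user_similarity = cosine_similarity(train_data_matrix)
# predict each user's ratings as a similarity-weighted average over all users
pred = user_similarity.dot(train_data_matrix) / np.abs(user_similarity).sum(axis=1, keepdims=True)
print('User-based CF RMSE: {:.4f}'.format(rmse(pred, test_data_matrix)))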
    
Example #51
        for entry in dic:
            data.append(palvrs.count(entry[0]))
        feature_set.append(data)

        if "ham" in email:
            labels.append(0)
        if "spam" in email:
            labels.append(1)
        c = c - 1
    return feature_set, labels


d = construir_dicionario()
features, labels = criar_dataset(d)

print(" Dividindo nosso dataset entre treino e teste, 80% treino, 20% teste,")
print(" utilizando as features e labels de treino e de teste...\n")
x_train, x_test, y_train, y_test = tts(features, labels, test_size=0.2)

print(
    " Criando o modelo de Machine Learning com base no dataset de treino...\n")
clf = MultinomialNB()
clf.fit(x_train, y_train)

print(" Realizando predicao para testar o modelo que acabamos de criar...\n")
pred = clf.predict(x_test)
print(" Pontuacao de acuracia atingida pelo nosso modelo: ", end='')
print(accuracy_score(y_test, pred))
print("\n Salvando modelo em forma de arquivo para uso posterior.\n")
salvar_modelo(clf, "text-classifier.mdl")
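
# Hedged sketch, not part of the original script: reload the saved classifier
# later and reuse it (assuming salvar_modelo pickled the model to disk).
import pickle

with open("text-classifier.mdl", "rb") as f:
    loaded_clf = pickle.load(f)
print("Reloaded model accuracy:", accuracy_score(y_test, loaded_clf.predict(x_test)))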
Example #52
    #print("this is labels \n",labels)
    return feature_set, labels


# In[5]:

d = make_dict()
features, labels = make_dataset(d)

# In[6]:

# Import train_test_split to split the dataset into train and test sets
from sklearn.model_selection import train_test_split as tts
x_train, x_test, y_train, y_test = tts(
    features, labels,
    test_size=0.3)  # 70% of the data is used for training and 30% for testing

# In[10]:

# Train the model with a Multinomial Naive Bayes classifier
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
clf1 = MultinomialNB()
clf1.fit(x_train, y_train)

# Evaluate the Naive Bayes classifier: accuracy and confusion matrix
from sklearn.metrics import accuracy_score
predict1 = clf1.predict(x_test)
print(accuracy_score(y_test, predict1))
print(" Confusion matrix ", confusion_matrix(y_test, predict1))
Example #53
def main():
    if len(sys.argv)!=4:
        print 'USAGE:'
        print 'python -m scoop devel.py [cloneID] [clusterDir] [outputDir]'
        print 'see devel_config.py'
        return

    cloneID = sys.argv[1]
    clusterDir = sys.argv[2]; assert clusterDir[-1]=='/',"should end with '/'"
    baseOutDir = sys.argv[3]; assert baseOutDir[-1]!='/',"should NOT end with '/'"

    clfParam = None
    method = cfg['method']
    if method=='esvm':
        from esvm_config import config as clfParam
    elif method=='psvm':
        from psvm_config import config as clfParam
    else:
        print 'FATAL: unknown method'
        return

    outDir = os.path.join(baseOutDir,'devel-'+os.path.basename(baseOutDir))
    if not(os.path.isdir(baseOutDir)): os.makedirs(baseOutDir)
    if not(os.path.isdir(outDir)): os.makedirs(outDir)

    ## Load data ###################################################################################
    dataLog = {}; dataLogFpath = os.path.join(outDir,'data_log_'+os.path.basename(baseOutDir)+'.json')
    dataset = clusterDir.split('/')[-2].split('-')[-1]; dataLog['dataset'] = dataset
    datasetParams = dataset.split('#')
    assert datasetParams[0]=='yamanishi'

    xyDevFpath = os.path.join(baseOutDir,'_'.join(['xdev','ydev','xrel','yrel']+datasetParams)+'.h5')
    if os.path.exists(xyDevFpath):
        print 'loading data from PREVIOUS...'

        with h5py.File(xyDevFpath,'r') as f:
            xdev = f['xdev'][:]
            ydev = f['ydev'][:]
            xrel = f['xrel'][:]
            yrel = f['yrel'][:]
            xrelraw = f['xrelraw'][:]

        with open(dataLogFpath,'r') as f:
            dataLog = yaml.load(f)

    else:
        print 'loading data FRESHLY...'

        print 'loading cluster result...'
        nUnlabels = []
        statFnames = [i for i in os.listdir(clusterDir) if 'labels_stat.json' in i]
        for i in statFnames:
            with open(os.path.join(clusterDir,i),'r') as f: stat = yaml.load(f)
            nUnlabels.append(stat['0'])

        # use the cluster with minimum numbers of unlabeled samples
        metric = '_'.join(statFnames[ nUnlabels.index(min(nUnlabels)) ].split('_')[0:2])
        dataLog['metric'] = metric

        connFpath = os.path.join(clusterDir,metric+'_labels.pkl')
        with open(connFpath,'r') as f:
            data = pickle.load(f)

        ##
        print 'getting devel and release data...'
        xraw = []; yraw = []
        for k,v in data.iteritems():
            for vv in v:
                xraw.append(vv)
                yraw.append(k)

        devIdx = [i for i in range(len(xraw)) if yraw[i]!=0]
        xdev = [xraw[i] for i in devIdx]
        ydev = [yraw[i] for i in devIdx]

        relIdx = [i for i in range(len(xraw)) if yraw[i]==0]
        xrel = [xraw[i] for i in relIdx]
        yrel = [yraw[i] for i in relIdx]

        dataLog['nDevel'] = len(devIdx); dataLog['nData'] = len(yraw)
        dataLog['rDevel:Data'] = dataLog['nDevel']/float(dataLog['nData'])
        dataLog['nDevel(+)'] = len( [i for i in ydev if i==1] ); assert dataLog['nDevel(+)']!=0
        dataLog['nDevel(-)'] = len( [i for i in ydev if i==-1] ); assert dataLog['nDevel(-)']!=0
        dataLog['rDevel(+):Devel'] = float(dataLog['nDevel(+)'])/dataLog['nDevel']
        dataLog['rDevel(-):Devel'] = float(dataLog['nDevel(-)'])/dataLog['nDevel']
        dataLog['rDevel(+):(-)'] = float(dataLog['nDevel(+)'])/float(dataLog['nDevel(-)'])
        dataLog['nRelease'] = len(relIdx);
        dataLog['rRelease:Data'] = dataLog['nRelease']/float(dataLog['nData'])

        ##
        print 'loading com, pro feature...'
        krFpath = os.path.join(cfg['datasetDir'],datasetParams[0],'feature',
                               'klekotaroth','klekotaroth-'+datasetParams[1]+'.h5')
        aacFpath = os.path.join(cfg['datasetDir'],datasetParams[0],'feature',
                                'amino-acid-composition','amino-acid-composition-'+datasetParams[1]+'.h5')

        krDict = {}; aacDict = {}
        with h5py.File(krFpath, 'r') as f:
            for com in [str(i) for i in f.keys()]:
                krDict[com] = f[com][:]
        with h5py.File(aacFpath, 'r') as f:
            for pro in [str(i) for i in f.keys()]:
                aacDict[pro] = f[pro][:]
                # aacDict[pro] = list( fu.map(lambda x: float('%.2f'%(x)),f[pro][:]) ) # rounding

        comFeaLenOri = len(krDict.values()[0])
        proFeaLenOri = len(aacDict.values()[0])

        ##
        msg = 'extract (com,pro) feature... dims: '+str(comFeaLenOri)+','+str(proFeaLenOri)
        msg += ' of '+str(len(ydev))+' and '+str(len(yrel))
        print msg

        sh.setConst(krDict=krDict)
        sh.setConst(aacDict=aacDict)
        xdevf = list( fu.map(cutil.extractComProFea,xdev) )
        xrelf = list( fu.map(cutil.extractComProFea,xrel) )

        ##
        xyDevList = cutil.divideSamples(xdevf,ydev,cfg['smoteBatchSize'])
        if cfg['maxNumberOfSmoteBatch'] != 0:
            xyDevList = xyDevList[0:cfg['maxNumberOfSmoteBatch']]

        smoteSeed = util.seed(); dataLog['smoteSeed'] = smoteSeed
        sh.setConst(smoteSeed=smoteSeed)

        print 'resampling via Smote FRESHLY... '+str(len(xyDevList))+' smote(s)'+' on '+str(len(ydev))
        smoteTic = time.time()

        xdevfr = []; ydevr = []
        xydevfrList = list( fu.map(ensembleSmote,xyDevList) )
        for xdevfri,ydevri in xydevfrList:
            for x in xdevfri: xdevfr.append(x.tolist())
            for y in ydevri: ydevr.append(y)
        assert len(xdevfr)==len(ydevr),'len(xdevfr)!=len(ydevr)'

        dataLog['nSmote'] = len(xyDevList)
        dataLog['nDevelResampled'] = len(ydevr)
        dataLog['rDevelResampled:Data'] = dataLog['nDevelResampled']/float(dataLog['nData'])
        dataLog['nDevelResampled(+)'] = len( [i for i in ydevr if i==1] )
        dataLog['nDevelResampled(-)'] = len( [i for i in ydevr if i==-1] )
        dataLog['rDevelResampled(+):DevelResampled'] = dataLog['nDevelResampled(+)']/float(dataLog['nDevelResampled'])
        dataLog['rDevelResampled(-):DevelResampled'] = dataLog['nDevelResampled(-)']/float(dataLog['nDevelResampled'])
        dataLog['rDevelResampled(+):(-)'] = dataLog['nDevelResampled(+)']/float(dataLog['nDevelResampled(-)'])
        dataLog['timeSMOTE'] =  str(time.time()-smoteTic)

        ##
        print 'update xdev,ydev,xrel... '+str(np.asarray(xdevfr).shape)
        xrelraw = xrel[:] # raw: feature is NOT extracted
        xrel = xrelf[:]
        xdev = xdevfr[:]
        ydev = ydevr[:]

        print 'writing updated xdev,ydev and xrel,yrel...'
        with h5py.File(xyDevFpath,'w') as f:
            f.create_dataset('xdev',data=xdev,dtype=np.float32)
            f.create_dataset('ydev',data=ydev,dtype=np.int8)
            f.create_dataset('xrel',data=xrel,dtype=np.float32)
            f.create_dataset('yrel',data=yrel,dtype=np.int8)
            f.create_dataset('xrelraw',data=xrelraw)

        print 'writing dataLog...'
        dataLog['nCom'] = len(krDict)
        dataLog['nPro'] = len(aacDict)
        with open(dataLogFpath,'w') as f:
            json.dump(dataLog,f,indent=2,sort_keys=True)

    ## TUNE+TRAIN+TEST #############################################################################
    devLog = {}
    devSeed = util.seed(); dataLog['devSeed'] = devSeed
    tag = '_'.join([method+'#'+cloneID,dataset,util.tag()])

    ## split devel dataset
    msg = ' '.join( ['devel',dataset,cloneID])
    xtr,xte,ytr,yte = tts(xdev,ydev,test_size=cfg['testSize'],
                          random_state=devSeed,stratify=ydev)

    if cfg['maxTestingSamples']>0:
        chosenIdx = np.random.randint(len(xte),size=cfg['maxTestingSamples'])
        xte = [xte[i] for i in chosenIdx]; yte = [yte[i] for i in chosenIdx]

    devLog['nTraining'] = len(xtr)
    devLog['nTraining(+)'] = len([i for i in ytr if i==1])
    devLog['nTraining(-)'] = len([i for i in ytr if i==-1])
    devLog['rTraining(+):(-)'] = devLog['nTraining(+)']/float(devLog['nTraining(-)'])
    devLog['rTraining:Devel'] = devLog['nTraining']/float(dataLog['nDevelResampled'])
    devLog['nTesting'] = len(xte)
    devLog['nTesting(+)'] = len([i for i in yte if i==1])
    devLog['nTesting(-)'] = len([i for i in yte if i==-1])
    devLog['rTesting(+):(-)'] = devLog['nTesting(+)']/float(devLog['nTesting(-)'])
    devLog['rTesting:Devel'] = devLog['nTesting']/float(dataLog['nDevelResampled'])

    ## tuning
    clf = None
    if method=='esvm':
        clf  = eSVM(simMat=None)
    elif method=='psvm':
        clf = svm.SVC(kernel=clfParam['kernel'],probability=True)

    ## training
    print msg+': fitting nTr= '+str(len(ytr))
    trTic = time.time()

    if method=='esvm':
        clf.fit(xtr,ytr)
        devLog['labels'] = clf.labels()
        devLog['nSVM'] = clf.nSVM()
        devLog['xtrDimAllBatches'] = clf.xtrDimAllBatches()
    elif method=='psvm':
        if cfg['method']['kernel']=='precomputed':
            assert False
            # simMatTr = cutil.makeComProKernelMatFromSimMat(xtr,xtr,simMat)
            # clf.fit(simMatTr,ytr)
        else:
            clf.fit(xtr,ytr)
        devLog['labels'] = clf.classes_.tolist()
    devLog['timeTraining'] = str(time.time()-trTic)

    ## testing
    print msg+': predicting nTe= '+str(len(yte))
    teTic = time.time()

    if method=='esvm':
        ypred,yscore = clf.predict(xte)
    elif method=='psvm':
        if cfg['method']['kernel']=='precomputed':
            assert False
            # simMatTe = cutil.makeComProKernelMatFromSimMat(xte,xtr,simMat)
            # ypred = clf.predict(simMatTe)
            # yscore = clf.predict_proba(simMatTe)
        else:
            ypred = clf.predict(xte)
            yscore = clf.predict_proba(xte)
            yscore = [max(i.tolist()) for i in yscore]
    devLog['timeTesting'] = str(time.time()-teTic)

    ## TEST RELEASE ################################################################################
    print msg+': predicting RELEASE n= '+str(len(yrel))
    relTic = time.time()

    if method=='esvm':
        yrel,yrelscore = clf.predict(xrel)
    elif method=='psvm':
        if cfg['method']['kernel']=='precomputed':
            assert False
            # simMatTe = cutil.makeComProKernelMatFromSimMat(xrel,xtr,simMat)
            # yrel = clf.predict(simMatTe)
            # yrelscore = clf.predict_proba(simMatTe)
        else:
            yrel = clf.predict(xrel)
            yrelscore = clf.predict_proba(xrel)
            yrelscore = [max(i.tolist()) for i in yrelscore]
    devLog['timeRelease'] = str(time.time()-relTic)

    ## WRITE RESULT ################################################################################
    result = {'yte':yte,'ypred':ypred,'yscore':yscore,
              'xrelraw':xrelraw,'yrel':yrel,'yrelscore':yrelscore}

    print 'writing prediction...'
    with h5py.File(os.path.join(outDir,'result_'+tag+'.h5'),'w') as f:
        for k,v in result.iteritems():
            if 'raw' in k:
                f.create_dataset(k,data=v)
            else:
                dt = np.int8
                if 'score' in k: dt = np.float32
                f.create_dataset(k,data=v,dtype=dt)

    ##
    print 'writing devLog...'
    devLog['clfParam'] = clfParam
    devLog['devParam'] = cfg
    with open(os.path.join(outDir,'devLog_'+tag+'.json'),'w') as f:
        json.dump(devLog,f,indent=2,sort_keys=True)
Example #54
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
california_housing_dataframe = pd.read_csv("https://download.mlcc.google.com/mledu-datasets/california_housing_train.csv", sep=",")
X = california_housing_dataframe[
    ["latitude",
     "longitude",
     "housing_median_age",
     "total_rooms",
     "total_bedrooms",
     "population",
     "households",
     "median_income"]]
# median_house_value is continuous, so binarize it (above/below the median)
# to get a valid classification target for logistic regression
median_value = california_housing_dataframe["median_house_value"].median()
y = (california_housing_dataframe["median_house_value"] > median_value).astype(int)
X.describe()
X_train, X_test, y_train, y_test = tts(X, y, random_state=0, test_size=0.2)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)
# log_loss expects predicted probabilities, not hard class labels
y_prob = classifier.predict_proba(X_test)[:, 1]
loss = log_loss(y_test, y_prob)
print(loss)
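# Hedged follow-up, not part of the original snippet: report accuracy of the
# hard class predictions alongside the log loss.
from sklearn.metrics import accuracy_score
print('accuracy:', accuracy_score(y_test, classifier.predict(X_test)))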