def test_multi_output_predict_proba():
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5, tol=1e-3)
    param = {'loss': ('hinge', 'log', 'modified_huber')}

    # inner function for custom scoring
    def custom_scorer(estimator, X, y):
        if hasattr(estimator, "predict_proba"):
            return 1.0
        else:
            return 0.0
    grid_clf = GridSearchCV(sgd_linear_clf, param_grid=param,
                            scoring=custom_scorer, cv=3, error_score=np.nan)
    multi_target_linear = MultiOutputClassifier(grid_clf)
    multi_target_linear.fit(X, y)

    multi_target_linear.predict_proba(X)

    # SGDClassifier defaults to loss='hinge' which is not a probabilistic
    # loss function; therefore it does not expose a predict_proba method
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5, tol=1e-3)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    multi_target_linear.fit(X, y)
    err_msg = "The base estimator should implement predict_proba method"
    with pytest.raises(ValueError, match=err_msg):
        multi_target_linear.predict_proba(X)
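
# Added sketch (not part of the original test file): these sklearn test snippets
# assume module-level fixtures X, y, n_samples, n_outputs and n_classes. A rough,
# assumed reconstruction, mirroring the make_classification pattern in Example #14:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.utils import shuffle

X, y1 = make_classification(n_samples=100, n_features=20, n_informative=5,
                            n_classes=3, random_state=1)
y2 = shuffle(y1, random_state=1)
y3 = shuffle(y1, random_state=2)
y = np.vstack((y1, y2, y3)).T      # three output columns per sample
n_samples, n_outputs = y.shape
n_classes = 3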
Example #2
def test_multi_output_classification():
    # test if multi_target initializes correctly with base estimator and fit
    # assert predictions work as expected for predict, predict_proba and score

    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    multi_target_forest = MultiOutputClassifier(forest)

    # train the multi_target_forest and also get the predictions.
    multi_target_forest.fit(X, y)

    predictions = multi_target_forest.predict(X)
    assert_equal((n_samples, n_outputs), predictions.shape)

    predict_proba = multi_target_forest.predict_proba(X)

    assert len(predict_proba) == n_outputs
    for class_probabilities in predict_proba:
        assert_equal((n_samples, n_classes), class_probabilities.shape)

    assert_array_equal(np.argmax(np.dstack(predict_proba), axis=1),
                       predictions)

    # train the forest with each column and assert that predictions are equal
    for i in range(3):
        forest_ = clone(forest)  # create a clone with the same state
        forest_.fit(X, y[:, i])
        assert_equal(list(forest_.predict(X)), list(predictions[:, i]))
        assert_array_equal(list(forest_.predict_proba(X)),
                           list(predict_proba[i]))
Example #3
def test_multiclass_multioutput_estimator_predict_proba():
    seed = 542

    # make test deterministic
    rng = np.random.RandomState(seed)

    # random features
    X = rng.normal(size=(5, 5))

    # random labels
    y1 = np.array(['b', 'a', 'a', 'b', 'a']).reshape(5, 1)  # 2 classes
    y2 = np.array(['d', 'e', 'f', 'e', 'd']).reshape(5, 1)  # 3 classes

    Y = np.concatenate([y1, y2], axis=1)

    clf = MultiOutputClassifier(LogisticRegression(random_state=seed))

    clf.fit(X, Y)

    y_result = clf.predict_proba(X)
    y_actual = [np.array([[0.23481764, 0.76518236],
                          [0.67196072, 0.32803928],
                          [0.54681448, 0.45318552],
                          [0.34883923, 0.65116077],
                          [0.73687069, 0.26312931]]),
                np.array([[0.5171785, 0.23878628, 0.24403522],
                          [0.22141451, 0.64102704, 0.13755846],
                          [0.16751315, 0.18256843, 0.64991843],
                          [0.27357372, 0.55201592, 0.17441036],
                          [0.65745193, 0.26062899, 0.08191907]])]

    for i in range(len(y_actual)):
        assert_almost_equal(y_result[i], y_actual[i])
Example #4
def test_multi_output_exceptions():
    # NotFittedError when fit is not done but score, predict and
    # predict_proba are called
    moc = MultiOutputClassifier(LinearSVC(random_state=0))
    assert_raises(NotFittedError, moc.predict, y)
    assert_raises(NotFittedError, moc.predict_proba, y)
    assert_raises(NotFittedError, moc.score, X, y)
    # ValueError when number of outputs is different
    # for fit and score
    y_new = np.column_stack((y1, y2))
    moc.fit(X, y)
    assert_raises(ValueError, moc.score, X, y_new)
def test_multiclass_multioutput_estimator():
    # check that a meta-estimator (OneVsRestClassifier) works as the base estimator
    svc = LinearSVC(random_state=0)
    multi_class_svc = OneVsRestClassifier(svc)
    multi_target_svc = MultiOutputClassifier(multi_class_svc)

    multi_target_svc.fit(X, y)

    predictions = multi_target_svc.predict(X)
    assert_equal((n_samples, n_outputs), predictions.shape)

    # train the classifier with each column and assert that predictions are equal
    for i in range(3):
        multi_class_svc_ = clone(multi_class_svc)  # create a clone
        multi_class_svc_.fit(X, y[:, i])
        assert_equal(list(multi_class_svc_.predict(X)), list(predictions[:, i]))
Example #6
def test_multi_output_classification_partial_fit_sample_weights():
    # weighted classifier
    Xw = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    yw = [[3, 2], [2, 3], [3, 2]]
    w = np.asarray([2., 1., 1.])
    sgd_linear_clf = SGDClassifier(random_state=1)
    clf_w = MultiOutputClassifier(sgd_linear_clf)
    clf_w.fit(Xw, yw, w)

    # unweighted, but with repeated samples
    X = [[1, 2, 3], [1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y = [[3, 2], [3, 2], [2, 3], [3, 2]]
    sgd_linear_clf = SGDClassifier(random_state=1)
    clf = MultiOutputClassifier(sgd_linear_clf)
    clf.fit(X, y)
    X_test = [[1.5, 2.5, 3.5]]
    assert_array_almost_equal(clf.predict(X_test), clf_w.predict(X_test))
Example #7
def test_multi_output_classification_sample_weights():
    # weighted classifier
    Xw = [[1, 2, 3], [4, 5, 6]]
    yw = [[3, 2], [2, 3]]
    w = np.asarray([2., 1.])
    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    clf_w = MultiOutputClassifier(forest)
    clf_w.fit(Xw, yw, w)

    # unweighted, but with repeated samples
    X = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
    y = [[3, 2], [3, 2], [2, 3]]
    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    clf = MultiOutputClassifier(forest)
    clf.fit(X, y)

    X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
    assert_almost_equal(clf.predict(X_test), clf_w.predict(X_test))
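
# Added sketch (assumption, not in the original test): both sample-weight tests
# rest on MultiOutputClassifier forwarding sample_weight to every per-output
# estimator; the equivalent explicit call for the first output column alone:
import numpy as np
from sklearn.ensemble import RandomForestClassifier

Xw = [[1, 2, 3], [4, 5, 6]]
w = np.asarray([2., 1.])
single = RandomForestClassifier(n_estimators=10, random_state=1)
single.fit(Xw, [3, 2], sample_weight=w)   # what the wrapper does per output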
Example #8
        bestcg.append([c,g,score.mean()])

bestCG = pd.DataFrame(bestcg,columns=['c','g','score'])
print("Best score is",max(bestCG["score"]))
bestCG.loc[bestCG["score"]==max(bestCG["score"])]

# transformation
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
StrainX = sc.fit_transform(trainX)
StestX = sc.transform(testX)

# training and fitting model
bii_model = MultiOutputClassifier(OneVsRestClassifier(SVC(C=100.0,gamma=1.0),n_jobs=-1),n_jobs=-1)
bii_Smodel = MultiOutputClassifier(OneVsRestClassifier(SVC(C=100.0,gamma=1.0),n_jobs=-1),n_jobs=-1)
bii_model.fit(trainX,trainy)
bii_Smodel.fit(StrainX,trainy)

# predicting
pred = bii_model.predict(testX)
Spred = bii_Smodel.predict(StestX)

print("Hamming Loss",hamloss(testy,pred))
hamlossL(testy,pred)

print("Exact Match Score",exactmatch(testy, pred))
exactmatchL(testy, pred)

print("Standardized data Hamming Loss",hamloss(testy,Spred))
hamlossL(testy,Spred)
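
# Added sketch (hypothetical): hamloss, hamlossL, exactmatch and exactmatchL are
# not defined in this excerpt; plausible stand-ins built on sklearn.metrics,
# with the *L variants assumed to report one value per label column:
import numpy as np
from sklearn.metrics import hamming_loss, accuracy_score

def hamloss(y_true, y_pred):
    return hamming_loss(y_true, y_pred)            # fraction of wrong labels

def hamlossL(y_true, y_pred):
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return [(y_true[:, j] != y_pred[:, j]).mean() for j in range(y_true.shape[1])]

def exactmatch(y_true, y_pred):
    return accuracy_score(y_true, y_pred)          # subset (exact-match) accuracy

def exactmatchL(y_true, y_pred):
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return [(y_true[:, j] == y_pred[:, j]).mean() for j in range(y_true.shape[1])]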
Example #9
# Establish a GridSearchCV variable with the classifier and parameter grid
Grid = GridSearchCV(estimator=forest, param_grid=param_grid, cv=5)
Grid2 = GridSearchCV(estimator=forest, param_grid=param_grid, cv=5)

# Fit the GridSearchCV with the training data
Gridfit = Grid.fit(xtrain, ytrain['Green Type'])
Gridfit2 = Grid2.fit(xtrain, ytrain['Fairway Type'])

print("Best Parameters for Predicting Green Turf", Grid.best_params_)
print("Best Parameters for Predicting Fairway Turf", Grid2.best_params_)

# Get the best model, use it in the Multi-Output Classifier, and make predictions
bestforest = Grid.best_estimator_
multi_target = MultiOutputClassifier(bestforest, n_jobs=1)
preds = multi_target.fit(xtrain, ytrain).predict(xtest)

preds = pd.DataFrame(preds, columns=['Green Type', 'Fairway Type'])
print(preds)

# Encode target data and predictions in numerics so they can be plugged into scoring metrics
le = preprocessing.LabelEncoder()
labels = pd.concat([y['Green Type'], y['Fairway Type']], axis=0)
lefit = le.fit(labels)
ymatrix = ytest.copy()
ymatrixp = preds.copy()
ytest['Green Type'] = lefit.transform(ytest['Green Type'])
ytest['Fairway Type'] = lefit.transform(ytest['Fairway Type'])
preds['Green Type'] = lefit.transform(preds['Green Type'])
preds['Fairway Type'] = lefit.transform(preds['Fairway Type'])
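
# Added sketch (assumption): the excerpt stops right after label-encoding; a
# minimal way to score the encoded predictions per output, using
# sklearn.metrics.accuracy_score (not part of the original):
from sklearn.metrics import accuracy_score

for col in ['Green Type', 'Fairway Type']:
    print(col, "accuracy:", accuracy_score(ytest[col], preds[col]))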
import numpy as np
import pandas as pd
from simulations.irs_v2x_simulation import IRSV2XSimulation
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from joblib import dump, load

data = pd.read_csv('data_position_simulation.csv')
irs_antnum = 256

cols_x = [IRSV2XSimulation.COL_POS_X, IRSV2XSimulation.COL_POS_Y]
# IRSV2XSimulation.COL_POS_Z,
# IRSV2XSimulation.COL_SPEED]
cols_y = []
for n in range(irs_antnum):
    cols_y.append(IRSV2XSimulation.COL_PHASE + str(n))
X = data[cols_x].to_numpy()
Y = data[cols_y].to_numpy()
# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

forest = RandomForestClassifier(n_estimators=100, random_state=1)
classifier = MultiOutputClassifier(forest, n_jobs=-1)
classifier.fit(X[0:100, :], Y[0:100, :])
dump(classifier, 'classifier.joblib')
print(classifier.score(X, Y))
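
# Added sketch (assumption): `load` is imported above but unused in the excerpt;
# reloading the persisted model for inference might look like this:
clf_loaded = load('classifier.joblib')
print(clf_loaded.predict(X[100:110, :]))   # phases for ten unseen positions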
Example #11
    for j in range(len(i) - 1):
        temp.append(i[j])
    evalX.append(temp)

# The data from your screenshot
#  Q1 Q5c Q5d Q5e Q5f StateMap p_age_group_sdc p_education_sdc, Q4
train_data = np.array(trainSet)
# These I just made up
test_data_x = np.array(testSet)

eval_data_x = np.array(evalX)

x = train_data[:, :8]
y = train_data[:, 8:]
forest = RandomForestClassifier(n_estimators=100, random_state=1)
classifier = MultiOutputClassifier(forest, n_jobs=-1)
classifier.fit(x, y)
pr = classifier.predict(evalX)

print(pr)

for i in pr:
    predict.append(i[0])

error_count = 0

for i in range(len(actual)):
    if actual[i] != predict[i]:
        error_count += 1

print("Precision: ", (593 - error_count) / 593)
Example #12
    datasetY.append(indices)#np.array(indices).astype('int'))

mlb = MultiLabelBinarizer()#classes=len(radionuclides))
datasetY = mlb.fit_transform(datasetY)

#datasetX = StandardScaler().fit_transform(datasetX)
X_train, X_test, y_train, y_test = \
    train_test_split(datasetX, datasetY, test_size=.4, random_state=42)

#print(y_train)
#print(type(y_train))
#y_train = y_train.astype('int')
#y_test = y_test.astype('int')

#print(X_train)
#print(y_train)
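# Added sketch (hypothetical): the `classifier` object used below is created
# above this excerpt; one plausible definition, consistent with the binarised
# multilabel targets (estimator choice and parameters are assumptions):
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

classifier = MultiOutputClassifier(RandomForestClassifier(n_estimators=100,
                                                          random_state=0))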

classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print(score)

# predict
inv = ag.UnstablesInventory(data=[
    (db.getzai(radionuclides[2]), ACTIVITY),
    (db.getzai(radionuclides[0]), ACTIVITY),
    (db.getzai(radionuclides[5]), ACTIVITY),
    (db.getzai(radionuclides[3]), ACTIVITY)
])
hist, _ = lc(inv, spectype=SPECTYPE)
print(classifier.predict([[1 if bin > 0 else 0 for bin in hist ]]))
Example #13
earlystopping = EarlyStopping(monitor='val_f1_score',
                              patience=10,
                              verbose=1,
                              mode='max')
checkpoint = ModelCheckpoint(filepath='best.hdf5',
                             verbose=1,
                             save_best_only=True,
                             save_weights_only=True,
                             monitor='val_f1_score',
                             mode='max')

keras_model = KerasClassifier(build_fn=create_model,
                              epochs=20,
                              batch_size=batch_size)
multi_target_forest = MultiOutputClassifier(keras_model, n_jobs=-1)
print("fitting ...")
multi_target_forest.fit(X_train, Y_train)

#    model.load_weights('best.hdf5')
Y_pred = multi_target_forest.predict(test_sequences)
Y_pred_thresh = (Y_pred > thresh).astype('int')

with open(output_path, 'w') as output:
    print('\"id\",\"tags\"', file=output)
    for index, labels in enumerate(Y_pred_thresh):
        labels = [tag_list[i] for i, value in enumerate(labels) if value == 1]
        if len(labels) == 0:
            labels.append(tag_list[np.argmax(Y_pred[index])])
        labels_original = ' '.join(labels)
        print('\"%d\",\"%s\"' % (index, labels_original), file=output)
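
# Added sketch (hypothetical): create_model, batch_size, thresh and tag_list come
# from earlier in the original script; a minimal build_fn that would satisfy
# KerasClassifier for one binary tag per wrapped estimator (all sizes assumed):
from keras.models import Sequential
from keras.layers import Dense

def create_model(input_dim=300):
    model = Sequential()
    model.add(Dense(128, activation='relu', input_dim=input_dim))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model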
Example #14
# X,y=iris.data,iris.target
# res=OutputCodeClassifier(LinearSVC(random_state=0),code_size=2,random_state=0).fit(X,y).predict(X)
# print(res)


# Multi-output regression
# from sklearn.datasets import make_regression
# from sklearn.multioutput import MultiOutputRegressor
# from sklearn.ensemble import GradientBoostingRegressor
# X,y=make_regression(n_samples=10,n_targets=3,random_state=1)
# res=MultiOutputRegressor(GradientBoostingRegressor(random_state=0)).fit(X,y).predict(X)
# print(res)


# Multi-output classification
from sklearn.datasets import make_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
import numpy as np 
X, y1 = make_classification(n_samples=10, n_features=100, n_informative=30, n_classes=3, random_state=1)
y2 = shuffle(y1, random_state=1)
y3 = shuffle(y1, random_state=2)
Y = np.vstack((y1, y2, y3)).T
n_samples, n_features = X.shape
n_outputs = Y.shape[1]
n_classes = 3
forest = RandomForestClassifier(n_estimators=100, random_state=1)
multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)
res = multi_target_forest.fit(X, Y).predict(X)
print(res)
Example #15
    y_train_regr = pd.DataFrame()
    y_test_regr = pd.DataFrame()

    for col in y_train.columns.values:
        if 'z' in col:
            y_train_regr[col] = y_train[col]
            y_test_regr[col] = y_test[col]
        else:
            y_train_clf[col] = y_train[col]
            y_test_clf[col] = y_test[col]

    mo_clf = MultiOutputClassifier(rs_clf)

    # Fit the data to the models
    print('Fitting data')
    mo_clf.fit(x_train, y_train_clf)
    rs_regr.fit(x_train, y_train_regr)

    # Print the results of the fit on the test data
    print('Test classification score: %.3f' % mo_clf.score(x_test, y_test_clf))
    print('Test regression R2 score: %.3f' %
          rs_regr.score(x_test, y_test_regr))

    # Plot the decision surfaces of the classifier and regressor
    x = pd.DataFrame(np.linspace(0, 5, 25))
    y = pd.DataFrame(np.linspace(0, 5, 25))

    # Create a grid to plot our predicted values over
    surf_x = pd.DataFrame(np.array(np.meshgrid(
        x,
        y,
Example #16
    refit=True,
    cv=3,
    random_state=1,
    return_train_score=True)

fit_model(random_forest_Bayes_optimized_classifier, X_train, y_train, X_test)
print(random_forest_Bayes_optimized_classifier.best_estimator_)

#Show Confusion Matrix
random_forest_optim = MultiOutputClassifier(
    RandomForestClassifier(n_estimators=2000,
                           max_depth=20,
                           min_samples_split=20,
                           min_samples_leaf=4,
                           max_features='auto'))
classifier = random_forest_optim.fit(X_train, y_train)
cm = multilabel_confusion_matrix(y_test, random_forest_optim.predict(X_test))
print(cm)

## Retrain best model on full dataset and fit to test_set_features
random_forest_optim.fit(scaled_training_features, training_set_labels)
preds = random_forest_optim.predict_proba(scaled_test_features)

## Format for submittal on DrivenData
#Code copied from DrivenData to ensure correct format for submittal

# Save predictions to submission data frame
submission_format["h1n1_vaccine"] = preds[0][:, 1]
submission_format["seasonal_vaccine"] = preds[1][:, 1]

print(submission_format.head())
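
# Added sketch (assumption): the DrivenData workflow usually ends by writing the
# submission frame to disk; the file name below is illustrative only:
submission_format.to_csv('submission.csv', index=True)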
Example #17
    saen = StackedAutoEncoder(28*28, (400, ), tdata)
    ret = saen.fit_transform(tdata)

    #import matplotlib.pyplot as plt
    #graphs, axes = plt.subplots(nrows=5, ncols=5)
    #axes = axes.flatten()

    #for t in range(25):
    #    axes[t].imshow(ret[t,:].reshape((28,28)),  cmap='gray')
    #    graphs.tight_layout()
    #plt.show()
    
    print("Training Model1 ...")
    model1 = MultiOutputClassifier(sklearn.ensemble.RandomForestClassifier(10))
    model1.fit(saen.transform(tdata), ldata)

    print("Training Model2 ...")
    model2 = MultiOutputClassifier(sklearn.ensemble.RandomForestClassifier(10))
    model2.fit(tdata, ldata)
    
    ttest = DataReader.ImageReader("../dataset/t10k-images-idx3-ubyte.gz").to_tensor()
    ltest = DataReader.LabelReader("../dataset/t10k-labels-idx1-ubyte.gz").to_tensor()
    
    pred1 = model1.predict(saen.transform(ttest))
    pred2 = model2.predict(ttest)

    accu1 = accuracy_score(pred1, ltest)
    accu2 = accuracy_score(pred2, ltest)

    print("Model 1 Accuracy (with SEAN): %.3f, Model 2 Accuracy: %.3f" %(accu1, accu2))
Example #18
    	return -log(yHat)
    else:
    	return -log(1 - yHat)

#Binary cross entropy
def binary_cross_entropy(actual, predicted):
    # average negative log-likelihood over all samples
    sum_score = 0.0
    for i in range(len(actual)):
        sum_score += actual[i] * log(1e-15 + predicted[i]) \
                     + (1 - actual[i]) * log(1e-15 + 1 - predicted[i])
    mean_sum_score = sum_score / len(actual)
    return -mean_sum_score
print(binary_cross_entropy([1, 0, 1, 0], [1, 1, 1, 0]))


#Multi-classification
from sklearn.datasets import make_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle

X, y1 = make_classification(n_samples=10, n_features=100, n_informative=30, n_classes=3, random_state=1)
y2 = shuffle(y1, random_state=1)
y3 = shuffle(y1, random_state=2)
Y = np.vstack((y1, y2, y3)).T
n_samples, n_features = X.shape 
n_outputs = Y.shape[1] 
n_classes = 3
forest = RandomForestClassifier(random_state=1)
multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)
print(multi_target_forest.fit(X, Y).predict(X))
Example #19
df["reduced_text"] = df["text"].apply(lambda x: re.sub(
    r"""[\d\n!@#$%^&*()_\-=+/,<>?;:"[\]{}`~]""", " ", x.lower()))
df["reduced_text"] = df["reduced_text"].apply(
    lambda x: re.sub("[\.']", "", x.lower()))

# ===========================================
# Baseline model, linear multi-label classifier, Bag of Words
# Let's try to beat the most basic linear model using keras
# ===========================================

lin_cv = CountVectorizer(min_df=10, max_df=0.9)
X_bow = lin_cv.fit_transform(df["reduced_text"])
Xl_train, Xl_test, y_train, y_test = get_data_splits(X_bow, y)

bm = MultiOutputClassifier(SGDClassifier())
bm.fit(Xl_train, np.array(y_train.todense()) * 1)

Xl_pred = bm.predict(Xl_test)
score_lin = f1_score(y_test, Xl_pred, average="micro")
print("Linear score", score_lin)  # 82.5%
print_confusion_matrix_sample(y_test, Xl_pred, 0)

# ===========================================
# NN model, Bag of Words
# ===========================================
X_train, X_test, y_train, y_test = get_data_splits(X_bow, y)
model = nn_dense((1000, 0.5, 800, 0.5), X_train.shape[1], y_train.shape[1])
model.fit(X_train, y_train, batch_size=500, epochs=64, verbose=True)
y_basicnn_pred = model.predict(X_test)
bestscore_basic = f1_score(y_test, y_basicnn_pred > 0.5, average="micro")
print("Linear score", score_lin)  # 82.5%
class Classifier:
    def __init__(self):
        self.REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
        self.BAD_SYMBOLS_RE = re.compile('[^\w\s]')
        self.STOPWORDS = set(stopwords.words('spanish'))
        self.tokenizer = None
        self.multilabel_binarizer = MultiLabelBinarizer()
        self.model = None
        self.maxlen = 100


    def clean_text(self, text):
        text = text.lower() # lowercase text
        text = self.REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
        text = self.BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    #    text = re.sub(r'\W+', '', text)
        text = ' '.join(word for word in text.split() if word not in self.STOPWORDS) # remove stopwords from text
        return text

    def clean_text_in_tags(self, tags):
        clean_tags = []
        for tag in tags:
            clean_tags = clean_tags + [self.clean_text(tag)]
        return clean_tags
        

    def clean_news(self, df):
        print("cleaning the text data")
        df = df.reset_index(drop=True)
        df.dropna(subset=['tags'], inplace=True)
        df['tags'] = df['tags'].apply(self.clean_text_in_tags)
        df['content'] = df['content'].apply(self.clean_text)
        df['content'] = df['content'].str.replace('\d+', '')
        return df

    def create_tags_and_multilabel_binarizer(self, df):
        print("creating tags and tag index for classes")
        y = self.multilabel_binarizer.fit_transform(df.tags)
        # Serialize both the pipeline and binarizer to disk.
        with open('../data/neural_network_config/multilabel_binarizer.pickle', 'wb') as f:
            pickle.dump((self.multilabel_binarizer), f, protocol=pickle.HIGHEST_PROTOCOL)
        return y
        
    def load_tokenizer(self, sentences):
        print("loading toikenizer")
        self.tokenizer = Tokenizer(num_words=5000)
        self.tokenizer.fit_on_texts(sentences)

        # saving tokenizer
        with open('../data/neural_network_config/tokenizer.pickle', 'wb') as handle:
            pickle.dump(self.tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def create_train_and_test_data(self, sentences, y):
        print("separating data into test data and train data")
        sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

        X_train = self.tokenizer.texts_to_sequences(sentences_train)
        X_test = self.tokenizer.texts_to_sequences(sentences_test)

        X_train = pad_sequences(X_train, padding='post', maxlen=self.maxlen)
        X_test = pad_sequences(X_test, padding='post', maxlen=self.maxlen)
        return X_train, X_test, y_train, y_test

    def create_model(self):
        print("creating model")

    
        #self.model = Sequential()
        #self.model.add(Embedding(vocab_size, 20, input_length=self.maxlen))
        #self.model.add(Dropout(0.1))
        #self.model.add(Conv1D(filter_length, 3, padding='valid', activation='relu', strides=1))
        #self.model.add(GlobalMaxPool1D())
        #self.model.add(Dense(output_size))
        #self.model.add(Activation('sigmoid'))
        #self.model.compile(optimizer=Adam(0.015), loss='binary_crossentropy', metrics=['categorical_accuracy'])
        
        self.model = MultiOutputClassifier(KNeighborsClassifier())

    def save_model(self):
        print("saving model")
        # saving model
        dump(self.model, '../data/neural_network_config/model-sklearn-kneighbors.joblib') 
    
    def create_and_train_model(self):
        filename = "../data/json_news_tagged_bundle/clean_data-unified-tags.json"
        df = pd.read_json(filename)
        df = self.clean_news(df)

        y = self.create_tags_and_multilabel_binarizer(df)
        sentences = df['content'].values


        self.load_tokenizer(sentences)

        X_train, X_test, y_train, y_test = self.create_train_and_test_data(sentences, y)

        self.create_model()
        
        history = self.model.fit(X_train, y_train)

        print(history)

        self.save_model()
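
# Added usage sketch (not part of the original class): train and persist the
# KNN tagging model end-to-end, assuming the data paths hard-coded above exist.
clf = Classifier()
clf.create_and_train_model()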
Example #21
# Split Train/Test
###############################################################################
(inputs, outputs) = (DATA[FEATS], DATA[['CPT', 'WOP']])
(TRN_X, VAL_X, TRN_Y, VAL_Y) = train_test_split(
    inputs, outputs, 
    test_size=float(VT_SPLIT),
    stratify=outputs
)
(TRN_L, VAL_L) = [i.shape[0] for i in (TRN_X, VAL_X)]
###############################################################################
# Define Model
###############################################################################
rf = RandomForestClassifier(
    n_estimators=TREES, max_depth=DEPTH, criterion='entropy',
    min_samples_split=5, min_samples_leaf=50,
    max_features=None, max_leaf_nodes=None,
    n_jobs=JOB
)
clf = MultiOutputClassifier(rf)
# K-fold training -------------------------------------------------------------
kScores = cross_val_score(clf, TRN_X, TRN_Y)
kScores
###############################################################################
# Train Model
###############################################################################
clf.fit(TRN_X, TRN_Y)
# Predict ---------------------------------------------------------------------
PRD_Y = clf.predict(VAL_X)
clf.score(VAL_X, VAL_Y)
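# Added sketch (assumption): per-output accuracy for the two targets, assuming
# VAL_Y keeps the 'CPT' and 'WOP' columns selected above:
from sklearn.metrics import accuracy_score

for ix, col in enumerate(['CPT', 'WOP']):
    print(col, accuracy_score(VAL_Y[col], PRD_Y[:, ix]))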

Example #22
def pedicting_tag(request):
    print('inside predicting tag')
    class lemmatokenizer(object):
        def __init__(self):
            self.stemmer = SnowballStemmer('english')
            self.token_pattern = r"(?u)\b\w\w+\b"       
    #         self.wnl = WordNetLemmatizer()
        def __call__(self,doc):                                                     # here, doc is one string sentence
            token_pattern = re.compile(self.token_pattern)
            return [self.stemmer.stem(t) for t in token_pattern.findall(doc)]       # return lambda doc: token_pattern.findall(doc) 
    #         return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]


    vect_title = CountVectorizer(max_df=0.5,min_df=5,stop_words='english',tokenizer=lemmatokenizer(),ngram_range=(1,3))


    # In[9]:

    tfidf_vect_title = TfidfVectorizer(smooth_idf=False,max_df=0.5,min_df=5,stop_words='english',tokenizer=lemmatokenizer(),ngram_range=(1,3))


    le = preprocessing.LabelEncoder()  
    le.fit(y_labels) 
    d_set['label_num'] = pd.Series([le.transform(ast.literal_eval(i)) for i in d_set['tag']])
    d_set.head()


    new_y_labels = d_set['label_num'].values.tolist()

    mlb = MultiLabelBinarizer() 
    mlb.fit(new_y_labels)

    y_tag_dtm = mlb.transform(new_y_labels) 

    y_tag_dtm.shape


    # In[14]:

    X_labels = d_set['title'].values.tolist()

    # print (X_labels)


    # In[15]:

    vect_title.fit(X_labels)
    X_title_dtm = vect_title.transform(X_labels)

    X_title_dtm


    from sklearn.decomposition import PCA

    pca = PCA(n_components=100).fit(X_title_dtm.toarray())
    pca_samples = pca.transform(X_title_dtm.toarray())

    pca_df = pd.DataFrame(np.round(pca_samples,4))

    print (pca_df.head())


    # In[ ]:




    # In[17]:

    new_df = pd.DataFrame(X_title_dtm.toarray(),columns=vect_title.get_feature_names())



    new_df.shape



    d = collections.Counter(vect_title.get_feature_names())

    new_df['target_list'] = [i for i in y_tag_dtm] 


    tfidf_vect_title.fit(X_labels)
    X_title_dtm_tfidf = tfidf_vect_title.transform(X_labels)

    X_title_dtm_tfidf


    # In[23]:

    new_df_of_tfidf = pd.DataFrame(X_title_dtm_tfidf.toarray(),columns=tfidf_vect_title.get_feature_names()) 


    # In[24]:

    new_df_of_tfidf['target_list'] = [i for i in y_tag_dtm] 


    # In[25]:

    y = new_df_of_tfidf['target_list'] 
    X = new_df_of_tfidf.drop('target_list',axis=1)  


    X = np.array(X.values.tolist())                           # it will convert list to numpy ndarray
    y = np.array(y.values.tolist())


    # In[28]:

    # print (X[0]) 


    # In[29]:

    pca_X = PCA(n_components=200).fit_transform(X)  
    pca_X = np.round(pca_X,4)

    pca_y = PCA(n_components=50).fit_transform(y)  
    pca_y = np.round(pca_y,4)


    # In[30]:

    print (pca_y) 


    # In[31]:

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)   


    # In[32]:

    # X_train, X_test, y_train, y_test = train_test_split(pca_X, pca_y, test_size=0.2, random_state=1)   


    # In[ ]:




    # In[33]:

    # clf = Pipeline([('classifier',OneVsRestClassifier(SVC(probability=True,random_state=0)))])  # just to for Pipeline example

    knn_clf = KNeighborsClassifier(n_neighbors=5)
    # mnb_clf = MultinomialNB()                                                                   # not working for MultiLabelinput
    # svc_clf = OneVsRestClassifier(SVC(probability=True,random_state=0))

    # time_pass_y = np.random.randint(2,size=(2838,1))                                            # produce ndarray of size 2838 X 1

    knn_clf.fit(X_train, y_train)
    # mnb_clf.fit(X_train, y_train) 

    knn_pred = knn_clf.predict(X_test)  
    # mnb_pred = mnb_clf.predict(X_test)
    # svc_pred = svc_clf.predict(X_test)


    # In[34]:

    knn_clf.score(X_test, y_test) 


    # In[53]:

    from sklearn import metrics

    knn_report = metrics.classification_report(y_test[:100], knn_pred[:100]) 
    knn_f1_score = metrics.f1_score(y_test[:], knn_pred[:], average='samples') 
    knn_precision_recall_fscore = metrics.precision_recall_fscore_support(y_test, knn_pred, average='samples')  # on full data-set
    knn_avg_precision_score = metrics.average_precision_score(y_test, knn_pred, average='samples')
    knn_roc_auc_score = metrics.roc_auc_score(y_test, knn_pred, average='samples')

    # mnb_report = metrics.classification_report(y_test[:100], mnb_pred[:100])  #throwing error mnb_clf can't work on multilabel O/P


    # In[36]:

    metrics.accuracy_score(y_true=y_test[:100], y_pred=knn_pred[:100])          # I think it's same as calculating hamming_score


    # In[37]:

    # print (knn_report)                                   # its type is str

    print "For knn_clf (KNearestNeighbours) : "
    print "precision, recall, fbeta_score, support : ",knn_precision_recall_fscore
    print "f1_score : ",knn_f1_score
    print "avg. precision_score : ",knn_avg_precision_score 
    print "roc_auc_score : ",knn_roc_auc_score


    # In[38]:

    # def does_test_tag_match(d, list_of_tags):      # no need for this function


    # In[39]:

    test = ["how to use policy iteration in ml ?"]
    # test = ["what is lstm ?"] 

    # test_dtm = vect_title.transform(test)                                           # without tfidf
    test_dtm = tfidf_vect_title.transform(test)                                       # with tfidf

    # print (test_dtm.toarray()[0])
    status = False
    for i in test_dtm.toarray()[0]:
        if (i!=0):
            status = True
            break

    ans = knn_clf.predict(test_dtm.toarray())
    ans = mlb.inverse_transform(ans)

    if (len(ans[0])==0 or status==False):
        print ("sorry, we can't predict your category!!!")
    else:
        ans = le.inverse_transform(ans)
        print (ans)
        
        

    forest = RandomForestClassifier(n_estimators=100, random_state=0)
    rf_clf = MultiOutputClassifier(forest, n_jobs=-1)
    rf_clf.fit(X_train, y_train)
    rf_pred = rf_clf.predict(X_test)


    # In[41]:

    rf_clf 


    # In[42]:

    metrics.accuracy_score(y_true=y_test[:100], y_pred=rf_pred[:100])          # I think it's same as calculating hamming_score


    # In[43]:

    rf_clf.score(X_test, y_test)

    rf_report = metrics.classification_report(y_test[:100], rf_pred[:100])
    rf_f1_score = metrics.f1_score(y_test, rf_pred, average='samples')  
    rf_precision_recall_fscore = metrics.precision_recall_fscore_support(y_test, rf_pred, average='samples')  # on full data-set
    rf_avg_precision_score = metrics.average_precision_score(y_test, rf_pred, average='samples')
    rf_roc_auc_score = metrics.roc_auc_score(y_test, rf_pred, average='samples') 


    # In[47]:

    # print (rf_report) 

    print "For rf_clf (RandomForest) : "
    print "precision, recall, fbeta_score, support : ",rf_precision_recall_fscore
    print "f1_score : ",rf_f1_score  
    print "avg. precision_score : ",rf_avg_precision_score 
    print "roc_auc_score : ",rf_roc_auc_score

    # test = ["what is reinforcement learning ?"] 

    test = ["what is ai,lstm and data visualization ?"] 

    # test_dtm = vect_title.transform(test)                                            # without tfidf
    test_dtm = tfidf_vect_title.transform(test)                                        # with tfidf

    status = False
    for i in test_dtm.toarray()[0]:
        if (i!=0):
            status = True
            break

    ans = rf_clf.predict(test_dtm.toarray())
    ans = mlb.inverse_transform(ans)
    if (len(ans[0])==0 or status==False):
        print ("sorry, we can't predict your category!!!")
    else:
        ans = le.inverse_transform(ans)
        print (ans)
Example #23
def setup(train_files, test_files, specific):
    scalar = StandardScaler()
    mlb = MultiLabelBinarizer()
    train_labels = []
    train_data = []
    train_keys = []
    for f in train_files.keys():
        path = constants.path + 'acousticbrainz-mediaeval-train/' + f[:2] + '/' + f + '.json'
        song = readjson(path)
        feat = getFeature(song)
        if len(feat) != 391:
            continue
        train_keys.append(f)
        train_data.append(feat)
        train_labels.append(train_files[f])
    print('finished train')
    train_labels = mlb.fit_transform(train_labels)
    train_data = scalar.fit_transform(train_data)
    print('finished transforming')
    path = constants.path + specific + '_mlb.pkl'
    dump(mlb, path)

    path = constants.path + specific + '_scalar.pkl'
    dump(scalar, path)

    path = constants.path + specific + '_train.pkl'
    data = dict()
    data['features'] = train_data
    data['labels'] = train_labels
    data['keys'] = train_keys
    dump(data, path)

    print('finished dumping')
    #classifier = MultiOutputClassifier(LinearSVC(C=10, class_weight='balanced', dual=True), n_jobs = 4)
    classifier = MultiOutputClassifier(RandomForestClassifier(
        n_estimators=20, class_weight='balanced'),
                                       n_jobs=4)
    classifier.fit(train_data, train_labels)
    print('finished fitting')
    data = 0
    train_data = 0
    train_labels = 0
    train_keys = 0
    gc.collect()

    #test_labels = []
    test_data = []
    test_keys = list(test_files.keys())
    mean = scalar.mean_
    for f in test_keys:
        path = constants.path + 'acousticbrainz-mediaeval-train/' + f[:2] + '/' + f + '.json'
        song = readjson(path)
        feat = getFeature(song)
        if len(feat) < 391:
            length = len(feat)
            for m in mean[length:]:
                feat += [m]
        test_data.append(feat)
    test_data = scalar.transform(test_data)
    predictions = classifier.predict(test_data)
    print('finished predictions')
    genre_predictions = mlb.inverse_transform(predictions)
    write(genre_predictions, test_keys, specific)
    print('finished writing predictions')
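
# Added sketch (hypothetical): readjson is used above but not shown; a minimal
# stand-in that loads one AcousticBrainz JSON document (getFeature and write
# remain defined elsewhere in the original project):
import json

def readjson(path):
    with open(path) as fh:
        return json.load(fh)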
Example #24
for key, value in sample_files.items():
    value['data'] = get_training_data("data/" + key, num_training_lists)

# Combine all the training data arrays into one big feature set
X = np.vstack(list(map(lambda x: x['data'], sample_files.values())))
# X = normalize(X)

# Build a label list that corresponds to the feature set
y = []
for value in sample_files.values():
    y += [value['labels']] * len(value['data'])
y = np.array(mlb.transform(y))

# Use a multi-label classifier implementing Multinomial Naive Bayes
clf = MultiOutputClassifier(MultinomialNB())
clf.fit(X, y)

print(f'Mean accuracy: {clf.score(X, y)}')

num_folds = 10
cv_score = cross_val_score(clf, X, y, cv=num_folds)
print(f'{num_folds}-fold cross-validation: {cv_score}')

# Perform real-time tests for each input file
for key, value in sample_files.items():
    print("\nPerforming real-time classification of "
          f"{', '.join(value['labels'])}")
    start_time = timeit.default_timer()
    features = Serializer("data/" + key).classify_realtime(clf)
    total_time = timeit.default_timer() - start_time
    print(f'Classified in {total_time} seconds')
Example #25
def mycode(train_files, test_files, specific, indicies):
    indicies = np.array(indicies)
    #indicies = indicies[:len(indicies)//4]
    scalar = StandardScaler()
    mlb = MultiLabelBinarizer()
    train_labels = []
    train_data = []
    train_keys = []
    keys = list(train_files.keys())
    random.shuffle(keys)
    subset = 150000  #len(keys)
    count = 0
    for f in keys[:subset]:
        count += 1
        path = constants.path + 'acousticbrainz-mediaeval-train/' + f[:2] + '/' + f + '.json'
        song = readjson(path)
        feat = getAllFeatures(song)
        if len(feat) != 2647:
            continue
        feat = np.array(feat)
        feat = feat[indicies]
        train_keys.append(f)
        train_data.append(feat)
        train_labels.append(train_files[f])
        if count % 10000 == 0:
            print("on ", count, "length of keys: ", len(train_keys))

    print('finished train')
    train_labels = mlb.fit_transform(train_labels)
    train_data = scalar.fit_transform(train_data)
    print('finished transforming')
    path = constants.path + specific + '_all2_mlb.pkl'
    dump(mlb, path)

    path = constants.path + specific + '_all2_scalar.pkl'
    dump(scalar, path)

    print(np.shape(train_data))
    path = constants.path + specific + '_all2_train.pkl'
    data = dict()
    data['features'] = train_data
    data['labels'] = train_labels
    data['keys'] = train_keys
    dump(data, path)

    print('finished dumping')
    #classifier = MultiOutputClassifier(LinearSVC(C=10, class_weight='balanced', dual=True), n_jobs = 4)
    classifier = MultiOutputClassifier(RandomForestClassifier(
        n_estimators=32, class_weight='balanced'),
                                       n_jobs=4)
    data = 0
    train_files = 0
    train_keys = 0
    keys = 0
    gc.collect()
    classifier.fit(train_data, train_labels)
    print('finished fitting')
    path = constants.path + specific + '_all2_classifier.pkl'
    dump(classifier, path)
    """
    with open(constants.path + specific + '_all2_scalar.pkl', 'rb') as data_file:
        scalar = pickle.load(data_file)
    with open(constants.path + specific + '_all2_mlb.pkl', 'rb') as data_file:
        mlb = pickle.load(data_file)
    with open(constants.path + specific + '_all2_classifier.pkl', 'rb') as data_file:
        classifier = pickle.load(data_file)
    """
    data = 0
    train_data = 0
    train_labels = 0
    train_keys = 0
    gc.collect()

    #test_labels = []
    test_data = []
    test_keys = list(test_files.keys())
    mean = scalar.mean_
    for f in test_keys:
        path = constants.path + 'acousticbrainz-mediaeval-train/' + f[:2] + '/' + f + '.json'
        song = readjson(path)
        feat = getAllFeatures(song)
        """
        if len(feat) < 2647:
            length = len(feat)
            for m in mean[length:]:
                feat += [m]
        """
        #feat = np.array(feat)
        if len(feat) < 2647:
            length = len(feat)
            print('Before: ', length)
            for m in range(2647 - length):
                feat += [np.random.rand()]
            #m = mean[indicies.index(2647)]
            #feat += [m]
            print('After: ', len(feat))
        feat = np.array(feat)
        feat = feat[indicies]
        test_data.append(feat)
    test_data = scalar.transform(test_data)
    predictions = classifier.predict(test_data)
    print('finished predictions')
    genre_predictions = mlb.inverse_transform(predictions)
    write(genre_predictions, test_keys, specific)
    print('finished writing predictions')
Example #26
def main():

    # Script argument parsing
    parser = argparse.ArgumentParser(
        description=
        'Homework 03 - Machine learning a.a. 2018/19 - Predict missing values',
        epilog=' coded by: Emanuele Palombo')

    parser.add_argument('dataset_name',
                        metavar='DATASET',
                        type=str,
                        nargs='?',
                        default=__default_ts_name,
                        help='{} (default {}) - dataset name'.format(
                            list(__ts_opts.keys()), __default_ts_name))

    parser.add_argument(
        '--test-size',
        '-t',
        dest='test_size',
        action='store',
        metavar='TEST_SIZE',
        type=float,
        default=__default_test_size,
        help='[0-1] (default {}) - splitting size of TestSet'.format(
            __default_test_size))

    parser.add_argument(
        '--question-marks-ts',
        '-q',
        dest='qm_repeted_ts',
        action='store',
        type=int,
        default=__default_question_mark_count_repeated,
        help=
        '{{0,1,2...}} (default {}) - (this value * {} * samples) added to TrainingSet'
        .format(__default_question_mark_count_repeated,
                __default_question_mark_count))

    parser.add_argument(
        '--no-split',
        '-s',
        dest='no_split',
        action='store_true',
        default=__default_no_split,
        help='(default {}) - keep whole DataSet for training'.format(
            __default_no_split))

    parser.add_argument('--img-tag',
                        '-i',
                        dest='img_tag',
                        action='store',
                        type=str,
                        default='',
                        help='string - add arbitrary string to saved images')

    parser.add_argument(
        '--verbose',
        '-v',
        dest='verbosity',
        action='count',
        default=__default_training_verbosity,
        help='add more verbosity to output (repeat it to increase)')

    args = parser.parse_args()

    if args.dataset_name not in __ts_opts:
        print('ERROR: Choose correct DataSet!\n')
        parser.print_help()
        exit(1)

    trainingset_selected_name = args.dataset_name
    test_size = args.test_size
    qm_repeted_ts = args.qm_repeted_ts
    dataset_no_split = args.no_split
    training_verbosity = args.verbosity
    img_tag = args.img_tag
    running_id = id_generator()

    ts_selected_opts = __ts_opts[trainingset_selected_name]
    # End script argument parsing

    print('\nDataSet selected: ' + ts_selected_opts['url'])

    # read dataset to pandas dataframe
    dataset = pd.read_csv(ts_selected_opts['url'],
                          names=ts_selected_opts['columns'])

    if training_verbosity >= 1:
        print('\nFirst five rows of DataSet:\n')
        print(dataset.head())
        print('\nDataSet Length: {}'.format(len(dataset)))

    # DataSet Manipulation
    # remove rows with question marks (this avoids having '?' in the output)
    dataset = dataset[~(dataset.astype(str) == '?').any(1)]

    # strip out (remove) the "real output" (y)
    dataset = dataset.iloc[ts_selected_opts['x_slice'][0],
                           ts_selected_opts['x_slice'][1]]

    # Different approach to value conversion
    # convert all column to int (str => int)
    # dataset = dataset.apply(lambda x: pd.factorize(x)[0] + 1)
    # convert all columns to int
    dataset = dataset.astype(int)

    # dataSet Information
    features_count = len(dataset.columns)
    features_values = ds_features_values(dataset)

    # copy input features to output (columns * 2)
    for column in dataset.columns:
        dataset['y_' + column] = dataset[column]

    # Split DataSet
    training_set, test_set = train_test_split(
        dataset,
        test_size=test_size,
        random_state=__default_train_test_split_random_state)

    # check feature values between TrainingSet and TestSet
    # it's important to avoid values that appear only in the TestSet (i.e. log_loss errors from a predict_proba size mismatch)
    if not check_labels_split(features_count, training_set, test_set):
        exit(1)

    # Concat (add row) TrainingSet and TestSet
    # in this case the model can see all samples (including the queries without '?')
    if dataset_no_split:
        training_set = pd.concat([training_set, test_set], axis=0)

        print('\nTraining over the whole DataSet')
    else:
        print('\nSplit DataSet in TrainingSet and TestSet (test size: {})'.
              format(test_size))

    # add (append) question mark
    # append qm_count rows, with 1 to qm_count '?'
    qm_count = int(ts_selected_opts['question_mark_count'])
    for i in range(qm_repeted_ts):
        for value_count in range(1, qm_count + 1):
            training_set = ds_mod_with_value(training_set, value_count,
                                             features_count, True)

            if training_verbosity >= 1:
                print(
                    '{} Added {} question mark (?) to TrainingSet for each sample'
                    .format(i, value_count))

    # Shuffle TrainingSet
    training_set = training_set.sample(frac=1)

    if training_verbosity >= 1:
        print('\nManipulated TrainingSet:\n')
        print(training_set.head())
        print('\nTrainingSet Length: {}'.format(len(training_set)))

    # TrainingSet: input X (features) and output y ("mirrored" features)
    x_train = training_set.iloc[:, 0:features_count]
    y_train = training_set.iloc[:, features_count:]

    # TestSet: input X (features) and output y ("mirrored" features)
    x_test = test_set.iloc[:, 0:features_count]
    y_test = test_set.iloc[:, features_count:]

    if training_verbosity >= 2:
        print('\nInput train:\n {}'.format(x_train.head()))
        print('\nOutput train:\n {}'.format(y_train.head()))
        print('\nInput test:\n {}'.format(x_test.head()))
        print('\nOutput test:\n {}'.format(y_test.head()))

    x_train = x_train.values
    y_train = y_train.values
    y_test = y_test.values

    # oneHot encoding (characteristic vector)
    # passing features_values without None force OneHotEncoder to transform None to null vector
    one_hot_encoder = OneHotEncoder(categories=features_values,
                                    handle_unknown='ignore')
    one_hot_encoder.fit(x_train)
    x_train_encoded = one_hot_encoder.transform(x_train).toarray()

    if training_verbosity >= 2:
        print('\nOneHotEncoding...\nexample: {} => {}'.format(
            x_train[0], x_train_encoded[0]))

    # store all results/metrics for each model/classifier
    results = {}

    for classifier_name in __deafult_model_classifier:

        filename = 'model_{}_{}.sav'.format(trainingset_selected_name,
                                            classifier_name)

        if os.path.isfile(filename):
            # load module already trained
            multi_output_classifier = joblib.load(filename)

            print(
                '\n### Model {} loaded by file: {}\nImportant: remove the file to re-train the model!'
                .format(classifier_name, filename))
        else:
            n_jobs = None
            model_verbosity = True if training_verbosity >= 3 else False

            if classifier_name == 'MLP':
                classifier = MLPClassifier(hidden_layer_sizes=ts_selected_opts[
                    'mlp_hidden_layers_sizes'],
                                           max_iter=1000,
                                           verbose=model_verbosity)
            elif classifier_name == 'KNN':
                n_jobs = None
                classifier = KNeighborsClassifier(
                    n_neighbors=ts_selected_opts['knn_k'])
            elif classifier_name == 'SVM':
                classifier = SVC(gamma='scale',
                                 decision_function_shape='ovo',
                                 probability=True,
                                 verbose=model_verbosity)
            elif classifier_name == 'RandomForest':
                classifier = RandomForestClassifier(
                    n_estimators=ts_selected_opts['random_forest_estimator'],
                    verbose=model_verbosity)

            print('\n### Init and training the model: {}'.format(
                classifier_name))

            # init MultiOutput for classifier
            multi_output_classifier = MultiOutputClassifier(classifier,
                                                            n_jobs=n_jobs)
            multi_output_classifier.fit(x_train_encoded, y_train)

            # save the model to disk
            joblib.dump(multi_output_classifier, filename)

        results[classifier_name] = collections.defaultdict(list)
        metris_result = results[classifier_name]

        # create input test (query) with different number of '?'
        for query_count_question_mark in range(
                ts_selected_opts['question_mark_count'] + 1):

            print('\n## Add {} questions mark to input test (query)'.format(
                query_count_question_mark))

            # modify (in place) input test with question marks
            x_test_with_qm = ds_mod_with_value(
                x_test.copy(),
                value_count=query_count_question_mark,
                append=False)

            if training_verbosity >= 2:
                print('\nInput test (query):\n {}'.format(
                    pd.DataFrame(data=x_test_with_qm).head()))

            # encode the input test
            x_test_encoded = one_hot_encoder.transform(
                x_test_with_qm).toarray()

            # compute output prediction and probability
            y_pred = multi_output_classifier.predict(x_test_encoded)
            y_pred_proba = multi_output_classifier.predict_proba(
                x_test_encoded)
            # mean accuracy over the whole output
            score = multi_output_classifier.score(x_test_encoded, y_test)
            # the Hamming loss corresponds to the Hamming distance between y_test and y_pred
            hamming_loss = np.sum(np.not_equal(y_test, y_pred)) / float(
                y_test.size)

            # compute y_test and y_pred as if the output were only the query's question marks
            y_test_reduced, y_pred_reduced = reduce_y_to_qm(
                x_test_with_qm, y_test, y_pred)

            # write y_pred_proba to file (csv)
            write_pred_proba(
                y_pred_proba,
                '{}{}-{}-q{}-{}{}.csv'.format(__default_csv_path,
                                              trainingset_selected_name,
                                              classifier_name,
                                              query_count_question_mark,
                                              running_id, img_tag))

            print('\nMetrics:')
            print(' {:<30} | {:^10} | {:>10}'.format('features', 'accuracy',
                                                     'log loss'))
            print('-' * (30 + 10 + 10 + 7))

            log_loss_avg = 0
            # for each output column => compute accuracy and log_loss
            for feature_index in range(y_test.shape[1]):
                y_test_column = y_test[:, feature_index]
                y_pred_column = y_pred[:, feature_index]

                accuracy = accuracy_score(y_test_column, y_pred_column)
                # note: for avoid error here was implemented check_labels_split()
                log_loss_value = log_loss(
                    y_test_column,
                    y_pred_proba[feature_index],
                    labels=features_values[feature_index])

                print(' {:<30} | {:^10.4f} | {:>10.4f}'.format(
                    test_set.columns[feature_index], accuracy, log_loss_value))

                log_loss_avg += log_loss_value

                metris_result['accuracy_' +
                              str(feature_index)].append(accuracy)
                metris_result['log_loss_' +
                              str(feature_index)].append(log_loss_value)

            print('\nVirtual reduced output:')
            # for each output reduced (only question marks) => compute accuracy
            for index in range(query_count_question_mark):
                accuracy = accuracy_score(y_test_reduced[:, index],
                                          y_pred_reduced[:, index])
                print(' accuracy {}:   {:>10.4f}'.format(index, accuracy))

                metris_result['accuracy_reduced_' +
                              str(index)].append(accuracy)

            print('\nAll output:')
            print(' accuracy:     {:>10.4f}'.format(score))
            print(' log_loss avg: {:>10.4f}'.format(log_loss_avg /
                                                    y_test.shape[1]))
            print(' hamming loss: {:>10.4f}'.format(hamming_loss))

            metris_result['accuracy'].append(score)
            metris_result['log_loss_avg'].append(log_loss_avg /
                                                 y_test.shape[1])
            metris_result['hamming_loss'].append(hamming_loss)

        # GRAPH PLOT per model/classifier
        plot_line_graph(range(ts_selected_opts['question_mark_count'] + 1), [
            results[classifier_name]['accuracy'],
            results[classifier_name]['log_loss_avg'],
            results[classifier_name]['hamming_loss']
        ],
                        labels=['accuracy', 'log loss avg', 'hamming loss'],
                        fmt=['bo-', 'ro-', 'yo-'],
                        title=classifier_name,
                        xlabel='Number of Question Marks in the query',
                        ymax=1)

        if __default_save_img:
            plt.savefig('{}{}-{}-{}{}.png'.format(__default_imgs_path,
                                                  trainingset_selected_name,
                                                  classifier_name, running_id,
                                                  img_tag),
                        dpi=200)

        # create list of list of accuracy x feature
        accuracy_lst = [
            'accuracy_' + str(index) for index in range(features_count)
        ]
        accuracy_lst = [
            results[classifier_name][accuracy_key]
            for accuracy_key in accuracy_lst
        ]

        plot_line_graph(range(ts_selected_opts['question_mark_count'] + 1),
                        [results[classifier_name]['accuracy']] + accuracy_lst,
                        fmt=['bo-'] + ['g.--'] * len(accuracy_lst),
                        title=classifier_name +
                        ': whole accuracy and those by features',
                        xlabel='Number of Question Marks in the query',
                        ylabel='accuracy',
                        ymax=1)

        if __default_save_img:
            plt.savefig('{}{}-{}-accuracy-{}{}.png'.format(
                __default_imgs_path, trainingset_selected_name,
                classifier_name, running_id, img_tag),
                        dpi=200)

        # create list of list of accuracy_reduced x feature (adding 0 in front when needed)
        accuracy_reduced_lst = [
            'accuracy_reduced_' + str(index)
            for index in range(ts_selected_opts['question_mark_count'])
        ]
        accuracy_reduced_lst = [
            results[classifier_name][accuracy_reduced]
            for accuracy_reduced in accuracy_reduced_lst
        ]
        accuracy_reduced_lst = [[None] *
                                (ts_selected_opts['question_mark_count'] -
                                 len(accuracy_reduced) + 1) + accuracy_reduced
                                for accuracy_reduced in accuracy_reduced_lst]

        plot_line_graph(
            range(ts_selected_opts['question_mark_count'] + 1),
            [results[classifier_name]['accuracy']] + accuracy_reduced_lst,
            fmt=['bo-'] + ['m.--'] * len(accuracy_reduced_lst),
            title=classifier_name +
            ': whole accuracy and the virtual accuracies by features',
            xlabel='Number of Question Marks in the query',
            ylabel='accuracy',
            ymax=1)

        if __default_save_img:
            plt.savefig('{}{}-{}-accuracy-reduced-{}{}.png'.format(
                __default_imgs_path, trainingset_selected_name,
                classifier_name, running_id, img_tag),
                        dpi=200)

        # create list of list of log_loss x feature
        log_loss_lst = [
            'log_loss_' + str(index) for index in range(features_count)
        ]
        log_loss_lst = [
            results[classifier_name][log_loss_key]
            for log_loss_key in log_loss_lst
        ]

        plot_line_graph(
            range(ts_selected_opts['question_mark_count'] + 1),
            [results[classifier_name]['log_loss_avg']] + log_loss_lst,
            fmt=['ro-'] + ['c.--'] * len(log_loss_lst),
            title=classifier_name + ': average log loss and those by features',
            xlabel='Number of Question Marks in the query',
            ylabel='log loss')

        if __default_save_img:
            plt.savefig('{}{}-{}-log-loss-{}{}.png'.format(
                __default_imgs_path, trainingset_selected_name,
                classifier_name, running_id, img_tag),
                        dpi=200)

    metrics_by_classifier = [
        results[classifier][metric]
        for classifier in __deafult_model_classifier
        for metric in ['accuracy', 'log_loss_avg', 'hamming_loss']
    ]
    label_by_classifier = [
        classifier + ' ' + metric for classifier in __deafult_model_classifier
        for metric in ['accuracy', 'log_loss_avg', 'hamming_loss']
    ]
    fmt_lst = [
        style.replace('0', character)
        for character in ['o', '^', 'v', '<', '>', '.', ',', '+', 'x']
        for style in ['b0-', 'r0-', 'y0-']
    ]

    # GRAPH PLOT comparing model/classifier
    plot_line_graph(range(ts_selected_opts['question_mark_count'] + 1),
                    metrics_by_classifier,
                    labels=label_by_classifier,
                    fmt=fmt_lst,
                    title='Compare all models',
                    xlabel='Number of Question Marks in the query',
                    ylabel='',
                    ymax=1)

    if __default_save_img:
        plt.savefig('{}{}-comparing-{}{}.png'.format(
            __default_imgs_path, trainingset_selected_name, running_id,
            img_tag),
                    dpi=200)

    if not __default_save_img:
        plt.show()
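
The results dict read above holds one whole-dataset metric per classifier plus per-feature entries such as accuracy_<i> and log_loss_<i>. Below is a minimal, self-contained sketch of how such per-output metrics can be computed for a MultiOutputClassifier; every name in it is a placeholder, not a variable from the script above.

import numpy as np
from sklearn.datasets import make_multilabel_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, hamming_loss, log_loss
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier

# toy multilabel data standing in for the real training set
X, Y = make_multilabel_classification(n_samples=200, n_features=10,
                                      n_classes=3, random_state=0)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)

clf = MultiOutputClassifier(RandomForestClassifier(n_estimators=50, random_state=0))
clf.fit(X_train, Y_train)

Y_pred = clf.predict(X_test)
proba = clf.predict_proba(X_test)  # list with one (n_samples, n_classes) array per output

metrics = {'accuracy': accuracy_score(Y_test, Y_pred),      # subset accuracy over all outputs
           'hamming_loss': hamming_loss(Y_test, Y_pred)}
for i in range(Y_test.shape[1]):
    metrics['accuracy_' + str(i)] = accuracy_score(Y_test[:, i], Y_pred[:, i])
    # assumes both classes occur for output i, so proba[i] has two columns
    metrics['log_loss_' + str(i)] = log_loss(Y_test[:, i], proba[i][:, 1], labels=[0, 1])
metrics['log_loss_avg'] = np.mean([metrics['log_loss_' + str(i)]
                                   for i in range(Y_test.shape[1])])
print(metrics)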
Exemplo n.º 27
0
]]
y_train = train_data[[
    col for col in train_data.columns if col.startswith('target')
]].drop(['target_0'], axis=1)
X_test = test_data[[
    col for col in test_data.columns if col.startswith('feat')
]]
y_test = test_data[[
    col for col in test_data.columns if col.startswith('target')
]].drop(['target_0'], axis=1)

############################################################

# train classifier
model = MultiOutputClassifier(DummyClassifier(strategy='stratified'))
model.fit(X_train, y_train)

# evaluate test data
y_pred = model.predict(X_test)
run.log('precision_macro', precision_score(y_test, y_pred, average='macro'))
run.log('precision_samples', precision_score(y_test, y_pred,
                                             average='samples'))
run.log('recall_macro', recall_score(y_test, y_pred, average='macro'))
run.log('recall_samples', recall_score(y_test, y_pred, average='samples'))
run.log('hamming_loss', hamming_loss(y_test, y_pred))
run.log('zero_one_loss', zero_one_loss(y_test, y_pred))

# evaluate train data
y_pred = model.predict(X_train)
run.log('precision_macro_train',
        precision_score(y_train, y_pred, average='macro'))
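
The snippet above logs both 'macro' and 'samples' averages of precision and recall. A small, self-contained illustration of the difference on a toy indicator matrix (no Azure ML run object needed):

import numpy as np
from sklearn.metrics import precision_score, recall_score

# two samples, three binary targets
y_true = np.array([[1, 0, 1],
                   [0, 1, 1]])
y_pred = np.array([[1, 0, 0],
                   [0, 1, 1]])

# 'macro': compute the metric per label (column), then average over labels
print(precision_score(y_true, y_pred, average='macro'))
# 'samples': compute the metric per sample (row), then average over samples
print(recall_score(y_true, y_pred, average='samples'))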
Exemplo n.º 28
0
                             max_iter=500,
                             class_weight="balanced",
                             random_state=42,
                             n_jobs=1))
else:
    model = MultiOutputRegressor(
        LassoCV(eps=1e-9,
                n_alphas=16,
                cv=3,
                tol=1e-4,
                max_iter=500,
                random_state=42,
                n_jobs=1))

# train the model
model.fit(X.iloc[train_idx, :], Y.iloc[train_idx, :])

# In[2]: Collect the predictions

# predict training and testing data
train_predict = pd.DataFrame(model.predict(X.iloc[train_idx, :]),
                             columns=Y.columns)
test_predict = pd.DataFrame(model.predict(X.iloc[test_idx, :]),
                            columns=Y.columns)

# reshape all of the predictions into a single table
predictions = pd.DataFrame()
for j in range(outputs):
    # collect training data
    predict_j = np.array(train_predict.iloc[:, j])
    actual_j = np.array(Y.iloc[train_idx, j])
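
The per-output loop above is truncated at this point. As a hedged, self-contained sketch (toy data, illustrative column names) of how such per-output predicted/actual pairs are commonly stacked into a single long-format predictions table:

import numpy as np
import pandas as pd

# toy stand-ins for the objects above: Y with two output columns and
# matching prediction frames for the train/test index splits
rng = np.random.RandomState(0)
Y = pd.DataFrame(rng.rand(10, 2), columns=['out_a', 'out_b'])
train_idx, test_idx = np.arange(0, 7), np.arange(7, 10)
train_predict = pd.DataFrame(rng.rand(7, 2), columns=Y.columns)
test_predict = pd.DataFrame(rng.rand(3, 2), columns=Y.columns)

frames = []
for j, col in enumerate(Y.columns):
    frames.append(pd.DataFrame({'output': col, 'split': 'train',
                                'actual': Y.iloc[train_idx, j].to_numpy(),
                                'predicted': train_predict.iloc[:, j].to_numpy()}))
    frames.append(pd.DataFrame({'output': col, 'split': 'test',
                                'actual': Y.iloc[test_idx, j].to_numpy(),
                                'predicted': test_predict.iloc[:, j].to_numpy()}))
predictions = pd.concat(frames, ignore_index=True)
print(predictions.head())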
Exemplo n.º 29
0
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=11)

gbt = MLPClassifier(alpha=0.0,
                    random_state=0,
                    activation='relu',
                    hidden_layer_sizes=(50, ),
                    verbose=0)

from sklearn.multioutput import MultiOutputClassifier
#gbt = RandomForestClassifier(n_estimators=300, random_state=264, min_samples_leaf=300, min_samples_split=150)
#MultiOutputClassifier(gbt,n_job=-1).fit(X_train, Y_train)
mor = MultiOutputClassifier(gbt, n_jobs=-1)
clf4 = mor.fit(X_train, Y_train)

err_train = np.mean(Y_train != mor.predict(X_train))
err_test = np.mean(Y_test != mor.predict(X_test))
err_sum = np.mean(Y != mor.predict(X))
joblib.dump(mor, "training_models/cardio_hi.pkl", compress=1)

print("gender 0", err_train, err_test, 'err_sum', err_sum)
print("gbt score %s" % clf4.score(X_train, Y_train))

print()
print("start ap_lo")
X = data.drop([
    'cardio', 'ap_lo', 'id', 'weight_o', 'weight_nfg_o', 'weight_nfg_o_с',
    'weight_o_c', 'alco', 'bmi_r_4', 'bmi_n_7', 'bmi_r_1', 'bmi_n_2', 'bmi_n_1'
],
Exemplo n.º 30
0
    'worksi1f', 'cleanair', 'sporty1f', 'alcoholf', 'cancerf', 'menarche',
    'avdaysp', 'infpro1f', 'ocs'
]
output_cols = ['WEIGHT', 'MODEDELIV', 'BABYCONDIT', 'Status']

data = pd.read_excel('/home/dell/Desktop/PALS_data.XLSX')
print(data.head())

X = data.loc[:, feature_cols]
y = data.loc[:, output_cols]
print(X.shape)

le = preprocessing.LabelEncoder()
y.WEIGHT = le.fit_transform(y.WEIGHT)

le = preprocessing.LabelEncoder()
y.MODEDELIV = le.fit_transform(y.MODEDELIV)

le = preprocessing.LabelEncoder()
y.BABYCONDIT = le.fit_transform(y.BABYCONDIT)

le = preprocessing.LabelEncoder()
X.ocs = le.fit_transform(X.ocs)

clf = RandomForestClassifier(n_estimators=100, random_state=1)
#clf = svm.SVC(gamma=0.001, C=1000., kernel = 'linear')
multi_target_forest = MultiOutputClassifier(clf, n_jobs=-1)
multi_target_forest.fit(X, y)

print "success"
Exemplo n.º 31
0
target = ['Match_result']

worldcup = shuffle(worldcup)
x = worldcup[features].values
y = worldcup[target].values
# Split the dataset into training dataset and testing dataset
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1)
#===================Perceptron=========================
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import Perceptron
ppn = Perceptron(n_iter=40, eta0=0.1, random_state=0)  #y=w.x+b
multi_target_ppn = MultiOutputClassifier(ppn)
y_pred = multi_target_ppn.fit(x_train, y_train).predict(x_test)
print('Perceptron')
print(classification_report(y_test, y_pred))
print('Accuracy classification score: %.2f' % accuracy_score(y_test, y_pred))
print('Average Hamming loss: %.2f' % hamming_loss(y_test, y_pred))
print('Jaccard similarity coefficient score: %.2f' %
      jaccard_similarity_score(y_test, y_pred))
print('Matthews correlation coefficient (MCC): %.2f' %
      matthews_corrcoef(y_test, y_pred))
print('Zero-one classification loss: %.2f' % zero_one_loss(y_test, y_pred))

#===================SVM=========================
from sklearn.multioutput import MultiOutputClassifier
from sklearn import svm
# instantiate the SVC classifier
clf = svm.SVC()
audios = np.unique(mfcc_audio["Audio"])
train_audio, test_audio = train_test_split(
    audios, train_size=0.7, test_size=0.3, random_state=0)

X_train = mfcc_audio[mfcc_audio["Audio"].isin(train_audio)]
X_test = mfcc_audio[mfcc_audio["Audio"].isin(test_audio)]
y_train = X_train[columns]
y_test = X_test[columns]

X_train.drop(columns + ["Audio"], inplace=True, axis=1)
X_test.drop(columns + ["Audio"], inplace=True, axis=1)

mor = MultiOutputClassifier(
    RandomForestClassifier(random_state=0, n_estimators=1000), n_jobs=-1)
mor.fit(X_train, y_train)
mor_pred = mor.predict(X_test)

dummy = DummyClassifier()
dummy.fit(X_train, y_train)
dummy_pred = dummy.predict(X_test)

estimators = mor.estimators_

for i, col in enumerate(columns):

    true = y_test[col]
    pred = mor_pred[:, i]
    d_p = dummy_pred[:, i]

    print(col)
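    # (hedged continuation sketch, not from the original, which is cut off here)
    # one common way to finish this per-output comparison: score the multioutput
    # forest against the dummy baseline on the current column
    from sklearn.metrics import accuracy_score, classification_report
    print('forest accuracy:', accuracy_score(true, pred))
    print('dummy  accuracy:', accuracy_score(true, d_p))
    print(classification_report(true, pred))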
def train_model_one_vs_rest(train_vectors, train_labels):
    model = RandomForestClassifier()
    clf = MultiOutputClassifier(model)
    clf.fit(train_vectors, train_labels)
    return clf
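
A short hedged usage sketch for the helper above (the toy data below is illustrative; real callers pass their own vectors and labels). Despite the "one_vs_rest" name, MultiOutputClassifier fits one independent RandomForestClassifier per target column:

from sklearn.datasets import make_multilabel_classification

# illustrative multilabel data; four binary target columns
train_vectors, train_labels = make_multilabel_classification(
    n_samples=100, n_features=20, n_classes=4, random_state=0)

clf = train_model_one_vs_rest(train_vectors, train_labels)
print(len(clf.estimators_))            # one fitted forest per target column -> 4
print(clf.predict(train_vectors[:3]))  # (3, 4) array of 0/1 predictions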
Exemplo n.º 34
0
estimators = MultiOutputClassifier(
    estimator=XGBClassifier(penalty="l2", objective="binary:logistic", 
                            random_state=42)
)

X_train, X_eval, y_train, y_eval = train_test_split(
    features_df,
    labels_df,
    test_size=0.33,
    shuffle=True,
    stratify=labels_df,
    random_state=RANDOM_SEED
)

# Train model
estimators.fit(features_df, labels_df)

# Predict on evaluation set
# This competition wants probabilities, not labels
preds = estimators.predict_proba(X_eval)
preds
k = preds[0]
y_preds = pd.DataFrame(
    {
        "h1n1_vaccine": preds[0][:, 1],
        "seasonal_vaccine": preds[1][:, 1],
    },
    index = y_eval.index
)
print("y_preds.shape:", y_preds.shape)
y_preds.head()
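
y_preds now holds one positive-class probability column per target. If the goal is a probability-based score such as ROC AUC (hedged: the exact competition metric is not shown here), it can be computed per column or averaged over both targets. Note that the snippet fits on the full features_df/labels_df, so scoring on X_eval here would be optimistic; fitting on X_train, y_train gives an honest estimate.

from sklearn.metrics import roc_auc_score

# per-target AUC, assuming y_eval holds the true 0/1 labels for the same rows
for col in ["h1n1_vaccine", "seasonal_vaccine"]:
    print(col, roc_auc_score(y_eval[col], y_preds[col]))

# macro average over both targets in a single call
print(roc_auc_score(y_eval[["h1n1_vaccine", "seasonal_vaccine"]], y_preds, average="macro"))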
Exemplo n.º 35
0
class Igel(object):
    """
    Igel is the base class that wraps the fit, evaluate and predict functionality of the sklearn library
    """
    available_commands = ('fit', 'evaluate', 'predict', 'experiment')
    supported_types = ('regression', 'classification', 'clustering')
    results_path = configs.get('results_path')  # path to the results folder
    default_model_path = configs.get(
        'default_model_path')  # path to the pre-fitted model
    description_file = configs.get(
        'description_file')  # path to the description.json file
    evaluation_file = configs.get(
        'evaluation_file')  # path to the evaluation.json file
    prediction_file = configs.get(
        'prediction_file')  # path to the predictions.csv
    default_dataset_props = configs.get(
        'dataset_props'
    )  # dataset props that can be changed from the yaml file
    default_model_props = configs.get(
        'model_props')  # model props that can be changed from the yaml file
    model = None

    def __init__(self, **cli_args):
        logger.info(f"Entered CLI args: {cli_args}")
        logger.info(f"Executing command: {cli_args.get('cmd')} ...")
        self.data_path: str = cli_args.get('data_path')  # path to the dataset
        logger.info(f"reading data from {self.data_path}")
        self.command = cli_args.get('cmd', None)
        if not self.command or self.command not in self.available_commands:
            raise Exception(f"You must enter a valid command.\n"
                            f"available commands: {self.available_commands}")

        if self.command == "fit":
            self.yml_path = cli_args.get('yaml_path')
            self.yaml_configs = read_yaml(self.yml_path)
            logger.debug(f"your chosen configuration: {self.yaml_configs}")

            # dataset options given by the user
            self.dataset_props: dict = self.yaml_configs.get(
                'dataset', self.default_dataset_props)
            # model options given by the user
            self.model_props: dict = self.yaml_configs.get(
                'model', self.default_model_props)
            # list of target(s) to predict
            self.target: list = self.yaml_configs.get('target')

            self.model_type: str = self.model_props.get('type')
            logger.info(f"dataset_props: {self.dataset_props} \n"
                        f"model_props: {self.model_props} \n "
                        f"target: {self.target} \n")

        # if entered command is evaluate or predict, then the pre-fitted model needs to be loaded and used
        else:
            self.model_path = cli_args.get('model_path',
                                           self.default_model_path)
            logger.info(f"path of the pre-fitted model => {self.model_path}")
            # load description file to read stored training parameters
            with open(self.description_file, 'r') as f:
                dic = json.load(f)
                self.target: list = dic.get(
                    "target")  # target to predict as a list
                self.model_type: str = dic.get(
                    "type"
                )  # type of the model -> regression or classification
                self.dataset_props: dict = dic.get(
                    'dataset_props')  # dataset props entered while fitting
        getattr(self, self.command)()

    def _create_model(self, **kwargs):
        """
        fetch a model depending on the provided type and algorithm by the user and return it
        @return: class of the chosen model
        """
        model_type: str = self.model_props.get('type')
        model_algorithm: str = self.model_props.get('algorithm')
        model_args = None
        if not model_type or not model_algorithm:
            raise Exception(f"model_type and algorithm cannot be None")
        algorithms: dict = models_dict.get(
            model_type)  # extract all algorithms as a dictionary
        model = algorithms.get(
            model_algorithm)  # extract model class depending on the algorithm
        logger.info(
            f"Solving a {model_type} problem using ===> {model_algorithm}")
        if not model:
            raise Exception("Model not found in the algorithms list")
        else:
            model_props_args = self.model_props.get('arguments', None)
            if model_props_args and type(model_props_args) == dict:
                model_args = model_props_args
            elif not model_props_args or model_props_args.lower() == "default":
                model_args = None

            model_class = model.get('class')
            logger.info(f"model arguments: \n"
                        f"{self.model_props.get('arguments')}")
            model = model_class(**kwargs) if not model_args else model_class(
                **model_args)
            return model, model_args

    def _save_model(self, model):
        """
        save the model to a binary file
        @param model: model to save
        @return: bool
        """
        try:
            if not os.path.exists(self.results_path):
                logger.info(
                    f"creating model_results folder to save results...\n"
                    f"path of the results folder: {self.results_path}")
                os.mkdir(self.results_path)
            else:
                logger.info(f"Folder {self.results_path} already exists")
                logger.warning(
                    f"data in the {self.results_path} folder will be overridden. If you don't "
                    f"want this, then move the current {self.results_path} to another path"
                )

        except OSError:
            logger.exception(
                f"Creating the directory {self.results_path} failed ")
        else:
            logger.info(
                f"Successfully created the directory in {self.results_path} ")
            pickle.dump(model, open(self.default_model_path, 'wb'))
            return True

    def _load_model(self, f: str = ''):
        """
        load a saved model from file
        @param f: path to model
        @return: loaded model
        """
        try:
            if not f:
                logger.info(f"result path: {self.results_path} ")
                logger.info(f"loading model form {self.default_model_path} ")
                model = pickle.load(open(self.default_model_path, 'rb'))
            else:
                logger.info(f"loading from {f}")
                model = pickle.load(open(f, 'rb'))
            return model
        except FileNotFoundError:
            logger.error(f"File not found in {self.default_model_path} ")

    def _prepare_fit_data(self):
        return self._process_data(target='fit')

    def _prepare_eval_data(self):
        return self._process_data(target='evaluate')

    def _process_data(self, target='fit'):
        """
        read and return data as x and y
        @return: list of separate x and y
        """
        assert isinstance(self.target,
                          list), "provide target(s) as a list in the yaml file"
        if self.model_type != "clustering":
            assert len(
                self.target) > 0, "please provide at least a target to predict"

        try:
            dataset = pd.read_csv(self.data_path)
            logger.info(f"dataset shape: {dataset.shape}")
            attributes = list(dataset.columns)
            logger.info(f"dataset attributes: {attributes}")

            # handle missing values in the dataset
            preprocess_props = self.dataset_props.get('preprocess', None)
            if preprocess_props:
                # handle encoding
                encoding = preprocess_props.get('encoding')
                if encoding:
                    encoding_type = encoding.get('type', None)
                    column = encoding.get('column', None)
                    if column in attributes:
                        dataset, classes_map = encode(
                            df=dataset,
                            encoding_type=encoding_type.lower(),
                            column=column)
                        if classes_map:
                            self.dataset_props[
                                'label_encoding_classes'] = classes_map
                            logger.info(
                                f"adding classes_map to dataset props: \n{classes_map}"
                            )
                        logger.info(
                            f"shape of the dataset after encoding => {dataset.shape}"
                        )

                # preprocessing strategy: mean, median, mode etc..
                strategy = preprocess_props.get('missing_values')
                if strategy:
                    dataset = handle_missing_values(dataset, strategy=strategy)
                    logger.info(
                        f"shape of the dataset after handling missing values => {dataset.shape}"
                    )

            if target == 'predict' or target == 'fit_cluster':
                x = _reshape(dataset.to_numpy())
                if not preprocess_props:
                    return x
                scaling_props = preprocess_props.get('scale', None)
                if not scaling_props:
                    return x
                else:
                    scaling_method = scaling_props.get('method', None)
                    return normalize(x, method=scaling_method)

            if any(col not in attributes for col in self.target):
                raise Exception(
                    "chosen target(s) to predict must exist in the dataset")

            y = pd.concat([dataset.pop(x) for x in self.target], axis=1)
            x = _reshape(dataset.to_numpy())
            y = _reshape(y.to_numpy())
            logger.info(f"y shape: {y.shape} and x shape: {x.shape}")

            # handle data scaling
            if preprocess_props:
                scaling_props = preprocess_props.get('scale', None)
                if scaling_props:
                    scaling_method = scaling_props.get('method', None)
                    scaling_target = scaling_props.get('target', None)
                    if scaling_target == 'all':
                        x = normalize(x, method=scaling_method)
                        y = normalize(y, method=scaling_method)
                    elif scaling_target == 'inputs':
                        x = normalize(x, method=scaling_method)
                    elif scaling_target == 'outputs':
                        y = normalize(y, method=scaling_method)

            if target == 'evaluate':
                return x, y

            split_options = self.dataset_props.get('split', None)
            if not split_options:
                return x, y, None, None
            test_size = split_options.get('test_size')
            shuffle = split_options.get('shuffle')
            stratify = split_options.get('stratify')
            x_train, x_test, y_train, y_test = train_test_split(
                x,
                y,
                test_size=test_size,
                shuffle=shuffle,
                stratify=None
                if not stratify or stratify.lower() == "default" else stratify)

            return x_train, y_train, x_test, y_test

        except Exception as e:
            logger.exception(
                f"error occured while preparing the data: {e.args}")

    def _prepare_clustering_data(self):
        """
        preprocess data for the clustering algorithm
        """
        return self._process_data(target='fit_cluster')

    def _prepare_predict_data(self):
        """
        preprocess predict data to get similar data to the one used when training the model
        """
        return self._process_data(target='predict')

    def get_evaluation(self, model, x_test, y_true, y_pred, **kwargs):
        res = None
        try:
            res = evaluate_model(model_type=self.model_type,
                                 model=model,
                                 x_test=x_test,
                                 y_pred=y_pred,
                                 y_true=y_true,
                                 get_score_only=False,
                                 **kwargs)
        except Exception as e:
            res = evaluate_model(model_type=self.model_type,
                                 model=model,
                                 x_test=x_test,
                                 y_pred=y_pred,
                                 y_true=y_true,
                                 get_score_only=True,
                                 **kwargs)
        return res

    def fit(self, **kwargs):
        """
        fit a machine learning model and save it to a file along with a description.json file
        @return: None
        """
        x_train = None
        x_test = None
        y_train = None
        y_test = None
        if self.model_type == 'clustering':
            x_train = self._prepare_clustering_data()
        else:
            x_train, y_train, x_test, y_test = self._prepare_fit_data()
        self.model, model_args = self._create_model(**kwargs)
        logger.info(
            f"executing a {self.model.__class__.__name__} algorithm...")

        # convert to multioutput if there is more than one target to predict:
        if self.model_type != 'clustering' and len(self.target) > 1:
            logger.info(
                f"predicting multiple targets detected. Hence, the model will be automatically "
                f"converted to a multioutput model")
            self.model = MultiOutputClassifier(self.model) \
                if self.model_type == 'classification' else MultiOutputRegressor(self.model)

        if self.model_type != 'clustering':
            self.model.fit(x_train, y_train)
        else:
            self.model.fit(x_train)

        saved = self._save_model(self.model)
        if saved:
            logger.info(
                f"model saved successfully and can be found in the {self.results_path} folder"
            )

        eval_results = None
        if self.model_type == 'clustering':
            eval_results = self.model.score(x_train)
        else:
            if x_test is None:
                logger.info(
                    f"no split options was provided. training score will be calculated"
                )
                eval_results = self.model.score(x_train, y_train)

            else:
                logger.info(
                    f"split option detected. The performance will be automatically evaluated "
                    f"using the test data portion")
                y_pred = self.model.predict(x_test)
                eval_results = self.get_evaluation(model=self.model,
                                                   x_test=x_test,
                                                   y_true=y_test,
                                                   y_pred=y_pred,
                                                   **kwargs)

        fit_description = {
            "model":
            self.model.__class__.__name__,
            "arguments":
            model_args if model_args else "default",
            "type":
            self.model_props['type'],
            "algorithm":
            self.model_props['algorithm'],
            "dataset_props":
            self.dataset_props,
            "model_props":
            self.model_props,
            "data_path":
            self.data_path,
            "train_data_shape":
            x_train.shape,
            "test_data_shape":
            None if x_test is None else x_test.shape,
            "train_data_size":
            x_train.shape[0],
            "test_data_size":
            None if x_test is None else x_test.shape[0],
            "results_path":
            str(self.results_path),
            "model_path":
            str(self.default_model_path),
            "target":
            None if self.model_type == 'clustering' else self.target,
            "results_on_test_data":
            eval_results,
            "cluster_centers":
            None if self.model_type != 'clustering' else
            self.model.cluster_centers_,
            "cluster_labels":
            None if self.model_type != 'clustering' else self.model.labels_,
        }

        try:
            logger.info(f"saving fit description to {self.description_file}")
            with open(self.description_file, 'w', encoding='utf-8') as f:
                json.dump(fit_description, f, ensure_ascii=False, indent=4)
        except Exception as e:
            logger.exception(
                f"Error while storing the fit description file: {e}")

    def evaluate(self, **kwargs):
        """
        evaluate a pre-fitted model and save results to an evaluation.json file
        @return: None
        """
        x_val = None
        y_true = None
        eval_results = None
        try:
            model = self._load_model()
            if self.model_type != 'clustering':
                x_val, y_true = self._prepare_eval_data()
                y_pred = model.predict(x_val)
                eval_results = self.get_evaluation(model=model,
                                                   x_test=x_val,
                                                   y_true=y_true,
                                                   y_pred=y_pred,
                                                   **kwargs)
            else:
                x_val = self._prepare_clustering_data()
                y_pred = model.predict(x_val)
                eval_results = model.score(x_val, y_pred)

            logger.info(f"saving fit description to {self.evaluation_file}")
            with open(self.evaluation_file, 'w', encoding='utf-8') as f:
                json.dump(eval_results, f, ensure_ascii=False, indent=4)

        except Exception as e:
            logger.exception(f"error occured during evaluation: {e}")

    def predict(self):
        """
        use a pre-fitted model to make predictions and save them as csv
        @return: None
        """
        try:
            model = self._load_model(f=self.model_path)
            x_val = self._prepare_predict_data(
            )  # the same is used for clustering
            y_pred = model.predict(x_val)
            y_pred = _reshape(y_pred)
            logger.info(
                f"predictions shape: {y_pred.shape} | shape len: {len(y_pred.shape)}"
            )
            logger.info(f"predict on targets: {self.target}")
            df_pred = pd.DataFrame.from_dict({
                self.target[i]: y_pred[:,
                                       i] if len(y_pred.shape) > 1 else y_pred
                for i in range(len(self.target))
            })

            logger.info(f"saving the predictions to {self.prediction_file}")
            df_pred.to_csv(self.prediction_file)

        except Exception as e:
            logger.exception(f"Error while preparing predictions: {e}")

    @staticmethod
    def create_init_mock_file(model_type=None,
                              model_name=None,
                              target=None,
                              *args,
                              **kwargs):
        path = configs.get('init_file_path', None)
        if not path:
            raise Exception("You need to provide a path for the init file")

        dataset_props = Igel.default_dataset_props
        model_props = Igel.default_model_props
        if model_type:
            logger.info(f"user selected model type = {model_type}")
            model_props['type'] = model_type
        if model_name:
            logger.info(f"user selected algorithm = {model_name}")
            model_props['algorithm'] = model_name

        logger.info(f"initalizing a default igel.yaml in {path}")
        default_data = {
            "dataset":
            dataset_props,
            "model":
            model_props,
            "target": ['provide your target(s) here']
            if not target else [tg for tg in target.split()]
        }
        created = create_yaml(default_data, path)
        if created:
            logger.info(
                f"a default igel.yaml is created for you in {path}. "
                f"you just need to overwrite the values to meet your expectations"
            )
        else:
            logger.warning(
                f"something went wrong while initializing a default file")
Exemplo n.º 36
0
class ModelTrainer:
    """
    Train a machine learning model based on the input yaml config
    """
    RAND_SEED = 42
    input_cmds = ('fit', 'evaluate', 'predict', 'experiment')
    supported_types = ('regression', 'classification', 'clustering')
    results_path = configs.get('results_path')  # path to the results folder
    default_model_path = configs.get(
        'default_model_path')  # path to the pre-fitted model
    description_file = configs.get(
        'description_file')  # path to the description.json file
    evaluation_file = configs.get(
        'evaluation_file')  # path to the evaluation.json file
    prediction_file = configs.get(
        'prediction_file')  # path to the predictions.csv
    default_dataset_props = configs.get(
        'dataset_props'
    )  # dataset props that can be changed from the yaml file
    default_model_props = configs.get(
        'model_props')  # model props that can be changed from the yaml file
    model = None

    def __init__(self, *args, **kwargs) -> None:

        self.data_path: str = kwargs.get('data_path', None)
        self.logfile = kwargs.get('logfile', None)
        self.command = kwargs.get('cmd', None)
        self.results_path = kwargs.get('results_path',
                                       None)  # path to the results folder
        self._x_columns = None
        # results_path as specified input
        if self.results_path is None:
            self.results_path = ModelTrainer.results_path  # path to the results folder
        else:
            self.default_model_path = os.path.join(self.results_path,
                                                   configs.get('model_file'))
            self.description_file = os.path.join(
                self.results_path,
                'description.json')  # path to the description.json file
            self.evaluation_file = os.path.join(
                self.results_path,
                'evaluation.json')  # path to the evaluation.json file
            self.prediction_file = os.path.join(
                self.results_path,
                'prediction.json')  # path to the prediction.json file

        logger.info(f"Entered kwargs: {kwargs}")

        if not self.command or self.command not in self.input_cmds:
            raise Exception(f"You must enter a valid command.\n"
                            f"available commands: {self.input_cmds}")

        if self.command == "fit":
            self.yml_path = kwargs.get('yaml_path', None)
            file_ext = self.yml_path.split('.')[-1]
            logger.info(f"You passed the configurations as a {file_ext} file.")

            self.yaml_configs = read_yaml(
                self.yml_path) if file_ext == 'yaml' else read_json(
                    self.yml_path)
            logger.info(f"your chosen configuration: {self.yaml_configs}")

            # dataset options given by the user
            self.dataset_props: dict = self.yaml_configs.get(
                'dataset', self.default_dataset_props)
            # model options given by the user
            self.model_props: dict = self.yaml_configs.get(
                'model', self.default_model_props)
            # list of target(s) to predict
            self.target: list = self.yaml_configs.get('target', None)
            # list of obs_id(s) to identify observation
            self.observation_id: list = self.yaml_configs.get(
                'observation_id', None)

            self.model_type: str = self.model_props.get('type', None)
            logger.info(f"dataset_props: {self.dataset_props} \n"
                        f"model_props: {self.model_props} \n "
                        f"target: {self.target} \n")

            # handle random numbers generation
            random_num_options = self.dataset_props.get('random_numbers', None)
            if random_num_options:
                generate_reproducible = random_num_options.get(
                    'generate_reproducible', None)
                if generate_reproducible:
                    logger.info(
                        "You provided the generate reproducible results option."
                    )
                    seed = random_num_options.get('seed', self.RAND_SEED)
                    np.random.seed(seed)
                    logger.info(
                        f"Setting a seed = {seed} to generate same random numbers on each experiment.."
                    )

        # if entered command is evaluate or predict, then the pre-fitted model needs to be loaded and used
        else:
            self.model_path = kwargs.get('model_path', self.default_model_path)
            logger.info(f"path of the pre-fitted model => {self.model_path}")
            # load description file to read stored training parameters
            with open(self.description_file, 'r') as f:
                dic = json.load(f)
                self.target: list = dic.get(
                    "target")  # target to predict as a list
                self.model_type: str = dic.get(
                    "type"
                )  # type of the model -> regression or classification
                self.dataset_props: dict = dic.get(
                    'dataset_props')  # dataset props entered while fitting
        getattr(self, self.command)()

    def _create_model(self, **kwargs):
        """
        fetch a model depending on the provided type and algorithm by the user and return it
        @return: class of the chosen model
        """
        model_type: str = self.model_props.get('type')
        model_algorithm: str = self.model_props.get('algorithm')
        use_cv = self.model_props.get('use_cv_estimator', None)

        model_args = None
        if not model_type or not model_algorithm:
            raise Exception(f"model_type and algorithm cannot be None")
        algorithms: dict = models_dict.get(
            model_type)  # extract all algorithms as a dictionary
        model = algorithms.get(
            model_algorithm)  # extract model class depending on the algorithm
        logger.info(
            f"Solving a {model_type} problem using ===> {model_algorithm}")
        if not model:
            raise Exception("Model not found in the algorithms list")
        else:
            model_props_args = self.model_props.get('arguments', None)
            if model_props_args and type(model_props_args) == dict:
                model_args = model_props_args
            elif not model_props_args or model_props_args.lower() == "default":
                model_args = None

            if use_cv:
                model_class = model.get('cv_class', None)
                if model_class:
                    logger.info(
                        f"cross validation estimator detected. "
                        f"Switch to the CV version of the {model_algorithm} algorithm"
                    )
                else:
                    logger.info(
                        f"No CV class found for the {model_algorithm} algorithm"
                    )
            else:
                model_class = model.get('class')
            logger.info(f"model arguments: \n"
                        f"{self.model_props.get('arguments')}")
            model = model_class(**kwargs) if not model_args else model_class(
                **model_args)
            return model, model_args

    def _save_model(self, model):
        """
        save the model to a binary file
        @param model: model to save
        @return: bool
        """
        try:
            if not os.path.exists(self.results_path):
                logger.info(
                    f"creating model_results folder to save results...\n"
                    f"path of the results folder: {self.results_path}")
                os.mkdir(self.results_path)
            else:
                logger.info(f"Folder {self.results_path} already exists")
                logger.warning(
                    f"data in the {self.results_path} folder will be overridden. If you don't "
                    f"want this, then move the current {self.results_path} to another path"
                )

        except OSError:
            logger.exception(
                f"Creating the directory {self.results_path} failed ")
        else:
            logger.info(
                f"Successfully created the directory in {self.results_path} ")
            pickle.dump(model, open(self.default_model_path, 'wb'))
            return True

    def _load_model(self, f: str = ''):
        """
        load a saved model from file
        @param f: path to model
        @return: loaded model
        """
        try:
            if not f:
                logger.info(f"result path: {self.results_path} ")
                logger.info(f"loading model form {self.default_model_path} ")
                model = pickle.load(open(self.default_model_path, 'rb'))
            else:
                logger.info(f"loading from {f}")
                model = pickle.load(open(f, 'rb'))
            return model
        except FileNotFoundError:
            logger.error(f"File not found in {self.default_model_path} ")

    def _prepare_clustering_data(self):
        """
        preprocess data for the clustering algorithm
        """
        return self._process_data(target='fit_cluster')

    def _prepare_predict_data(self):
        """
        preprocess predict data to get similar data to the one used when training the model
        """
        return self._process_data(target='predict')

    def _prepare_fit_data(self):
        return self._process_data(target='fit')

    def _prepare_eval_data(self):
        return self._process_data(target='evaluate')

    def _process_data(self, target='fit'):
        """
        read and return data as x and y
        @return: list of separate x and y
        """
        assert isinstance(self.target,
                          list), "provide target(s) as a list in the yaml file"
        if self.model_type != "clustering":
            assert len(
                self.target) > 0, "please provide at least a target to predict"

        try:
            read_data_options = self.dataset_props.get('read_data_options',
                                                       None)
            dataset = pd.read_csv(
                self.data_path) if not read_data_options else pd.read_csv(
                    self.data_path, **read_data_options)
            logger.info(f"dataset shape: {dataset.shape}")
            attributes = list(dataset.columns)
            logger.info(f"dataset attributes: {attributes}")

            # handle missing values in the dataset
            preprocess_props = self.dataset_props.get('preprocess', None)
            if preprocess_props:
                # handle encoding
                encoding = preprocess_props.get('encoding')
                if encoding:
                    encoding_type = encoding.get('type', None)
                    column = encoding.get('column', None)
                    if column in attributes:
                        dataset, classes_map = encode(
                            df=dataset,
                            encoding_type=encoding_type.lower(),
                            column=column)
                        if classes_map:
                            self.dataset_props[
                                'label_encoding_classes'] = classes_map
                            logger.info(
                                f"adding classes_map to dataset props: \n{classes_map}"
                            )
                        logger.info(
                            f"shape of the dataset after encoding => {dataset.shape}"
                        )

                # preprocessing strategy: mean, median, mode etc..
                strategy = preprocess_props.get('missing_values')
                if strategy:
                    dataset = handle_missing_values(dataset, strategy=strategy)
                    logger.info(
                        f"shape of the dataset after handling missing values => {dataset.shape}"
                    )

            if target == 'predict' or target == 'fit_cluster':
                x = _reshape(dataset.to_numpy())
                if not preprocess_props:
                    return x
                scaling_props = preprocess_props.get('scale', None)
                if not scaling_props:
                    return x
                else:
                    scaling_method = scaling_props.get('method', None)
                    return normalize(x, method=scaling_method)

            if any(col not in attributes for col in self.target):
                raise Exception(
                    "chosen target(s) to predict must exist in the dataset")

            y = pd.concat(
                [dataset.pop(x) for x in self.target],
                axis=1)  # remove target variable(s) from dataset & concat them
            x = _reshape(dataset.to_numpy())
            y = _reshape(y.to_numpy())
            logger.info(f"y shape: {y.shape} and x shape: {x.shape}")
            self._x_columns = dataset.columns.to_list()
            logger.info(f"X columns: {self._x_columns}")

            # handle data scaling
            if preprocess_props:
                scaling_props = preprocess_props.get('scale', None)
                if scaling_props:
                    scaling_method = scaling_props.get('method', None)
                    scaling_target = scaling_props.get('target', None)
                    if scaling_target == 'all':
                        x = normalize(x, method=scaling_method)
                        y = normalize(y, method=scaling_method)
                    elif scaling_target == 'inputs':
                        x = normalize(x, method=scaling_method)
                    elif scaling_target == 'outputs':
                        y = normalize(y, method=scaling_method)

            if target == 'evaluate':
                return x, y

            split_options = self.dataset_props.get('split', None)
            if not split_options:
                return x, y, None, None
            test_size = split_options.get('test_size')
            shuffle = split_options.get('shuffle')
            stratify = split_options.get('stratify')
            x_train, x_test, y_train, y_test = train_test_split(
                x,
                y,
                test_size=test_size,
                shuffle=shuffle,
                stratify=None
                if not stratify or stratify.lower() == "default" else stratify)

            return x_train, y_train, x_test, y_test

        except Exception as e:
            logger.exception(
                f"error occured while preparing the data: {e.args}")

    def get_evaluation(self, model, x_test, y_true, y_pred, y_score, **kwargs):
        try:
            res = evaluate_model(model_type=self.model_type,
                                 model=model,
                                 x_test=x_test,
                                 y_pred=y_pred,
                                 y_true=y_true,
                                 y_score=y_score,
                                 get_score_only=False,
                                 **kwargs)
        except Exception as e:
            logger.debug(e)
            res = evaluate_model(model_type=self.model_type,
                                 model=model,
                                 x_test=x_test,
                                 y_pred=y_pred,
                                 y_true=y_true,
                                 y_score=y_score,
                                 get_score_only=True,
                                 **kwargs)
        return res

    def fit(self, **kwargs):
        """fit a model

        Raises:
            Exception: [description]
        """
        x_train = None
        y_train = None
        x_test = None
        y_test = None

        cv_results = None
        eval_results = None
        cv_params = None
        hp_search_results = {}

        if self.model_type == 'clustering':
            x_train = self._prepare_clustering_data()
        else:
            x_train, y_train, x_test, y_test = self._prepare_fit_data()
        self.model, model_args = self._create_model(**kwargs)
        logger.info(
            f"executing a {self.model.__class__.__name__} algorithm...")

        # convert to multioutput if there is more than one target to predict:
        if self.model_type != 'clustering' and len(self.target) > 1:
            logger.info(
                f"predicting multiple targets detected. Hence, the model will be automatically "
                f"converted to a multioutput model")
            self.model = MultiOutputClassifier(self.model) \
                if self.model_type == 'classification' else MultiOutputRegressor(self.model)

        if self.model_type != 'clustering':
            cv_params = self.model_props.get('cross_validate', None)
            if not cv_params:
                logger.info(f"cross validation is not provided")
            else:
                # perform cross validation
                logger.info("performing cross validation ...")
                cv_results = cross_validate(estimator=self.model,
                                            X=x_train,
                                            y=y_train,
                                            **cv_params)

            hyperparams_props = self.model_props.get('hyperparameter_search',
                                                     None)
            if hyperparams_props:

                # perform hyperparameter search
                method = hyperparams_props.get('method', None)
                grid_params = hyperparams_props.get('parameter_grid', None)
                hp_args = hyperparams_props.get('arguments', None)
                logger.info(
                    f"Performing hyperparameter search using -> {method}")
                logger.info(
                    f"Grid parameters entered by the user: {grid_params}")
                logger.info(f"Additional hyperparameter arguments: {hp_args}")
                best_estimator, best_score, best_params = hyperparameter_search(
                    model=self.model,
                    method=method,
                    params=grid_params,
                    x_train=x_train,
                    y_train=y_train,
                    **hp_args)
                hp_search_results['best_params'] = best_params
                hp_search_results['best_score'] = best_score
                self.model = best_estimator

            self.model.fit(x_train, y_train)

        else:  # if the model type is clustering
            self.model.fit(x_train)

        saved = self._save_model(self.model)
        if saved:
            logger.info(
                f"model saved successfully and can be found in the {self.results_path} folder"
            )

        if self.model_type == 'clustering':
            eval_results = self.model.score(x_train)
        else:
            if x_test is None:
                logger.info(
                    f"no split options was provided. training score will be calculated"
                )
                eval_results = self.model.score(x_train, y_train)

            else:
                logger.info(
                    f"split option detected. The performance will be automatically evaluated "
                    f"using the test data portion")
                y_pred = self.model.predict(x_test)
                y_score = self.model.predict_proba(
                    x_test) if self.model_type == 'classification' else None
                eval_results = self.get_evaluation(model=self.model,
                                                   x_test=x_test,
                                                   y_true=y_test,
                                                   y_pred=y_pred,
                                                   y_score=y_score,
                                                   **kwargs)

        fit_description = {
            "model": self.model.__class__.__name__,
            "arguments": model_args if model_args else "default",
            "type": self.model_props['type'],
            "algorithm": self.model_props['algorithm'],
            "dataset_props": self.dataset_props,
            "model_props": self.model_props,
            "data_path": self.data_path,
            "train_data_shape": x_train.shape,
            "test_data_shape": None if x_test is None else x_test.shape,
            "train_data_size": x_train.shape[0],
            "test_data_size": None if x_test is None else x_test.shape[0],
            "results_path": str(self.results_path),
            "model_path": str(self.default_model_path),
            "target": None if self.model_type == 'clustering' else self.target,
            "results_on_test_data": eval_results,
            "hyperparameter_search_results": hp_search_results
        }
        if self.model_type == 'clustering':
            clustering_res = {
                "cluster_centers": self.model.cluster_centers_,
                "cluster_labels": self.model.labels_
            }
            fit_description['clustering_results'] = clustering_res

        if cv_params:
            cv_res = {
                "fit_time": cv_results['fit_time'].tolist(),
                "score_time": cv_results['score_time'].tolist(),
                "test_score": cv_results['test_score'].tolist()
            }
            fit_description['cross_validation_params'] = cv_params
            fit_description['cross_validation_results'] = cv_res

        try:
            logger.info(f"saving fit description to {self.description_file}")
            with open(self.description_file, 'w', encoding='utf-8') as f:
                json.dump(fit_description, f, ensure_ascii=False, indent=4)
        except Exception as e:
            logger.exception(
                f"Error while storing the fit description file: {e}")

    def evaluate(self, **kwargs):
        """
        evaluate a pre-fitted model and save results to an evaluation.json file
        @return: None
        """
        x_val = None
        y_true = None
        eval_results = None

        try:
            model = self._load_model()
            if self.model_type != 'clustering':
                x_val, y_true = self._prepare_eval_data()
                y_pred = model.predict(x_val)
                y_score = model.predict_proba(
                    x_val) if self.model_type == 'classification' else None
                eval_results = self.get_evaluation(model=model,
                                                   x_test=x_val,
                                                   y_true=y_true,
                                                   y_pred=y_pred,
                                                   y_score=y_score,
                                                   **kwargs)
            else:
                x_val = self._prepare_clustering_data()
                y_pred = model.predict(x_val)
                eval_results = model.score(x_val, y_pred)

            logger.info(f"saving fit description to {self.evaluation_file}")
            with open(self.evaluation_file, 'w', encoding='utf-8') as f:
                json.dump(eval_results, f, ensure_ascii=False, indent=4)

        except Exception as e:
            logger.exception(f"error occured during evaluation: {e}")

    def predict(self):
        """
        use a pre-fitted model to make predictions and save them as csv
        @return: None
        """
        try:
            model = self._load_model(f=self.model_path)
            x_val = self._prepare_predict_data(
            )  # the same is used for clustering
            y_pred = model.predict(x_val)
            y_pred = _reshape(model.predict_proba(x_val)[:, 1]) if (
                type_of_target(y_pred) == 'binary'
                and self.model_type == 'classification') else _reshape(y_pred)
            logger.info(
                f"predictions shape: {y_pred.shape} | shape len: {len(y_pred.shape)}"
            )
            logger.info(f"predict on targets: {self.target}")
            df_pred = pd.DataFrame.from_dict({
                self.target[i]: y_pred[:,
                                       i] if len(y_pred.shape) > 1 else y_pred
                for i in range(len(self.target))
            })

            logger.info(f"saving the predictions to {self.prediction_file}")
            df_pred.to_csv(self.prediction_file)

        except Exception as e:
            logger.exception(f"Error while preparing predictions: {e}")

    @staticmethod
    def create_init_config_file(model_type=None,
                                model_name=None,
                                target=None,
                                *args,
                                **kwargs):
        path = configs.get('init_file_path', None)
        if not path:
            raise Exception("You need to provide a path for the init file")

        dataset_props = ModelTrainer.default_dataset_props
        model_props = ModelTrainer.default_model_props
        if model_type:
            logger.info(f"user selected model type = {model_type}")
            model_props['type'] = model_type
        if model_name:
            logger.info(f"user selected algorithm = {model_name}")
            model_props['algorithm'] = model_name

        logger.info(f"initalizing a default ModelTrainer.yaml in {path}")
        default_data = {
            "dataset":
            dataset_props,
            "model":
            model_props,
            "target": ['provide your target(s) here']
            if not target else [tg for tg in target.split()]
        }
        created = create_yaml(default_data, path)
        if created:
            logger.info(
                f"a default Model.yaml is created for you in {path}. "
                f"you just need to overwrite the values to meet your expectations"
            )
        else:
            logger.warning(
                f"something went wrong while initializing a default file")
Exemplo n.º 37
0
import flask
from flask import request, jsonify
import pandas as pd
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
import csv
import json

app = flask.Flask(__name__)
df = pd.read_csv("train_data1.csv")
df_X = df.iloc[:, 1:12].copy()  # Train Input
df_Y = df.iloc[:, 12:16].copy()  # Train Output
vendorArray = []
forest = RandomForestClassifier(n_estimators=100, random_state=1)
multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)
multi_target_forest_updated = MultiOutputClassifier(forest, n_jobs=-1)
multi_target_forest.fit(df_X, df_Y)

df = pd.read_csv("train_data2.csv")
df_X = df.iloc[:, 1:12].copy()  # Train Input
df_Y = df.iloc[:, 12:16].copy()  # Train Output
# coursesArray = []
forest1 = RandomForestClassifier(n_estimators=100, random_state=1)
multi_target_forest1 = MultiOutputClassifier(forest1, n_jobs=-1)
multi_target_forest_updated1 = MultiOutputClassifier(forest1, n_jobs=-1)
multi_target_forest1.fit(df_X, df_Y)


def findVendor(index):
    vendors = {
        0: "ACE American Insurance Company",
        1: "American Agri-Business Insurance Company",