示例#1
0
def run_cv(s, fold, base_folder, params, dataset, classifier_builder):
    """Run a single cross-validation fold, or build the final model.

    Nothing is done (beyond printing "will not run") when a model file for
    this fold already exists under ``base_folder + "/models/"``.

    Args:
        s: Value handed to ``set_seeds``.
        fold: Fold to run; ``-1`` selects the "final model" path, which
            trains and evaluates on the whole ``dataset``.
        base_folder: Root folder for models, predictions, projections and
            the ``lcs`` files.
        params: Keyword arguments forwarded to ``classifier_builder``.
        dataset: Dataset passed to the classifier / ``CrossValidation``.
        classifier_builder: Callable that builds a classifier from
            ``params``.

    Returns:
        None.
    """
    # Skip when this fold's model (or, for fold -1, the final net) exists.
    already_done = file_exists(
        base_folder + "/models/model_" + str(fold) + "_model")
    if not already_done and fold == -1:
        already_done = file_exists(base_folder + "/models/model_final_net")
    if already_done:
        print("will not run")
        return

    # set random seeds
    set_seeds(s)

    create_dir_structure(base_folder)

    if fold != -1:
        # Regular case: run just this fold; CrossValidation is given the
        # base folder for whatever it stores.
        runner = CrossValidation(dataset, classifier_builder, params,
                                 base_folder)
        runner.run(fold)
        return

    # fold == -1: build the final model on the full dataset.
    model = classifier_builder(**params)
    model.train(dataset, lc_file=base_folder + "/lcs/lc_fold_-1")

    sample_ids, predictions, classes_d = model.evaluate(dataset)
    CrossValidation.write_results(sample_ids, predictions, classes_d, -1,
                                  base_folder + "/predictions/")
    model.save_model(base_folder + "/models/model_final")

    projections_dir = base_folder + "/projections/"
    fold_tag = str(fold)

    # Projection of the whole dataset.
    sample_ids, projection, _ = model.get_projection(dataset)
    cross_validation.save_projections(
        projection, projections_dir + "fold_train_" + fold_tag, sample_ids)

    # One projection per class column of the evaluation output.
    for class_name in classes_d.columns:
        sample_ids, projection, _ = model.get_projection(dataset, class_name)
        cross_validation.save_projections(
            projection,
            projections_dir + "class_fold_train_" + class_name + "_" + fold_tag,
            sample_ids)
# Configure the network object `rna` (created outside this excerpt).
# Judging by the setter names: 30 input dimensions, a hidden layer of 31
# neurons and a single output neuron, both layers using "tanh" -- confirm
# against the RNA class.
rna.setImputDimNeurons(30)
rna.setNumberNeuronsHiddenLayer(31)
rna.setActivationFunctionHiddenLayer("tanh")
rna.setNumberNeuronsOutputLayer(1)
rna.setActivationFunctionOutputLayer("tanh")
# Wrap the configured network in the classifier handed to cross validation.
rna_classifier = RnaClassifier()
rna_classifier.setRna(rna)

# Preprocessor for categorical attributes
preprocessor = Preprocessor()
preprocessor.setColumnsCategory(['protocol_type', 'service', 'flag'])

evaluate = EvaluateModule()

cross = CrossValidation()

# Set which iteration the cross validation is currently on
cross.setIteration(1)

cross.setPreprocessor(preprocessor)

# Input path (presumably the folder holding the data sub-bases -- confirm).
cross.setFilePath("../../bases/sub_bases_nslkdd_30_attribute/")

# Output path for this configuration's results.
cross.setResultPath("../../results/30_attribute/rna_oculta_31_time/")

cross.setClassifier(rna_classifier)

cross.setEvaluateModule(evaluate)

# Everything is wired up; start the run.
cross.run()
# from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import numpy as np

from feature_engineering import FeatureEngineering
from cross_validation import CrossValidation
from multi_log_loss import multi_log_loss

# Build the label table and feature matrix from the train/test device tables
# plus the listed feature files.  The commented-out paths are feature files
# that are currently switched off.
f = FeatureEngineering('../data/gender_age_train.csv',
                       '../data/gender_age_test.csv',
                       'device_id',
                       wide_files=[#'../features/apps_per_event.csv', '../features/avg_position.csv',
                                   #'../features/count_by_hour.csv', '../features/count_by_period.csv',
                                   '../features/event_counts.csv', '../features/sd_position.csv'],
                       long_files=[#'../features/active_app_category_counts.csv',
                                   #'../features/installed_app_category_counts.csv',
                                   '../features/phone_brand.csv'])
labels, features, colnames = f.extract_features()

# Re-index both frames 0..n-1 so the row numbers collected below are valid
# index labels as well as positions.
labels.set_index(np.arange(labels.shape[0]), inplace=True)
colnames.set_index(np.arange(colnames.shape[0]), inplace=True)

# Rows with a known age form the training set; rows whose age is NaN form the
# test set.  The mask is computed once and used for both splits.
age_is_missing = [np.isnan(x) for x in labels['age'].tolist()]
train_filter = [i for i, missing in enumerate(age_is_missing) if not missing]
test_filter = [i for i, missing in enumerate(age_is_missing) if missing]

# `.ix` was deprecated in pandas 0.20 and removed in 1.0; `.loc` selects the
# same rows here because the index was just reset to 0..n-1.
cv = CrossValidation(features[train_filter, :],
                     labels.loc[train_filter, 'group'],
                     features[test_filter, :],
                     multi_log_loss)
model = MultinomialNB()
# Make `predict` return class probabilities (presumably what cv.run calls and
# what multi_log_loss expects -- confirm against CrossValidation).
model.predict = model.predict_proba
out = cv.run(model, 'test')
    @staticmethod
    def filter_X(X):
        # pass list so a DataFrame is returned
        return X.loc[:, ['age']]


def my_roc(actual, predicted):
    """Return ``roc_auc_score`` for ``predicted`` against ``actual``.

    Every element of ``actual`` is converted with ``int`` before scoring.
    """
    int_labels = list(map(int, actual))
    return roc_auc_score(int_labels, predicted)


# Training rows: encode gender as 1.0 for 'M' and 0.0 otherwise, and flag the
# rows as training data.
train = pd.read_csv('gender_age_train.csv')
train['gender'] = train['gender'].eq('M').astype(float)
train['train_fl'] = True

# Test rows: fill the target columns with placeholders and flag the rows as
# non-training data.
test = pd.read_csv('gender_age_test.csv').assign(
    gender='',
    age=-1,
    group='',
    train_fl=False,
)

# Stack both sets; the 'train_fl' column tells them apart.
data = pd.concat([train, test])

# Cross-validate MyLasso on 'gender', scored with my_roc.
cv = CrossValidation(
    data, my_roc, 'gender', 'train_fl', id_col='device_id', logged=True)
model = MyLasso()
cv.run(model, 'test_age_gender')
示例#5
0
def _train(current_experiment, train_data_folder_path,
           train_labels_folder_path, train_ids_path):
    """Run cross validation for ``current_experiment`` on the training data.

    Args:
        current_experiment: Passed to the ``CrossValidation`` constructor.
        train_data_folder_path: First argument of ``CrossValidation.run``.
        train_labels_folder_path: Second argument of ``CrossValidation.run``.
        train_ids_path: Third argument of ``CrossValidation.run``.
    """
    # Deliberately imported here rather than at module level: the import has
    # to happen after the CUDA_VISIBLE_DEVICES environment variable is set,
    # otherwise it won't work.
    from cross_validation import CrossValidation

    CrossValidation(current_experiment).run(
        train_data_folder_path, train_labels_folder_path, train_ids_path)