Example #1
import math

import numpy as np
import pandas as pd
from itertools import chain

from IO import Input  #project-local loader for the dataset splits and labels


def compute_logloss(df_filenames, df_data):
    #STEP 1: clip values into (0, 1) so the log never sees an exact 0 or 1
    replacer = lambda x: max(float(min(x, 0.999999999999)), 0.0000000000000001)
    df_data = df_data.applymap(replacer)

    #STEP 2: rescale each row so its class probabilities sum to 1
    df_data = df_data.div(df_data.sum(axis=1), axis=0)

    #STEP 3: logloss
    #load the correct validation-set labels
    labels = Input.load_validationset_labels()
    df_labels = pd.get_dummies(labels)  #one-hot encoding, returns a DataFrame
    df_labels.columns = [
        'c0', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9'
    ]

    #sort the data into the same order as the labels
    correct_order = Input.load_validationset_filenames()
    current_order = list(df_filenames.values)
    indices = [current_order.index(filename) for filename in correct_order]
    df_data = df_data.reindex(indices)
    df_data = df_data.reset_index(drop=True)  #fresh 0..n-1 index to align with df_labels

    #select the probabilities of the correct classes only
    df_sparse_probs = df_data * df_labels
    probs = df_sparse_probs.values
    probs = list(chain.from_iterable(probs))  #flatten the 2D array
    probs = [p for p in probs if p != 0]  #keep only the correct-class entries

    #apply log to them and take the average
    log_probs = [math.log(p) for p in probs]
    return -(np.mean(log_probs))
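
For reference, a minimal self-contained check of the same multi-class log loss on toy data, comparing the manual computation above against sklearn.metrics.log_loss (the 3-sample toy labels and probabilities below are made up for illustration):

import numpy as np
from sklearn.metrics import log_loss

#toy data: 3 samples, 10 classes (hypothetical values, not project data)
labels = [0, 3, 9]
probs = np.random.default_rng(0).random((3, 10))
probs = probs / probs.sum(axis=1, keepdims=True)  #rows sum to 1

#mean negative log of the probability assigned to each correct class,
#which is exactly what compute_logloss returns
manual = -np.mean(np.log(probs[np.arange(3), labels]))
assert np.isclose(manual, log_loss(labels, probs, labels=list(range(10))))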
Example #2
from sklearn.svm import SVC
from sklearn.svm import NuSVC
from sklearn.svm import LinearSVC
import numpy as np
import pandas as pd
import pickle

from IO import Input, Output

'''
Helper function for use with DataFrame groupby: turns 3 rows of coordinates into a single flat row
'''
def transformXY(coords):
    return pd.Series(np.asarray(coords).ravel())
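
#A minimal usage sketch (not the author's code): assumes a hypothetical
#DataFrame with an image identifier and x/y coordinate columns, with all
#names below made up for illustration
_demo = pd.DataFrame({
    'image': ['img0', 'img0', 'img0', 'img1', 'img1', 'img1'],
    'x': [1.0, 2.0, 3.0, 7.0, 8.0, 9.0],
    'y': [4.0, 5.0, 6.0, 10.0, 11.0, 12.0],
})
#each group of 3 coordinate rows collapses into one flat row per image,
#ravelled row-major as x0, y0, x1, y1, x2, y2
_demo_wide = _demo.groupby('image')[['x', 'y']].apply(transformXY)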

#Load the file names of the various datasets
trainset_filenames = Input.load_trainset_filenames()
validationset_filenames = Input.load_validationset_filenames()
traindata_filenames = Input.load_traindata_filenames()
testset_filenames = Input.load_testdata_filenames()

#Load the features
feat = pd.read_csv('skinTrainFeatures.csv', index_col=0)

#Select the feature rows for each dataset (.loc replaces the removed .ix indexer)
x_trainset = feat.loc[trainset_filenames]
x_validationset = feat.loc[validationset_filenames]
x_testset = feat.loc[testset_filenames]
x_traindata = feat.loc[traindata_filenames]

#Load the labels for each dataset
y_trainset = np.asarray(Input.load_trainset_labels())
y_validationset = np.asarray(Input.load_validationset_labels())
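
The snippet ends before any of the imported classifiers is trained. A minimal continuation sketch, assuming the goal is class-probability estimates that can be scored with compute_logloss from Example #1 (the classifier settings and file name below are assumptions, not the author's code):

#fit an SVC with probability estimates enabled so predict_proba is available
clf = SVC(probability=True)
clf.fit(x_trainset, y_trainset)

#per-class probabilities for the validation set, one column per class
val_probs = clf.predict_proba(x_validationset)

#persist the trained model for later reuse
with open('svc_skin_features.pkl', 'wb') as f:
    pickle.dump(clf, f)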