import math
from itertools import chain

import numpy as np
import pandas as pd

import Input  # project module providing the dataset loaders


def compute_logloss(df_filenames, df_data):
    #STEP 1: clip values away from 0 and 1 so the log is always defined
    replacer = lambda x: max(float(min(x, 0.999999999999)), 0.0000000000000001)
    df_data = df_data.applymap(replacer)

    #STEP 2: rescale so each row sums to 1 again after clipping
    df_subsum = df_data.sum(axis=1)
    df_sum = pd.concat([df_subsum] * 10, axis=1)
    df_sum.columns = ['c0', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9']
    df_data = df_data / df_sum

    #STEP 3: logloss
    #load the correct validation-set labels
    labels = Input.load_validationset_labels()
    df_labels = pd.get_dummies(labels)  #one-hot encoding, returns a DataFrame
    df_labels.columns = ['c0', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9']

    #sort the data so it has the same order as the labels
    correct_order = Input.load_validationset_filenames()
    current_order = list(df_filenames.values)
    indices = [current_order.index(filename) for filename in correct_order]
    df_data = df_data.reindex(indices)
    df_data = df_data.reset_index()  #reset index; old indices become column 'index'
    df_data = df_data.drop('index', axis=1)  #remove this new column 'index'

    #select the probabilities of the correct classes only
    df_sparse_probs = df_data * df_labels
    probs = df_sparse_probs.values
    probs = list(chain.from_iterable(probs))  #flatten the nested list
    probs = [p for p in probs if p != 0]  #remove all zeros (wrong-class entries)

    #apply the log and take the average
    log_probs = [math.log(p) for p in probs]
    return -np.mean(log_probs)
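For reference, a minimal usage sketch of compute_logloss. It assumes the predictions are a DataFrame with one row per validation image and columns c0..c9; the file name validation_predictions.csv is only a placeholder and not part of the pipeline above.

# Usage sketch (hypothetical predictions file; columns assumed to be c0..c9).
df_pred = pd.read_csv('validation_predictions.csv', index_col=0)
df_files = pd.Series(Input.load_validationset_filenames())  #filenames in the same row order as df_pred
score = compute_logloss(df_files, df_pred)
print('validation logloss: %.5f' % score)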
#Load the filenames for each dataset
validationset_filenames = Input.load_validationset_filenames()
traindata_filenames = Input.load_traindata_filenames()
testset_filenames = Input.load_testdata_filenames()

#Load the features
feat = pd.read_csv('skinTrainFeatures.csv', index_col=0)

#Select the features for each dataset
x_trainset = feat.loc[trainset_filenames]
x_validationset = feat.loc[validationset_filenames]
x_testset = feat.loc[testset_filenames]
x_traindata = feat.loc[traindata_filenames]

#Load the labels for each dataset
y_trainset = np.asarray(Input.load_trainset_labels())
y_validationset = np.asarray(Input.load_validationset_labels())
y_traindata = np.asarray(Input.load_traindata_labels())

#Restructure the features so they can be used in the SVM
x_trainset = x_trainset.groupby(x_trainset.index).apply(transformXY)
x_validationset = x_validationset.groupby(x_validationset.index).apply(transformXY)
x_testset = x_testset.groupby(x_testset.index).apply(transformXY)
x_traindata = x_traindata.groupby(x_traindata.index).apply(transformXY)

#Normalise the data (per-row min-max scaling; sub/div with axis=0 so the
#row-wise statistics broadcast over the columns correctly)
df = x_traindata.iloc[:, 1:]
df_norm = df.sub(df.mean(axis=1), axis=0).div(df.max(axis=1) - df.min(axis=1), axis=0)
x_traindata = df_norm

#Train classifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

clf = OneVsRestClassifier(SVC(C=0.1, kernel='poly', probability=True))
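A possible continuation of this step, sketched under two assumptions that the snippet above does not guarantee: that transformXY yields one feature row per image (so the frame's index is the filename and rows line up with the label arrays), and that the validation features are normalised the same way as the training features before scoring.

# Sketch only: fit the one-vs-rest SVM and score its probability estimates
# with compute_logloss defined above.
clf.fit(x_traindata.values, y_traindata)

#apply the same per-row min-max normalisation to the validation features
x_val = x_validationset.iloc[:, 1:]
x_val = x_val.sub(x_val.mean(axis=1), axis=0).div(x_val.max(axis=1) - x_val.min(axis=1), axis=0)

probs = clf.predict_proba(x_val.values)  #shape (n_images, 10), class order assumed c0..c9
df_pred = pd.DataFrame(probs, columns=['c0', 'c1', 'c2', 'c3', 'c4',
                                       'c5', 'c6', 'c7', 'c8', 'c9'])
df_files = pd.Series(x_validationset.index)  #filenames in prediction order
print('validation logloss: %.5f' % compute_logloss(df_files, df_pred))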