def get_string_complex(ftype, data, alphabet=features.DNA,
                       order=WORDSTRING_ORDER, gap=WORDSTRING_GAP,
                       reverse=WORDSTRING_REVERSE):
    """Return complex StringFeatures.

    @param ftype Feature type, e.g. Word, Ulong
    @param data Train/test data for feature creation
    @param alphabet Alphabet for feature creation
    @param order Order of the feature
    @param gap Gap of the feature
    @param reverse Whether the feature is reversed
    @return Dict with complex StringFeatures train/test
    """
    feats = {}
    # Look up the feature class by name instead of eval'ing a code string.
    feat_class = getattr(features, 'String' + ftype + 'Features')

    charfeat = features.StringCharFeatures(data['train'], alphabet)
    feat = feat_class(alphabet)
    feat.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    feats['train'] = feat

    charfeat = features.StringCharFeatures(data['test'], alphabet)
    feat = feat_class(alphabet)
    feat.obtain_from_char(charfeat, order - 1, order, gap, reverse)
    feats['test'] = feat

    if ftype == 'Word' or ftype == 'Ulong':
        name = 'Sort' + ftype + 'String'
        return add_preproc(name, feats)
    else:
        return feats
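# A minimal usage sketch for get_string_complex, assuming the surrounding
# module imports shogun's `features` and defines the WORDSTRING_* defaults;
# the sample DNA strings and the helper's name are illustrative only.
def _example_get_string_complex():
    sample = {'train': ['ACGTACGT', 'TTGCAACG'],
              'test': ['ACGGTTAC', 'CGTACGTA']}
    # 'Word' takes the SortWordString preprocessor branch above.
    wordfeats = get_string_complex('Word', sample)
    return wordfeats['train'], wordfeats['test']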
def _stop_training(self):
    super(ShogunSVMClassifier, self)._stop_training()
    self.normalizer = _LabelNormalizer(self.labels)
    labels = self.normalizer.normalize(self.labels)
    # shogun expects float labels
    labels = sgFeatures.Labels(labels.astype(float))
    features = sgFeatures.RealFeatures(self.data.transpose())
    self.classifier.set_train_features(features, labels)
    self.classifier.train()
def bench_shogun(X, y, T, valid):
    #
    # .. Shogun ..
    #
    from shogun import Classifier, Features, Distance
    start = datetime.now()
    feat = Features.RealFeatures(X.T)
    distance = Distance.EuclidianDistance(feat, feat)
    labels = Features.Labels(y.astype(np.float64))
    test_feat = Features.RealFeatures(T.T)
    knn = Classifier.KNN(n_neighbors, distance, labels)
    knn.train()
    score = np.mean(knn.classify(test_feat).get_labels() == valid)
    return score, datetime.now() - start
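# A hedged driver for bench_shogun on synthetic data; it assumes the
# module-level imports (numpy as np, datetime) and the n_neighbors global
# that the benchmark itself relies on. All data here is random and purely
# illustrative.
def _example_bench_shogun():
    rng = np.random.RandomState(0)
    X = rng.randn(100, 5)
    y = (rng.randint(0, 2, 100) * 2 - 1).astype(np.float64)
    T = rng.randn(20, 5)
    valid = (rng.randint(0, 2, 20) * 2 - 1).astype(np.float64)
    score, elapsed = bench_shogun(X, y, T, valid)
    return score, elapsed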
def _label(self, x):
    """Classify the input data 'x'.

    :param x: The input data to classify.
    :return: The corresponding labels for the input.
    """
    test_features = sgFeatures.RealFeatures(x.transpose())
    labels = self.classifier.label(test_features)
    if self.normalizer:
        return self.normalizer.revert(labels)
    else:
        return labels
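# A sketch of the intended train/label round trip for ShogunSVMClassifier,
# assuming the usual MDP classifier-node interface (train(), stop_training(),
# label()); the node and data arguments are placeholders, not part of the
# original module.
def _example_train_and_label(node, train_data, train_labels, test_data):
    node.train(train_data, train_labels)
    node.stop_training()          # runs _stop_training above, fitting the SVM
    return node.label(test_data)  # dispatches to _label above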
def get_wd(data, order=WORDSTRING_ORDER):
    """Return WDFeatures.

    @param data Train/test data for feature creation
    @param order Order of the feature
    @return Dict with WDFeatures train/test
    """
    feats = {}

    charfeat = features.StringCharFeatures(data['train'], features.DNA)
    bytefeat = features.StringByteFeatures(features.RAWDNA)
    bytefeat.obtain_from_char(charfeat, 0, 1, 0, False)
    feats['train'] = features.WDFeatures(bytefeat, order, order)

    charfeat = features.StringCharFeatures(data['test'], features.DNA)
    bytefeat = features.StringByteFeatures(features.RAWDNA)
    bytefeat.obtain_from_char(charfeat, 0, 1, 0, False)
    feats['test'] = features.WDFeatures(bytefeat, order, order)

    return feats
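# Minimal usage sketch for get_wd; the DNA strings are made up and share one
# length, since weighted-degree features operate on equal-length sequences.
# The order value is an arbitrary example overriding WORDSTRING_ORDER.
def _example_get_wd():
    dna = {'train': ['ACGTACGT', 'TTGCAACG'],
           'test': ['ACGGTTAC']}
    wdfeats = get_wd(dna, order=3)
    return wdfeats['train'], wdfeats['test']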
def train_attribute(attribute_id, C, split=0):
    """Train an SVM for one attribute on the Animals-with-Attributes features
    and return (prediction, probabilities, Ltst) for the held-out test classes.
    """
    from shogun import Classifier, Features, Kernel, Distance
    attribute_id = int(attribute_id)
    print "# attribute ", attributenames[attribute_id]
    C = float(C)
    print "# C ", C

    if split == 0:
        train_classes = loadstr(
            '/nfs3group/chlgrp/datasets/Animals_with_Attributes/trainclasses.txt')
        test_classes = loadstr(
            '/nfs3group/chlgrp/datasets/Animals_with_Attributes/testclasses.txt')
    else:
        classnames = loadstr(
            '/nfs3group/chlgrp/datasets/Animals_with_Attributes/classnames.txt')
        startid = (split - 1) * 10
        stopid = split * 10
        test_classes = classnames[startid:stopid]
        train_classes = classnames[0:startid] + classnames[stopid:]

    Xtrn, Ltrn = create_data(train_classes, attribute_id)
    Xtst, Ltst = create_data(test_classes, attribute_id)

    if min(Ltrn) == max(Ltrn):  # only 1 class: fall back to the prior
        Lprior = mean(Ltrn)
        prediction = sign(Lprior) * ones(len(Ltst))
        probabilities = 0.1 + 0.8 * 0.5 * (Lprior + 1.) * ones(len(Ltst))  # fallback
        return prediction, probabilities, Ltst

    #sg('loglevel', 'WARN')
    # Kernel width per feature channel: median chi-square distance on a
    # subsample of the training data (median heuristic).
    widths = {}
    for feature in all_features:
        traindata = array(Xtrn[feature][:, ::50], float)  # used to be 5*offset
        trainfeat = Features.RealFeatures(traindata)
        DM = Distance.ChiSquareDistance(trainfeat, trainfeat).get_distance_matrix()
        widths[feature] = median(DM.flatten())
        del traindata, trainfeat, DM

    s = Classifier.LibSVM()  #sg('new_svm', 'LIBSVM')
    #sg('use_mkl', False)  # we use fixed weights here
    #sg('clean_features', 'TRAIN')
    #sg('clean_features', 'TEST')

    Lplatt_trn = concatenate([Ltrn[i::10] for i in range(9)])  # 90% for training
    Lplatt_val = Ltrn[9::10]  # remaining 10% for Platt scaling

    feats_trn = Features.CombinedFeatures()
    feats_val = Features.CombinedFeatures()
    for feature in all_features:
        Xplatt_trn = concatenate([Xtrn[feature][:, i::10] for i in range(9)], axis=1)
        feats_trn.append_feature_obj(
            Features.RealFeatures(ascontiguousarray(Xplatt_trn)))
        #sg('add_features', 'TRAIN', Xplatt_trn)
        Xplatt_val = Xtrn[feature][:, 9::10]
        feats_val.append_feature_obj(
            Features.RealFeatures(ascontiguousarray(Xplatt_val)))
        #sg('add_features', 'TEST', Xplatt_val)
        del Xplatt_trn, Xplatt_val, Xtrn[feature]

    labels_trn = Features.Labels(Lplatt_trn)  #sg('set_labels', 'TRAIN', Lplatt_trn)

    kernel = Kernel.CombinedKernel()  #sg('set_kernel', 'COMBINED', 5000)
    for featureset in all_features:
        kernel.append_kernel(Kernel.Chi2Kernel(5000, widths[featureset] / 5.))
        #sg('add_kernel', 1., 'CHI2', 'REAL', 10, widths[featureset]/5.)
    kernel.init(feats_trn, feats_trn)

    K = kernel.get_kernel_matrix()
    K.tofile('/scratch/chl/cvfold%d_C%g_%02d-trn.kernel' % (split, C, attribute_id))
    del K

    s.set_max_train_time(600 * 60.)  #sg('svm_max_train_time', 600*60.)  # ten hours should be plenty
    s.set_C(C, C)  #sg('c', C)
    s.set_kernel(kernel)
    s.set_labels(labels_trn)
    #sg('init_kernel', 'TRAIN')
    try:
        s.train()  #sg('train_classifier')
    except (RuntimeWarning, RuntimeError):
        # can't train, e.g. all samples have the same labels
        Lprior = mean(Ltrn)
        prediction = sign(Lprior) * ones(len(Ltst))
        probabilities = 0.1 + 0.8 * 0.5 * (Lprior + 1.) * ones(len(Ltst))
        savetxt('./DAP/cvfold%d_C%g_%02d.txt' % (split, C, attribute_id), prediction)
        savetxt('./DAP/cvfold%d_C%g_%02d.prob' % (split, C, attribute_id), probabilities)
        savetxt('./DAP/cvfold%d_C%g_%02d.labels' % (split, C, attribute_id), Ltst)
        return prediction, probabilities, Ltst

    bias = s.get_bias()
    alphas = s.get_alphas()
    #[bias, alphas] = sg('get_svm')
    #print bias, alphas

    # Predict on the held-out validation split to fit the Platt sigmoid.
    kernel.init(feats_trn, feats_val)
    K = kernel.get_kernel_matrix()
    K.tofile('/scratch/chl/cvfold%d_C%g_%02d-val.kernel' % (split, C, attribute_id))
    del K
    #sg('init_kernel', 'TEST')
    try:
        prediction = s.classify().get_labels()
        #prediction = sg('classify')
        platt_params = SigmoidTrain(prediction, Lplatt_val)
        probabilities = SigmoidPredict(prediction, platt_params)
        savetxt('./DAP/cvfold%d_C%g_%02d-val.txt' % (split, C, attribute_id), prediction)
        savetxt('./DAP/cvfold%d_C%g_%02d-val.prob' % (split, C, attribute_id), probabilities)
        savetxt('./DAP/cvfold%d_C%g_%02d-val.labels' % (split, C, attribute_id), Lplatt_val)
        savetxt('./DAP/cvfold%d_C%g_%02d-val.platt' % (split, C, attribute_id), platt_params)
        #print '#train-perf ', attribute_id, C, mean((prediction*Lplatt_val)>0), mean(Lplatt_val>0)
        #print '#platt-perf ', attribute_id, C, mean((sign(probabilities-0.5)*Lplatt_val)>0), mean(Lplatt_val>0)
    except RuntimeError:
        Lprior = mean(Ltrn)
        prediction = sign(Lprior) * ones(len(Ltst))
        probabilities = 0.1 + 0.8 * 0.5 * (Lprior + 1.) * ones(len(Ltst))
        print >> sys.stderr, "#Error during testing. Using constant Platt scaling"
        platt_params = [1., 0.]

    # ----------------------------- now apply to test classes ------------------
    feats_tst = Features.CombinedFeatures()
    #sg('clean_features', 'TEST')
    for feature in all_features:
        feats_tst.append_feature_obj(
            Features.RealFeatures(ascontiguousarray(Xtst[feature])))
        del Xtst[feature]

    kernel.init(feats_trn, feats_tst)
    K = kernel.get_kernel_matrix()
    K.tofile('/scratch/chl/cvfold%d_C%g_%02d-tst.kernel' % (split, C, attribute_id))
    del K
    #sg('init_kernel', 'TEST')
    prediction = s.classify().get_labels()
    #prediction = sg('classify')
    probabilities = SigmoidPredict(prediction, platt_params)
    savetxt('./DAP/cvfold%d_C%g_%02d.txt' % (split, C, attribute_id), prediction)
    savetxt('./DAP/cvfold%d_C%g_%02d.prob' % (split, C, attribute_id), probabilities)
    savetxt('./DAP/cvfold%d_C%g_%02d.labels' % (split, C, attribute_id), Ltst)
    #print '#test-perf ', attribute_id, C, mean((prediction*Ltst)>0), mean(Ltst>0)
    #print '#platt-perf ', attribute_id, C, mean((sign(probabilities-0.5)*Ltst)>0), mean(Ltst>0)
    return prediction, probabilities, Ltst
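# An illustrative call into train_attribute; the attribute id, C, and split
# values are arbitrary examples, and the Animals-with-Attributes dataset paths
# plus the /scratch/chl and ./DAP output directories hard-coded in the function
# must exist for this to run.
def _example_train_attribute():
    prediction, probabilities, Ltst = train_attribute(7, C=10.0, split=0)
    # Signed agreement with ground truth; mean is assumed star-imported from
    # numpy, as in the function body above.
    return mean((prediction * Ltst) > 0)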