def __assign_shogun_version():
    """Assign shogun versions"""
    if 'shogun' in versions:
        return
    import shogun.Classifier as __sc
    versions['shogun:rev'] = __sc.Version_get_version_revision()
    ver = __sc.Version_get_version_release().lstrip('v')
    versions['shogun:full'] = ver
    if '_' in ver:
        ver = ver[:ver.index('_')]
    versions['shogun'] = ver
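# After the call above, the module-level `versions` dict (defined elsewhere in
# this module) is assumed to look roughly like the following; the concrete
# revision and release strings are made up for illustration:
#
#   versions == {'shogun:rev': 4889,
#                'shogun:full': '0.9.1_r4889',
#                'shogun': '0.9.1'}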
def _run_lda():
    """Run Linear Discriminant Analysis classifier."""
    params = {
        'name': 'LDA',
        'type': 'lda',
        'gamma': 0.1,
        'num_threads': 1,
        'data': dataop.get_clouds(2),
        'feature_class': 'simple',
        'feature_type': 'Real',
        'label_type': 'twoclass',
        'accuracy': 1e-7
    }
    feats = featop.get_features(
        params['feature_class'], params['feature_type'], params['data'])
    params['labels'], labels = dataop.get_labels(
        feats['train'].get_num_vectors(), params['label_type'])

    lda = classifier.LDA(params['gamma'], feats['train'], labels)
    lda.parallel.set_num_threads(params['num_threads'])
    lda.train()

    lda.set_features(feats['test'])
    params['classified'] = lda.classify().get_labels()

    output = fileop.get_output(category.CLASSIFIER, params)
    fileop.write(category.CLASSIFIER, output)
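# A minimal standalone sketch of the same LDA call outside the testsuite
# helpers, using only API calls that appear in this file (old shogun modular
# Python interface); the random data and gamma value are made up for
# illustration:
#
#   import numpy as np
#   from shogun import Classifier, Features
#
#   X = np.random.randn(2, 100)                       # 2 dims x 100 vectors
#   y = np.where(np.random.rand(100) > 0.5, 1., -1.)  # twoclass labels
#   feats = Features.RealFeatures(X)
#   labels = Features.Labels(y)
#   lda = Classifier.LDA(0.1, feats, labels)          # gamma=0.1 as above
#   lda.train()
#   predictions = lda.classify().get_labels()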
def _run_perceptron():
    """Run Perceptron classifier."""
    params = {
        'name': 'Perceptron',
        'type': 'perceptron',
        'num_threads': 1,
        'learn_rate': .1,
        'max_iter': 1000,
        'data': dataop.get_clouds(2),
        'feature_class': 'simple',
        'feature_type': 'Real',
        'label_type': 'twoclass',
        'accuracy': 1e-7
    }
    feats = featop.get_features(
        params['feature_class'], params['feature_type'], params['data'])
    num_vec = feats['train'].get_num_vectors()
    params['labels'], labels = dataop.get_labels(num_vec, params['label_type'])

    perceptron = classifier.Perceptron(feats['train'], labels)
    perceptron.parallel.set_num_threads(params['num_threads'])
    perceptron.set_learn_rate(params['learn_rate'])
    perceptron.set_max_iter(params['max_iter'])
    perceptron.train()

    params['bias'] = perceptron.get_bias()
    perceptron.set_features(feats['test'])
    params['classified'] = perceptron.classify().get_labels()

    output = fileop.get_output(category.CLASSIFIER, params)
    fileop.write(category.CLASSIFIER, output)
def bench_shogun(X, y, T, valid):
    #
    # .. Shogun ..
    #
    from shogun import Classifier, Features, Distance
    start = datetime.now()
    feat = Features.RealFeatures(X.T)
    distance = Distance.EuclidianDistance(feat, feat)
    labels = Features.Labels(y.astype(np.float64))
    test_feat = Features.RealFeatures(T.T)
    knn = Classifier.KNN(n_neighbors, distance, labels)
    knn.train()
    score = np.mean(knn.classify(test_feat).get_labels() == valid)
    return score, datetime.now() - start
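# A hedged usage sketch for the benchmark above. bench_shogun() expects X and T
# as (n_samples, n_features) arrays and relies on a module-level n_neighbors;
# the random data below is made up for illustration:
#
#   import numpy as np
#
#   n_neighbors = 5
#   X = np.random.randn(100, 10)                          # training samples
#   y = np.where(np.random.rand(100) > 0.5, 1., -1.)      # training labels
#   T = np.random.randn(30, 10)                           # test samples
#   valid = np.where(np.random.rand(30) > 0.5, 1., -1.)   # test labels
#   score, duration = bench_shogun(X, y, T, valid)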
def __check_shogun(bottom_version, custom_versions=[]):
    """Check if the version of shogun is high enough (or a known custom one)
    to be enabled in the testsuite

    Parameters
    ----------
    bottom_version : int
      Bottom version which must be satisfied
    custom_versions : list of int
      Arbitrary list of versions which may have been patched for a specific issue
    """
    import shogun.Classifier as __sc
    ver = __sc.Version_get_version_revision()
    __assign_shogun_version()
    if (ver in custom_versions) or (ver >= bottom_version):
        return True
    else:
        raise ImportError, 'Version %s is smaller than needed %s' \
              % (ver, bottom_version)
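# Typical use inside the testsuite, assuming revisions are the plain integers
# returned by Version_get_version_revision(); the numbers here are made up:
#
#   __check_shogun(3200)                           # require at least rev 3200
#   __check_shogun(3200, custom_versions=[2456])   # or a known patched build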
def _run_knn():
    """Run K-Nearest-Neighbour classifier."""
    params = {
        'name': 'EuclidianDistance',
        'data': dataop.get_clouds(2),
        'feature_class': 'simple',
        'feature_type': 'Real'
    }
    feats = featop.get_features(
        params['feature_class'], params['feature_type'], params['data'])
    dfun = eval(params['name'])
    distance = dfun(feats['train'], feats['train'])
    output = fileop.get_output(category.DISTANCE, params)

    params = {
        'name': 'KNN',
        'type': 'knn',
        'num_threads': 1,
        'k': 3,
        'label_type': 'twoclass',
        'accuracy': 1e-8
    }
    params['labels'], labels = dataop.get_labels(
        feats['train'].get_num_vectors(), params['label_type'])

    knn = classifier.KNN(params['k'], distance, labels)
    knn.parallel.set_num_threads(params['num_threads'])
    knn.train()

    distance.init(feats['train'], feats['test'])
    params['classified'] = knn.classify().get_labels()

    output.update(fileop.get_output(category.CLASSIFIER, params))
    fileop.write(category.CLASSIFIER, output)
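# Note: the distance object is re-initialized with (train, test) before
# knn.classify() is called with no argument, so the classifier scores the test
# vectors; this is the distance-based counterpart of knn.classify(test_feat)
# in bench_shogun() above.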
def parse_config_file(config_files, feature_file_suffix='.bed'):
    """Return classifiers."""
    valid_words = ['CLASSIFIER', 'KERNEL', 'KERNEL_NAME', 'KERNEL_NORM',
                   'FEATURE', 'BIN_FEATURE', 'CON_FEATURE', 'PP_FEATURE',
                   'SEQ', 'END', 'C', 'MKL_NORM', 'REV_COMP']
    classifiers = []
    for config_file in config_files:
        name = None; kernels = []; c = None
        bin_features = []; con_features = []; pp_features = []
        seqs = []; ks = []; kern_names = []
        kern_norms = []
        rev_comps = []
        mkl_norm = 2
        line_num = 0
        for line in open(config_file):
            line_num += 1
            line = line.strip()
            if line.startswith('#') or not line:
                continue
            t = line.split()
            if t[0] not in valid_words:
                sys.exit('ERROR! %s not recognized in line %d: %s.'
                         % (t[0], line_num, line))
            elif t[0] == 'CLASSIFIER':
                name = '_'.join(t[1:])
            elif t[0] == 'KERNEL':
                bin_features.append([])
                con_features.append([])
                pp_features.append([])
                seqs.append([])
                kern_norms.append(None)
                kern_names.append(None)
                rev_comps.append(False)
                ks.append(None)
                if t[1] == 'Linear':
                    kernels.append(LinearKernel())
                elif t[1].startswith('CommWordString_') or t[1].startswith('Spectrum_'):
                    k = int(t[1].split('_')[-1])
                    ks[-1] = k
                    kernels.append(CommWordStringKernel(10, False))
                elif t[1].startswith('Gaussian_'):
                    sigma = float(t[1].split('_')[-1])
                    kernels.append(GaussianKernel(10, sigma))
                else:
                    sys.exit('ERROR! %s is not a valid kernel.' % t[1])
            elif t[0] == 'KERNEL_NORM':
                if t[1] == 'VarianceKernelNormalizer':
                    kern_norms[-1] = VarianceKernelNormalizer()
                    kernels[-1].set_normalizer(VarianceKernelNormalizer())
                elif t[1] == 'SqrtDiagKernelNormalizer':
                    kern_norms[-1] = SqrtDiagKernelNormalizer()
                    kernels[-1].set_normalizer(SqrtDiagKernelNormalizer())
                elif t[1] == 'AvgDiagKernelNormalizer':
                    kern_norms[-1] = AvgDiagKernelNormalizer()
                    kernels[-1].set_normalizer(AvgDiagKernelNormalizer())
                else:
                    sys.exit('ERROR! %s is not a recognized kernel normalizer.' % t[1])
            elif t[0] == 'KERNEL_NAME':
                kern_names[-1] = t[1]
            elif t[0] == 'REV_COMP':
                rev_comps[-1] = bool(int(t[1]))
            elif t[0] == 'BIN_FEATURE' or t[0] == 'FEATURE':  # FEATURE kept for backward compatibility
                bf_path = t[1]
                if os.path.isdir(bf_path):
                    for fn in os.listdir(bf_path):
                        if not fn.endswith(feature_file_suffix):
                            continue
                        full_fn = bf_path + ('/' if bf_path[-1] != '/' else '') + fn
                        if full_fn not in bin_features[-1]:
                            bin_features[-1].append(full_fn)
                else:
                    bin_features[-1].append(bf_path)
            elif t[0] == 'CON_FEATURE':
                cf_path = t[1]
                if os.path.isdir(cf_path):
                    for fn in os.listdir(cf_path):
                        if not fn.endswith(feature_file_suffix):
                            continue
                        full_fn = cf_path + ('/' if cf_path[-1] != '/' else '') + fn
                        if full_fn not in con_features[-1]:
                            con_features[-1].append(full_fn)
                else:
                    con_features[-1].append(cf_path)
            elif t[0] == 'PP_FEATURE':
                pf_path = t[1]
                if os.path.isdir(pf_path):
                    for fn in os.listdir(pf_path):
                        if not fn.endswith(feature_file_suffix):
                            continue
                        full_fn = pf_path + ('/' if pf_path[-1] != '/' else '') + fn
                        if full_fn not in pp_features[-1]:
                            pp_features[-1].append(full_fn)
                else:
                    pp_features[-1].append(pf_path)
            elif t[0] == 'SEQ':
                seqs[-1].append(t[1])
            elif t[0] == 'C':
                c = [float(x) for x in t[1:]]
            elif t[0] == 'MKL_NORM':
                mkl_norm = int(t[1])
            elif t[0] == 'END':
                print '\t', name
                classifiers.append(Classifier(name, kernels, bin_features,
                                              con_features,
                                              pp_features=pp_features,
                                              seqs=seqs,
                                              kern_norms=kern_norms,
                                              ks=ks, c=c,
                                              kern_names=kern_names,
                                              mkl_norm=mkl_norm,
                                              rev_comps=rev_comps))
                # reset per-classifier state for the next block
                # (the original left con_features un-reset here, which was a bug)
                name = None; kernels = []; c = None
                bin_features = []; con_features = []; pp_features = []
                seqs = []; ks = []
                kern_norms = []
                kern_names = []
                rev_comps = []
                mkl_norm = 2
    return classifiers
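# A hypothetical config file that parse_config_file() above would accept,
# inferred from the keywords it recognizes; the classifier name, file paths,
# and parameter values are made up for illustration:
#
#   CLASSIFIER promoter_vs_background
#   KERNEL Spectrum_4
#   KERNEL_NAME spectrum_k4
#   KERNEL_NORM SqrtDiagKernelNormalizer
#   REV_COMP 1
#   SEQ sequences/promoters.fa
#   KERNEL Gaussian_1.0
#   BIN_FEATURE annotations/
#   C 0.1 1 10
#   MKL_NORM 1
#   END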
def train_attribute(attribute_id, C, split=0):
    from shogun import Classifier, Features, Kernel, Distance
    attribute_id = int(attribute_id)
    print "# attribute ", attributenames[attribute_id]
    C = float(C)
    print "# C ", C

    if split == 0:
        train_classes = loadstr('/nfs3group/chlgrp/datasets/Animals_with_Attributes/trainclasses.txt')
        test_classes = loadstr('/nfs3group/chlgrp/datasets/Animals_with_Attributes/testclasses.txt')
    else:
        classnames = loadstr('/nfs3group/chlgrp/datasets/Animals_with_Attributes/classnames.txt')
        startid = (split - 1) * 10
        stopid = split * 10
        test_classes = classnames[startid:stopid]
        train_classes = classnames[0:startid] + classnames[stopid:]

    Xtrn, Ltrn = create_data(train_classes, attribute_id)
    Xtst, Ltst = create_data(test_classes, attribute_id)

    if min(Ltrn) == max(Ltrn):  # only 1 class
        Lprior = mean(Ltrn)
        prediction = sign(Lprior) * ones(len(Ltst))
        probabilities = 0.1 + 0.8 * 0.5 * (Lprior + 1.) * ones(len(Ltst))  # fallback
        return prediction, probabilities, Ltst

    #sg('loglevel', 'WARN')
    widths = {}
    for feature in all_features:
        traindata = array(Xtrn[feature][:, ::50], float)  # used to be 5*offset
        trainfeat = Features.RealFeatures(traindata)
        DM = Distance.ChiSquareDistance(trainfeat, trainfeat).get_distance_matrix()
        widths[feature] = median(DM.flatten())
        del traindata, trainfeat, DM

    s = Classifier.LibSVM()
    #sg('new_svm', 'LIBSVM')
    #sg('use_mkl', False)  # we use fixed weights here
    #sg('clean_features', 'TRAIN')
    #sg('clean_features', 'TEST')

    Lplatt_trn = concatenate([Ltrn[i::10] for i in range(9)])  # 90% for training
    Lplatt_val = Ltrn[9::10]  # remaining 10% for Platt scaling
    feats_trn = Features.CombinedFeatures()
    feats_val = Features.CombinedFeatures()
    for feature in all_features:
        Xplatt_trn = concatenate([Xtrn[feature][:, i::10] for i in range(9)], axis=1)
        feats_trn.append_feature_obj(Features.RealFeatures(ascontiguousarray(Xplatt_trn)))
        #sg('add_features', 'TRAIN', Xplatt_trn)
        Xplatt_val = Xtrn[feature][:, 9::10]
        feats_val.append_feature_obj(Features.RealFeatures(ascontiguousarray(Xplatt_val)))
        #sg('add_features', 'TEST', Xplatt_val)
        del Xplatt_trn, Xplatt_val, Xtrn[feature]

    labels_trn = Features.Labels(Lplatt_trn)
    #sg('set_labels', 'TRAIN', Lplatt_trn)

    kernel = Kernel.CombinedKernel()
    #sg('set_kernel', 'COMBINED', 5000)
    for featureset in all_features:
        kernel.append_kernel(Kernel.Chi2Kernel(5000, widths[featureset] / 5.))
        #sg('add_kernel', 1., 'CHI2', 'REAL', 10, widths[featureset]/5.)
    kernel.init(feats_trn, feats_trn)
    K = kernel.get_kernel_matrix()
    K.tofile('/scratch/chl/cvfold%d_C%g_%02d-trn.kernel' % (split, C, attribute_id))
    del K

    s.set_max_train_time(600 * 60.)
    #sg('svm_max_train_time', 600*60.)  # one hour should be plenty
    s.set_C(C, C)
    #sg('c', C)
    s.set_kernel(kernel)
    s.set_labels(labels_trn)
    #sg('init_kernel', 'TRAIN')
    try:
        s.train()
        #sg('train_classifier')
    except (RuntimeWarning, RuntimeError):  # can't train, e.g. all samples have the same labels
        Lprior = mean(Ltrn)
        prediction = sign(Lprior) * ones(len(Ltst))
        probabilities = 0.1 + 0.8 * 0.5 * (Lprior + 1.) * ones(len(Ltst))
        savetxt('./DAP/cvfold%d_C%g_%02d.txt' % (split, C, attribute_id), prediction)
        savetxt('./DAP/cvfold%d_C%g_%02d.prob' % (split, C, attribute_id), probabilities)
        savetxt('./DAP/cvfold%d_C%g_%02d.labels' % (split, C, attribute_id), Ltst)
        return prediction, probabilities, Ltst

    bias = s.get_bias()
    alphas = s.get_alphas()
    #[bias, alphas]=sg('get_svm')
    #print bias, alphas

    kernel.init(feats_trn, feats_val)
    K = kernel.get_kernel_matrix()
    K.tofile('/scratch/chl/cvfold%d_C%g_%02d-val.kernel' % (split, C, attribute_id))
    del K
    #sg('init_kernel', 'TEST')
    try:
        prediction = s.classify().get_labels()
        #prediction=sg('classify')
        platt_params = SigmoidTrain(prediction, Lplatt_val)
        probabilities = SigmoidPredict(prediction, platt_params)

        savetxt('./DAP/cvfold%d_C%g_%02d-val.txt' % (split, C, attribute_id), prediction)
        savetxt('./DAP/cvfold%d_C%g_%02d-val.prob' % (split, C, attribute_id), probabilities)
        savetxt('./DAP/cvfold%d_C%g_%02d-val.labels' % (split, C, attribute_id), Lplatt_val)
        savetxt('./DAP/cvfold%d_C%g_%02d-val.platt' % (split, C, attribute_id), platt_params)
        #print '#train-perf ', attribute_id, C, mean((prediction*Lplatt_val)>0), mean(Lplatt_val>0)
        #print '#platt-perf ', attribute_id, C, mean((sign(probabilities-0.5)*Lplatt_val)>0), mean(Lplatt_val>0)
    except RuntimeError:
        Lprior = mean(Ltrn)
        prediction = sign(Lprior) * ones(len(Ltst))
        probabilities = 0.1 + 0.8 * 0.5 * (Lprior + 1.) * ones(len(Ltst))
        print >> sys.stderr, "#Error during testing. Using constant platt scaling"
        platt_params = [1., 0.]

    # ----------------------------- now apply to test classes ------------------
    feats_tst = Features.CombinedFeatures()
    #sg('clean_features', 'TEST')
    for feature in all_features:
        feats_tst.append_feature_obj(Features.RealFeatures(ascontiguousarray(Xtst[feature])))
        del Xtst[feature]

    kernel.init(feats_trn, feats_tst)
    K = kernel.get_kernel_matrix()
    K.tofile('/scratch/chl/cvfold%d_C%g_%02d-tst.kernel' % (split, C, attribute_id))
    del K
    #sg('init_kernel', 'TEST')

    prediction = s.classify().get_labels()
    #prediction=sg('classify')
    probabilities = SigmoidPredict(prediction, platt_params)

    savetxt('./DAP/cvfold%d_C%g_%02d.txt' % (split, C, attribute_id), prediction)
    savetxt('./DAP/cvfold%d_C%g_%02d.prob' % (split, C, attribute_id), probabilities)
    savetxt('./DAP/cvfold%d_C%g_%02d.labels' % (split, C, attribute_id), Ltst)
    #print '#test-perf ', attribute_id, C, mean((prediction*Ltst)>0), mean(Ltst>0)
    #print '#platt-perf ', attribute_id, C, mean((sign(probabilities-0.5)*Ltst)>0), mean(Ltst>0)
    return prediction, probabilities, Ltst
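# SigmoidTrain/SigmoidPredict above are Platt-scaling helpers defined elsewhere
# in this code base. A rough sketch of what SigmoidPredict is assumed to
# compute, taking platt_params = [A, B] (an assumption for illustration, not
# the verified implementation):
#
#   def sigmoid_predict_sketch(decision_values, platt_params):
#       A, B = platt_params
#       # map raw SVM outputs to probabilities via Platt's sigmoid
#       return 1. / (1. + exp(A * decision_values + B))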