def _union_train(self, prepared_data, param):
    """
    perform inner training by processing the tree
    """

    normalizer = MultitaskKernelNormalizer(prepared_data.task_vector_nums)

    # set similarity
    for task_name_lhs in prepared_data.get_task_names():
        for task_name_rhs in prepared_data.get_task_names():

            similarity = 1.0

            normalizer.set_task_similarity(prepared_data.name_to_id(task_name_lhs),
                                           prepared_data.name_to_id(task_name_rhs),
                                           similarity)

    lab = shogun_factory.create_labels(prepared_data.labels)

    print "creating empty kernel"
    kernel = shogun_factory.create_kernel(prepared_data.examples, param)

    print "setting normalizer"
    kernel.set_normalizer(normalizer)
    kernel.init_normalizer()

    svm = shogun_factory.create_svm(param, kernel, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    # train SVM
    svm.train()

    return svm
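
# Note: with every pairwise task similarity fixed at 1.0, the multitask
# normalizer should leave the base kernel effectively unchanged, so this
# variant amounts to pooling (a "union" of) all tasks into one training set.
# Minimal numpy sketch of that weighting scheme; toy_km, task_of and gammas
# are made-up names for illustration only, not part of the code above:
import numpy

toy_km = numpy.array([[4.0, 1.0], [1.0, 3.0]])   # base kernel between two examples
task_of = [0, 1]                                  # task id of each example
gammas = numpy.ones((2, 2))                       # similarity = 1.0 for all task pairs

weighted_km = numpy.array([[gammas[task_of[i], task_of[j]] * toy_km[i, j]
                            for j in range(2)] for i in range(2)])
assert numpy.allclose(weighted_km, toy_km)        # union case: kernel is unchanged
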
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    for task_id in train_data.keys():
        print "task_id:", task_id

    root = param.taxonomy.data

    grey_nodes = [root]

    # top-down processing of taxonomy
    for node in root.get_leaves():

        #####################################################
        #    train predictor
        #####################################################

        parent_node = node.get_nearest_neighbor()

        cost = param.cost

        (examples, labels) = self.get_data(parent_node, train_data)

        # create shogun data objects
        k_parent = shogun_factory_new.create_kernel(examples, param)
        lab_parent = shogun_factory_new.create_labels(labels)

        parent_svm = SVMLight(cost, k_parent, lab_parent)
        parent_svm.train()

        #####################################################
        #    train predictors
        #####################################################

        (examples, labels) = self.get_data(node, train_data)

        # create shogun data objects
        k = shogun_factory_new.create_kernel(examples, param)
        lab = shogun_factory_new.create_labels(labels)

        # regularize vs parent predictor
        weight = param.transform
        print "current edge_weight:", weight, " ,name:", node.name

        svm = DomainAdaptationSVM(cost, k, lab, parent_svm, weight)
        svm.train()

        # attach svm to node
        node.predictor = svm

    #####################################################
    #    Wrap things up
    #####################################################

    # wrap up predictors for later use
    predictors = {}

    for leaf in root.get_leaves():
        predictors[leaf.name] = leaf.predictor
        assert(leaf.predictor != None)

    sym_diff_keys = set(train_data.keys()).symmetric_difference(set(predictors.keys()))
    assert len(sym_diff_keys) == 0, "symmetric difference between keys non-empty: " + str(sym_diff_keys)

    return predictors
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    assert(param.base_similarity >= 1)

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=False)

    # create shogun data objects
    base_wdk = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    # set normalizer
    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    # load data
    #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_pearson.txt")
    f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/All_PseudoSeq_Hamming.txt")
    #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_euklid.txt")
    #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_RAxML.txt")

    num_lines = int(f.readline().strip())
    task_distances = numpy.zeros((num_lines, num_lines))
    name_to_id = {}

    for (i, line) in enumerate(f):
        tokens = line.strip().split("\t")
        name = str(tokens[0])
        name_to_id[name] = i
        entry = numpy.array([v for (j, v) in enumerate(tokens) if j != 0])
        assert len(entry) == num_lines, "len_entry %i, num_lines %i" % (len(entry), num_lines)
        task_distances[i, :] = entry

    # cut relevant submatrix
    active_ids = [name_to_id[name] for name in data.get_task_names()]
    tmp_distances = task_distances[active_ids, :]
    tmp_distances = tmp_distances[:, active_ids]
    print "distances ", tmp_distances.shape

    # normalize distances
    task_distances = task_distances / numpy.max(tmp_distances)

    similarities = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))

    # convert distance to similarity
    for task_name_lhs in data.get_task_names():
        for task_name_rhs in data.get_task_names():

            # convert similarity with simple transformation
            similarity = param.base_similarity - task_distances[name_to_id[task_name_lhs], name_to_id[task_name_rhs]]
            normalizer.set_task_similarity(data.name_to_id(task_name_lhs),
                                           data.name_to_id(task_name_rhs),
                                           similarity)

            # save for later
            similarities[data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs)] = similarity

    # set normalizer
    base_wdk.set_normalizer(normalizer)
    base_wdk.init_normalizer()

    # set up svm
    svm = SVMLight(param.cost, base_wdk, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    # normalize cost
    norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
    norm_c_neg = param.cost / float(len([l for l in data.labels if l == -1]))
    svm.set_C(norm_c_neg, norm_c_pos)

    # start training
    svm.train()

    # save additional information
    self.additional_information["svm objective"] = svm.get_objective()
    self.additional_information["num sv"] = svm.get_num_support_vectors()
    #self.additional_information["distances"] = distances
    self.additional_information["similarities"] = similarities

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in data.get_task_names():

        task_num = data.name_to_id(task_name)

        # save svm and task_num
        svms[task_name] = (task_num, param, svm)

    return svms
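
# The "simple transformation" above maps a task distance (scaled to [0, 1] by
# dividing by the maximum over the active tasks) to a similarity via
# similarity = base_similarity - distance.  Tiny sketch with made-up numbers;
# base_sim and toy_distances are illustrative values, not the real data files:
import numpy

base_sim = 1.0
toy_distances = numpy.array([[0.0, 0.4], [0.4, 0.0]])
toy_distances = toy_distances / numpy.max(toy_distances)   # scale to [0, 1]
toy_similarities = base_sim - toy_distances                 # identical tasks -> base_sim
assert toy_similarities[0, 0] == base_sim
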
def test_data():

    ##################################################################
    # select MSS
    ##################################################################

    mss = expenv.MultiSplitSet.get(379)

    ##################################################################
    # data
    ##################################################################

    # fetch data
    instance_set = mss.get_train_data(-1)

    # prepare data
    data = PreparedMultitaskData(instance_set, shuffle=True)

    # set parameters
    param = Options()
    param.kernel = "WeightedDegreeStringKernel"
    param.wdk_degree = 4
    param.cost = 1.0
    param.transform = 1.0
    param.id = 666
    param.freeze()

    ##################################################################
    # taxonomy
    ##################################################################

    taxonomy = shogun_factory.create_taxonomy(mss.taxonomy.data)

    support = numpy.linspace(0, 100, 4)

    distances = [[0, 1, 2, 2], [1, 0, 2, 2], [2, 2, 0, 1], [2, 2, 1, 0]]

    # create tree normalizer
    tree_normalizer = MultitaskKernelPlifNormalizer(support, data.task_vector_names)

    task_names = data.get_task_names()

    FACTOR = 1.0

    # init gamma matrix
    gammas = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))

    for t1_name in task_names:
        for t2_name in task_names:
            similarity = taxonomy.compute_node_similarity(taxonomy.get_id(t1_name),
                                                          taxonomy.get_id(t2_name))
            gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)] = similarity

    helper.save("/tmp/gammas", gammas)

    gammas = gammas * FACTOR

    cost = param.cost * numpy.sqrt(FACTOR)

    print gammas

    ##########
    # regular normalizer

    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    for t1_name in task_names:
        for t2_name in task_names:
            similarity = gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)]
            normalizer.set_task_similarity(data.name_to_id(t1_name),
                                           data.name_to_id(t2_name),
                                           similarity)

    ##################################################################
    # Train SVMs
    ##################################################################

    # create shogun objects
    wdk_tree = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    wdk_tree.set_normalizer(tree_normalizer)
    wdk_tree.init_normalizer()

    print "--->", wdk_tree.get_normalizer().get_name()

    svm_tree = SVMLight(cost, wdk_tree, lab)
    svm_tree.set_linadd_enabled(False)
    svm_tree.set_batch_computation_enabled(False)

    svm_tree.train()

    del wdk_tree
    del tree_normalizer

    print "finished training tree-norm SVM:", svm_tree.get_objective()

    wdk = shogun_factory.create_kernel(data.examples, param)
    wdk.set_normalizer(normalizer)
    wdk.init_normalizer()

    print "--->", wdk.get_normalizer().get_name()

    svm = SVMLight(cost, wdk, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    svm.train()

    print "finished training manually set SVM:", svm.get_objective()

    alphas_tree = svm_tree.get_alphas()
    alphas = svm.get_alphas()

    assert(len(alphas_tree) == len(alphas))

    for i in xrange(len(alphas)):
        assert(abs(alphas_tree[i] - alphas[i]) < 0.0001)

    print "success: all alphas are the same"
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=False)

    # create shogun data objects
    base_wdk = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    # support
    support = numpy.linspace(0, 1, 5)

    # set normalizer
    normalizer = MultitaskKernelPlifNormalizer(support, data.task_vector_nums)

    # fetch taxonomy from parameter object
    taxonomy = param.taxonomy.data

    taxonomy.plot()
    import os
    os.system("evince demo.png &")

    # compute distances
    distances = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))

    for (i, task_name_lhs) in enumerate(data.get_task_names()):
        for (j, task_name_rhs) in enumerate(data.get_task_names()):
            distances[i, j] = task_similarities.compute_hop_distance(taxonomy, task_name_lhs, task_name_rhs)

    # normalize distances
    distances = distances / numpy.max(distances)

    # set distances
    for (i, task_name_lhs) in enumerate(data.get_task_names()):
        for (j, task_name_rhs) in enumerate(data.get_task_names()):
            normalizer.set_task_distance(i, j, distances[i, j])

    # assign normalizer
    base_wdk.set_normalizer(normalizer)
    base_wdk.init_normalizer()

    svm = None

    debug_weights = {}

    num_subk = base_wdk.get_num_subkernels()

    print "num subkernels:", num_subk
    #print "subkernel weights:", base_wdk.get_subkernel_weights()
    debug_weights["before"] = [normalizer.get_beta(i) for i in range(num_subk)]

    print "using MKL:", (param.transform >= 1.0)

    if param.transform >= 1.0:

        num_threads = 4

        svm = MKLClassification()
        svm.set_mkl_norm(param.transform)
        #svm.set_solver_type(ST_CPLEX) #GLPK) #DIRECT) #NEWTON)#ST_CPLEX)

        svm.set_C(param.cost, param.cost)
        svm.set_kernel(base_wdk)
        svm.set_labels(lab)
        svm.parallel.set_num_threads(num_threads)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        svm.train()

        #print "subkernel weights (after):", base_wdk.get_subkernel_weights()

    else:

        # create SVM (disable unsupported optimizations)
        svm = SVMLight(param.cost, base_wdk, lab)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        svm.train()

    print "svm objective:", svm.get_objective()

    debug_weights["after"] = [normalizer.get_beta(i) for i in range(num_subk)]

    # debugging output
    print "debug weights (before/after):"
    print debug_weights["before"]
    print debug_weights["after"]
    print ""

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in train_data.keys():
        svms[task_name] = (svm, data.name_to_id(task_name))

    return svms
def _inner_train(self, prepared_data, param):
    """
    perform inner training by processing the tree
    """

    # init seq handler
    classifiers = []

    #################
    # mtk
    normalizer = MultitaskKernelNormalizer(prepared_data.task_vector_nums)

    from method_mhc_rbf import SequencesHandlerRbf
    task_kernel = SequencesHandlerRbf(1, param.base_similarity,
                                      prepared_data.get_task_names(),
                                      param.flags["wdk_rbf_on"])

    # set similarity
    for task_name_lhs in prepared_data.get_task_names():
        for task_name_rhs in prepared_data.get_task_names():

            similarity = task_kernel.get_similarity(task_name_lhs, task_name_rhs)

            normalizer.set_task_similarity(prepared_data.name_to_id(task_name_lhs),
                                           prepared_data.name_to_id(task_name_rhs),
                                           similarity)

    lab = shogun_factory.create_labels(prepared_data.labels)

    print "creating empty kernel"
    kernel = shogun_factory.create_kernel(prepared_data.examples, param)

    print "setting normalizer"
    kernel.set_normalizer(normalizer)
    kernel.init_normalizer()

    svm = shogun_factory.create_svm(param, kernel, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    # train SVM
    svm.train()

    classifiers.append(svm)

    #################
    # dirac
    #import pdb
    #pdb.set_trace()

    svm_dirac = self._dirac_train(prepared_data, param)

    classifiers.append(svm_dirac)

    ## union
    #svm_union = self._union_train(prepared_data, param)
    #classifiers.append(svm_union)

    return classifiers
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=False)

    # create shogun data objects
    base_wdk = shogun_factory.create_kernel(data.examples, param)
    kernel_matrix = base_wdk.get_kernel_matrix()
    lab = shogun_factory.create_labels(data.labels)

    # fetch taxonomy from parameter object
    taxonomy = param.taxonomy.data

    # create name to leaf map
    nodes = taxonomy.get_all_nodes()

    ########################################################
    print "creating a kernel for each node:"
    ########################################################

    # assemble combined kernel
    from shogun.Kernel import CombinedKernel, CustomKernel
    combined_kernel = CombinedKernel()

    # indicator to which task each example belongs
    task_vector = data.task_vector_names

    for node in nodes:

        print "creating kernel for ", node.name

        # fetch sub-tree
        leaf_names = [leaf.name for leaf in node.get_leaves()]

        print "masking all entries other than:", leaf_names

        # init matrix
        kernel_matrix_node = numpy.zeros(kernel_matrix.shape)

        # fill matrix for node
        for (i, task_lhs) in enumerate(task_vector):
            for (j, task_rhs) in enumerate(task_vector):

                # only copy values, if both tasks are present in subtree
                if task_lhs in leaf_names and task_rhs in leaf_names:
                    kernel_matrix_node[i, j] = kernel_matrix[i, j]

        # create custom kernel
        kernel_node = CustomKernel()
        kernel_node.set_full_kernel_matrix_from_full(kernel_matrix_node)

        # append custom kernel to CombinedKernel
        combined_kernel.append_kernel(kernel_node)

        print "------"

    print "subkernel weights:", combined_kernel.get_subkernel_weights()

    svm = None

    print "using MKL:", (param.transform >= 1.0)

    if param.transform >= 1.0:

        num_threads = 4

        svm = MKLClassification()
        svm.set_mkl_norm(param.transform)
        svm.set_solver_type(ST_GLPK)  #DIRECT) #NEWTON)#ST_CPLEX)

        svm.set_C(param.cost, param.cost)
        svm.set_kernel(combined_kernel)
        svm.set_labels(lab)
        svm.parallel.set_num_threads(num_threads)
        #svm.set_linadd_enabled(False)
        #svm.set_batch_computation_enabled(False)

        svm.train()

        print "subkernel weights (after):", combined_kernel.get_subkernel_weights()

    else:

        # create SVM (disable unsupported optimizations)
        svm = SVMLight(param.cost, combined_kernel, lab)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        svm.train()

    ########################################################
    print "svm objective:"
    print svm.get_objective()
    ########################################################

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_id in train_data.keys():
        svms[task_id] = svm

    return svms
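
# The per-node masking above keeps K[i, j] only when both examples belong to
# tasks below that node.  An equivalent formulation is to multiply the base
# kernel element-wise with the outer product of a 0/1 membership indicator.
# Illustration only; toy_km, member and node_mask are hypothetical names:
import numpy

toy_km = numpy.arange(9, dtype=float).reshape((3, 3))
member = numpy.array([1.0, 1.0, 0.0])            # examples 0 and 1 lie under the node
node_mask = numpy.outer(member, member)          # 1 where both examples are covered
masked_km = node_mask * toy_km                   # same result as the nested loops
assert masked_km[2, 2] == 0.0 and masked_km[0, 1] == toy_km[0, 1]
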
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    assert(param.base_similarity >= 1)

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=False)

    # create shogun data objects
    base_wdk = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    # create normalizer
    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    # load hard-coded task-similarity
    task_similarity = helper.load("/fml/ag-raetsch/home/cwidmer/svn/projects/alt_splice_code/src/task_sim_tis.bz2")

    # set similarity
    similarities = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))

    for (i, task_name_lhs) in enumerate(data.get_task_names()):

        #max_value_row = max(task_similarity.get_row(task_name_lhs))
        max_value_row = 1.0

        for (j, task_name_rhs) in enumerate(data.get_task_names()):

            similarity = task_similarity.get_value(task_name_lhs, task_name_rhs) / max_value_row
            normalizer.set_task_similarity(i, j, similarity)
            similarities[i, j] = similarity

    pprint.pprint(similarities)

    # set normalizer
    #print "WARNING MTK disabled!!!!!!!!!!!!!!!!!!!!!"
    base_wdk.set_normalizer(normalizer)
    base_wdk.init_normalizer()

    # set up svm
    param.flags["svm_type"] = "svmlight"  # fix svm type

    svm = shogun_factory.create_svm(param, base_wdk, lab)

    # make sure these parameters are set correctly
    #print "WARNING MTK WONT WORK WITH THESE SETTINGS!!!!!!!!!!!!!!!!!!!!!"
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    assert svm.get_linadd_enabled() == False, "linadd should be disabled"
    assert svm.get_batch_computation_enabled() == False, "batch compute should be disabled"

    # start training
    svm.train()

    # save additional information
    self.additional_information["svm objective"] = svm.get_objective()
    self.additional_information["num sv"] = svm.get_num_support_vectors()
    #self.additional_information["distances"] = distances
    self.additional_information["similarities"] = similarities

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in data.get_task_names():

        task_num = data.name_to_id(task_name)

        # save svm and task_num
        svms[task_name] = (task_num, svm)

    return svms
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=False)

    # create shogun data objects
    base_wdk = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    # fetch taxonomy from parameter object
    taxonomy = shogun_factory.create_taxonomy(param.taxonomy.data)

    # set normalizer
    normalizer = MultitaskKernelTreeNormalizer(data.task_vector_names,
                                               data.task_vector_names,
                                               taxonomy)

    ########################################################
    gammas = self.taxonomy_to_gammas(data, taxonomy)
    print "gammas before MKL:"
    print gammas
    ########################################################

    base_wdk.set_normalizer(normalizer)
    base_wdk.init_normalizer()

    svm = None

    num_subk = base_wdk.get_num_subkernels()

    print "num subkernels:", num_subk
    #print "subkernel weights:", base_wdk.get_subkernel_weights()
    self.additional_information["weights_before"] = [normalizer.get_beta(i) for i in range(num_subk)]

    print "using MKL:", (param.transform >= 1.0)

    if param.transform >= 1.0:

        num_threads = 4

        svm = MKLClassification()
        svm.set_mkl_norm(param.transform)
        #svm.set_solver_type(ST_CPLEX) #GLPK) #DIRECT) #NEWTON)#ST_CPLEX)

        svm.set_kernel(base_wdk)
        svm.set_labels(lab)
        svm.parallel.set_num_threads(num_threads)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        if param.flags["normalize_cost"]:
            # normalize cost
            norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
            norm_c_neg = param.cost / float(len([l for l in data.labels if l == -1]))
            svm.set_C(norm_c_neg, norm_c_pos)
        else:
            svm.set_C(param.cost, param.cost)

        svm.train()

        #print "subkernel weights (after):", base_wdk.get_subkernel_weights()

    else:

        # create SVM (disable unsupported optimizations)
        svm = SVMLight(param.cost, base_wdk, lab)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        svm.train()

    print "svm objective:", svm.get_objective()

    self.additional_information["weights"] = [normalizer.get_beta(i) for i in range(num_subk)]
    self.additional_information["gammas"] = self.taxonomy_to_gammas(data, taxonomy)

    print "debug weights:"
    print self.additional_information
    print ""

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_id in train_data.keys():
        svms[task_id] = svm

    return svms
def __init__(self, degree, sigma, active_set, wdk_rbf_on):
    '''
    loads data into handler
    '''

    self.active_set = active_set

    fn = "/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHCsequenzen/pseudo.txt"

    tmp_key = ""
    tmp_idx = 0

    self.seqs = []
    self.keys = []
    self.name_to_id = {}

    # parse file
    for line in file(fn):

        if line.startswith(">"):
            tmp_key = line.strip()[1:]
        else:
            if active_set.count(tmp_key) > 0:
                assert self.keys.count(tmp_key) == 0, "key %s is already contained in self.keys" % (tmp_key)
                self.seqs.append(line.strip())
                self.keys.append(tmp_key)
                self.name_to_id[tmp_key] = tmp_idx
                tmp_idx += 1

    assert len(self.seqs) == tmp_idx, "incorrect number of sequences %i != %i" % (len(self.seqs), tmp_idx)
    assert len(self.keys) == tmp_idx, "incorrect number of keys %i != %i" % (len(self.keys), tmp_idx)

    # setup kernel
    param = Options()
    if wdk_rbf_on:
        param.kernel = "WeightedDegreeRBFKernel"
    else:
        param.kernel = "WeightedDegreeStringKernel"
    param.wdk_degree = degree
    param.transform = sigma

    self.kernel = shogun_factory.create_kernel(self.seqs, param)

    #######################
    # compute kernel
    #######################

    num_tasks = len(self.seqs)

    self.similarity = numpy.zeros((num_tasks, num_tasks))

    for i in xrange(num_tasks):
        for j in xrange(num_tasks):
            self.similarity[i, j] = self.kernel.kernel(i, j)

    # normalize kernel
    my_min = numpy.min(self.similarity)
    my_max = numpy.max(self.similarity)
    my_diff = my_max - my_min

    # scale to interval [0,1]
    #self.similarity = (self.similarity - my_min) / my_diff
    self.similarity = (self.similarity) / my_max

    print self.similarity
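
# The training routines above call task_kernel.get_similarity(name_lhs, name_rhs)
# on this handler.  That method is not shown in this section; a plausible lookup
# against the precomputed, max-normalized matrix could look like the sketch below
# (an assumption for illustration, not the project's actual implementation):
def get_similarity_sketch(handler, task_name_lhs, task_name_rhs):
    """look up the precomputed task-to-task similarity by task name"""
    idx_lhs = handler.name_to_id[task_name_lhs]
    idx_rhs = handler.name_to_id[task_name_rhs]
    return handler.similarity[idx_lhs, idx_rhs]
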
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=False)

    # create shogun data objects
    base_wdk = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    # set normalizer
    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    ########################################################
    print "creating a kernel for each node:"
    ########################################################

    # init seq handler
    task_kernel = SequencesHandlerRbf(1, param.base_similarity,
                                      data.get_task_names(),
                                      param.flags["wdk_rbf_on"])

    similarities = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))

    # convert distance to similarity
    for task_name_lhs in data.get_task_names():
        for task_name_rhs in data.get_task_names():

            # convert similarity with simple transformation
            similarity = task_kernel.get_similarity(task_name_lhs, task_name_rhs)

            print similarity
            print "similarity (%s,%s)=%f" % (task_name_lhs, task_name_rhs, similarity)

            normalizer.set_task_similarity(data.name_to_id(task_name_lhs),
                                           data.name_to_id(task_name_rhs),
                                           similarity)

            # save for later
            similarities[data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs)] = similarity

    # set normalizer
    base_wdk.set_normalizer(normalizer)
    base_wdk.init_normalizer()

    # set up svm
    svm = SVMLight(param.cost, base_wdk, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    # normalize cost
    norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
    norm_c_neg = param.cost / float(len([l for l in data.labels if l == -1]))
    svm.set_C(norm_c_neg, norm_c_pos)

    # start training
    svm.train()

    # save additional information
    self.additional_information["svm objective"] = svm.get_objective()
    self.additional_information["num sv"] = svm.get_num_support_vectors()
    #self.additional_information["distances"] = distances
    self.additional_information["similarities"] = similarities

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in data.get_task_names():

        task_num = data.name_to_id(task_name)

        # save svm and task_num
        svms[task_name] = (task_num, param, svm)

    return svms
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    for task_id in train_data.keys():
        print "task_id:", task_id

    root = param.taxonomy.data

    grey_nodes = [root]

    # top-down processing of taxonomy
    while len(grey_nodes) > 0:

        node = grey_nodes.pop(0)  # pop first item

        # enqueue children
        if node.children != None:
            grey_nodes.extend(node.children)

        #####################################################
        #    init data structures
        #####################################################

        # get data below current node
        data = [train_data[key] for key in node.get_data_keys()]

        print "data at current level"
        for instance_set in data:
            print instance_set[0].dataset

        # initialize containers
        examples = []
        labels = []

        # concatenate data
        for instance_set in data:

            print "train split_set:", instance_set[0].dataset.organism

            for inst in instance_set:
                examples.append(inst.example)
                labels.append(inst.label)

        # create shogun data objects
        k = shogun_factory_new.create_kernel(examples, param)
        lab = shogun_factory_new.create_labels(labels)

        cost = param.cost
        #cost = node.cost
        print "using cost:", cost

        #####################################################
        #    train predictors
        #####################################################

        # init predictor variable
        svm = None

        # set up SVM
        if node.is_root():

            print "training svm at top level"
            svm = SVMLight(cost, k, lab)

        else:

            # regularize vs parent predictor
            #weight = node.edge_weight
            weight = param.transform
            print "current edge_weight:", weight, " ,name:", node.name

            parent_svm = node.parent.predictor

            svm = DomainAdaptationSVM(cost, k, lab, parent_svm, weight)
            #svm.set_train_factor(param.base_similarity)

        if param.flags["normalize_cost"]:

            norm_c_pos = param.cost / float(len([l for l in lab.get_labels() if l == 1]))
            norm_c_neg = param.cost / float(len([l for l in lab.get_labels() if l == -1]))
            svm.set_C(norm_c_neg, norm_c_pos)

        # set epsilon
        if param.flags.has_key("epsilon"):
            svm.set_epsilon(param.flags["epsilon"])

        # enable output
        svm.io.enable_progress()
        svm.io.set_loglevel(shogun.Classifier.MSG_INFO)

        svm.set_train_factor(param.flags["train_factor"])
        svm.train()

        # attach svm to node
        node.predictor = svm

        # save some information
        self.additional_information[node.name + " svm obj"] = svm.get_objective()
        self.additional_information[node.name + " svm num sv"] = svm.get_num_support_vectors()
        self.additional_information[node.name + " runtime"] = svm.get_runtime()

    #####################################################
    #    Wrap things up
    #####################################################

    # wrap up predictors for later use
    predictors = {}

    for leaf in root.get_leaves():
        predictors[leaf.name] = leaf.predictor
        assert(leaf.predictor != None)

    # make sure we have the same keys (potentially in a different order)
    sym_diff_keys = set(train_data.keys()).symmetric_difference(set(predictors.keys()))
    assert len(sym_diff_keys) == 0, "symmetric difference between keys non-empty: " + str(sym_diff_keys)

    # save graph plot
    mypath = "/fml/ag-raetsch/share/projects/multitask/graphs/"
    filename = mypath + "graph_" + str(param.id)
    root.plot(filename)  #, plot_cost=True, plot_B=True)

    return predictors
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    # fix dimensions
    M = len(train_data)

    N = 0
    for key in train_data.keys():
        N += len(train_data[key])

    # init containers
    examples = []
    labels = []

    # vector to indicate to which task each example belongs
    task_vector = []
    task_num = 0
    tmp_examples = 0

    label_matrix = numpy.zeros((M, N))

    # extract training data
    for (task_id, instance_set) in train_data.items():

        print "train task id:", task_id
        #assert(instance_set[0].dataset.organism==task_id)

        examples.extend([inst.example for inst in instance_set])

        tmp_labels = [inst.label for inst in instance_set]
        labels.extend(tmp_labels)

        begin_idx = tmp_examples
        end_idx = tmp_examples + len(tmp_labels)

        # fill matrix row
        label_matrix[task_num, begin_idx:end_idx] = tmp_labels

        task_vector.extend([task_num] * len(instance_set))

        task_num += 1
        tmp_examples += len(tmp_labels)

    # fetch gammas from parameter object
    # TODO: compute gammas outside of this
    gammas = numpy.ones((M, M)) + numpy.eye(M)
    #gammas = numpy.eye(M)

    # create kernel
    kernel = shogun_factory.create_kernel(examples, param)

    y = numpy.array(labels)

    print "computing kernel matrix"

    km = kernel.get_kernel_matrix()
    km = reweight_kernel_matrix(km, gammas, task_vector)

    # "add" labels to Q-matrix
    km = numpy.transpose(y.flatten() * (km * y.flatten()).transpose())

    print "done computing kernel matrix, calling solver"

    f = -numpy.ones(N)
    b = numpy.zeros((M, 1))

    # set up QP
    p = QP(km, f, Aeq=label_matrix, beq=b, lb=numpy.zeros(N), ub=param.cost * numpy.ones(N))
    p.debug = 1

    # run solver
    r = p.solve('cvxopt_qp', iprint=0)

    print "done with training"

    alphas = r.xf
    objective = r.ff

    print "alphas:", alphas

    predictors = {}

    for (k, task_id) in enumerate(train_data.keys()):

        # pack all relevant information in predictor
        predictors[task_id] = (alphas, param, task_vector, k, gammas, examples, labels)

    return predictors
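
# reweight_kernel_matrix is used above but not defined in this section.  Given
# how gammas is built (ones plus identity), it presumably scales each kernel
# entry by the gamma of the two tasks the corresponding examples belong to.
# Hypothetical sketch under that assumption (not the project's actual code):
import numpy

def reweight_kernel_matrix_sketch(km, gammas, task_vector):
    """scale km[i, j] by gammas[task_i, task_j] for the tasks of examples i and j"""
    N = km.shape[0]
    reweighted = numpy.zeros(km.shape)
    for i in xrange(N):
        for j in xrange(N):
            reweighted[i, j] = gammas[task_vector[i], task_vector[j]] * km[i, j]
    return reweighted
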
def _inner_train(self, train_data, param):
    """
    perform inner training by processing the tree
    """

    data_keys = []

    # top-down processing of taxonomy
    classifiers = []
    classifier_at_node = {}

    root = param.taxonomy.data

    grey_nodes = [root]

    while len(grey_nodes) > 0:

        node = grey_nodes.pop(0)  # pop first item

        # enqueue children
        if node.children != None:
            grey_nodes.extend(node.children)

        #####################################################
        #    init data structures
        #####################################################

        # get data below current node
        data = [train_data[key] for key in node.get_data_keys()]

        data_keys.append(node.get_data_keys())

        print "data at current level"
        for instance_set in data:
            print instance_set[0].dataset

        # initialize containers
        examples = []
        labels = []

        # concatenate data
        for instance_set in data:

            print "train split_set:", instance_set[0].dataset.organism

            for inst in instance_set:
                examples.append(inst.example)
                labels.append(inst.label)

        # create shogun data objects
        k = shogun_factory.create_kernel(examples, param)
        lab = shogun_factory.create_labels(labels)

        #####################################################
        #    train weak learners
        #####################################################

        cost = param.cost

        # set up svm
        svm = SVMLight(cost, k, lab)

        if param.flags["normalize_cost"]:
            # set class-specific Cs
            norm_c_pos = param.cost / float(len([l for l in labels if l == 1]))
            norm_c_neg = param.cost / float(len([l for l in labels if l == -1]))
            svm.set_C(norm_c_neg, norm_c_pos)

            print "using cost: negative class=%f, positive class=%f" % (norm_c_neg, norm_c_pos)

        # enable output
        svm.io.enable_progress()
        svm.io.set_loglevel(shogun.Classifier.MSG_INFO)

        # train
        svm.train()

        # append svm object
        classifiers.append(svm)
        classifier_at_node[node.name] = svm

        # save some information
        self.additional_information[node.name + " svm obj"] = svm.get_objective()
        self.additional_information[node.name + " svm num sv"] = svm.get_num_support_vectors()
        self.additional_information[node.name + " runtime"] = svm.get_runtime()

    return (classifiers, classifier_at_node)
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    # split for training weak_learners and boosting
    (train_weak, train_boosting) = split_data(train_data, 4)

    # merge data sets
    data = PreparedMultitaskData(train_weak, shuffle=True)

    # create shogun label
    lab = shogun_factory.create_labels(data.labels)

    ##################################################
    # define pockets
    ##################################################

    pockets = [0] * 9

    pockets[0] = [1, 5, 6, 7, 8, 31, 32, 33, 34]
    pockets[1] = [1, 2, 3, 4, 6, 7, 8, 9, 11, 21, 31]
    pockets[2] = [11, 20, 21, 22, 29, 31]
    pockets[3] = [8, 30, 31, 32]
    pockets[4] = [10, 11, 30]
    pockets[5] = [10, 11, 12, 13, 20, 29]
    pockets[6] = [10, 12, 20, 22, 26, 27, 28, 29]
    pockets[7] = [12, 14, 15, 26]
    pockets[8] = [13, 15, 16, 17, 18, 19, 20, 23, 24, 25, 26]

    pockets = []
    for i in xrange(35):
        pockets.append([i])

    #new_pockets = []

    # merge neighboring pockets
    #for i in range(8):
    #    new_pockets.append(list(set(pockets[i]).union(set(pockets[i+1]))))

    #pockets = new_pockets

    ########################################################
    print "creating a kernel:"
    ########################################################

    # init seq handler
    pseudoseqs = SequencesHandler()

    classifiers = []

    for pocket in pockets:

        print "creating normalizer"
        #import pdb
        #pdb.set_trace()

        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

        print "processing pocket", pocket

        # set similarity
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():

                similarity = 0.0

                for pseudo_seq_pos in pocket:
                    similarity += float(pseudoseqs.get_similarity(task_name_lhs, task_name_rhs, pseudo_seq_pos - 1))

                # normalize
                similarity = similarity / float(len(pocket))

                print "pocket %s (%s, %s) = %f" % (str(pocket), task_name_lhs, task_name_rhs, similarity)

                normalizer.set_task_similarity(data.name_to_id(task_name_lhs),
                                               data.name_to_id(task_name_rhs),
                                               similarity)

        print "creating empty kernel"
        kernel = shogun_factory.create_kernel(data.examples, param)

        print "setting normalizer"
        kernel.set_normalizer(normalizer)

        print "training SVM for pocket", pocket
        svm = self._train_single_svm(param, kernel, lab)

        classifiers.append(svm)

    print "done obtaining weak learners"

    # save additional info
    #self.additional_information["svm_objective"] = svm.get_objective()
    #self.additional_information["svm num sv"] = svm.get_num_support_vectors()
    #self.additional_information["post_weights"] = combined_kernel.get_subkernel_weights()
    #print self.additional_information

    ##################################################
    # combine weak learners for each task
    ##################################################

    # set constants
    some = 0.9
    import cvxmod

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in train_boosting.keys():

        instances = train_boosting[task_name]

        N = len(instances)
        F = len(pockets)

        examples = [inst.example for inst in instances]
        labels = [inst.label for inst in instances]

        # dim = (N x F)
        out = cvxmod.zeros((N, F))

        for i in xrange(F):
            svm = classifiers[i]
            tmp_out = self._predict_weak(svm, examples, data.name_to_id(task_name))

            out[:, i] = numpy.sign(tmp_out)
            #out[:, i] = tmp_out

        #TODO: fix
        helper.save("/tmp/out_sparse", (out, labels))
        pdb.set_trace()

        weights = solve_boosting(out, labels, some, solver="mosek")

        svms[task_name] = (data.name_to_id(task_name), svm)

    return svms
def _inner_train(self, prepared_data, param):
    """
    perform inner training by processing the tree
    """

    # init seq handler
    pseudoseqs = SequencesHandler()

    classifiers = []

    for pocket in self.get_pockets(param.flags["all_positions"]):

        print "creating normalizer"
        #import pdb
        #pdb.set_trace()

        normalizer = MultitaskKernelNormalizer(prepared_data.task_vector_nums)

        from method_mhc_rbf import SequencesHandlerRbf
        task_kernel = SequencesHandlerRbf(1, param.base_similarity,
                                          prepared_data.get_task_names(),
                                          param.flags["wdk_rbf_on"])

        print "processing pocket", pocket

        M = prepared_data.get_num_tasks()
        save_sim_p = numpy.zeros((M, M))
        save_sim_t = numpy.zeros((M, M))

        # set similarity
        for task_name_lhs in prepared_data.get_task_names():
            for task_name_rhs in prepared_data.get_task_names():

                similarity = 0.0

                for pseudo_seq_pos in pocket:
                    similarity += float(pseudoseqs.get_similarity(task_name_lhs, task_name_rhs, pseudo_seq_pos - 1))

                # normalize
                similarity = similarity / float(len(pocket))

                similarity_task = task_kernel.get_similarity(task_name_lhs, task_name_rhs)

                print "pocket %s (%s, %s) = %f" % (str(pocket), task_name_lhs, task_name_rhs, similarity)

                normalizer.set_task_similarity(prepared_data.name_to_id(task_name_lhs),
                                               prepared_data.name_to_id(task_name_rhs),
                                               similarity)

                save_sim_p[prepared_data.name_to_id(task_name_lhs), prepared_data.name_to_id(task_name_rhs)] = similarity
                save_sim_t[prepared_data.name_to_id(task_name_lhs), prepared_data.name_to_id(task_name_rhs)] = similarity_task

        #from IPython.Shell import IPShellEmbed
        #IPShellEmbed([])()

        lab = shogun_factory.create_labels(prepared_data.labels)

        print "creating empty kernel"
        kernel = shogun_factory.create_kernel(prepared_data.examples, param)

        print "setting normalizer"
        kernel.set_normalizer(normalizer)
        kernel.init_normalizer()

        print "training SVM for pocket", pocket
        svm = shogun_factory.create_svm(param, kernel, lab)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        # train SVM
        svm.train()

        #import pdb
        #pdb.set_trace()

        classifiers.append(svm)

    return classifiers
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    for task_id in train_data.keys():
        print "task_id:", task_id

    root = param.taxonomy.data

    grey_nodes = [root]

    # top-down processing of taxonomy
    while len(grey_nodes) > 0:

        node = grey_nodes.pop(0)  # pop first item

        # enqueue children
        if node.children != None:
            grey_nodes.extend(node.children)

        #####################################################
        #    init data structures
        #####################################################

        # get data below current node
        data = [train_data[key] for key in node.get_data_keys()]

        print "data at current level"
        for instance_set in data:
            print instance_set[0].dataset

        # initialize containers
        examples = []
        labels = []

        # concatenate data
        for instance_set in data:

            print "train split_set:", instance_set[0].dataset.organism

            for inst in instance_set:
                examples.append(inst.example)
                labels.append(inst.label)

        # create shogun data objects
        k = shogun_factory_new.create_kernel(examples, param)
        lab = shogun_factory_new.create_labels(labels)

        cost = param.cost
        #cost = node.cost
        print "using cost:", cost

        #####################################################
        #    train predictors
        #####################################################

        # init predictor variable
        svm = None

        # set up SVM
        if node.is_root():
            print "training svm at top level"
            svm = SVMLight(cost, k, lab)
        else:
            # regularize vs parent predictor
            #weight = node.edge_weight
            weight = param.transform
            print "current edge_weight:", weight, ", name:", node.name

            parent_svm = node.parent.predictor

            svm = DomainAdaptationSVM(cost, k, lab, parent_svm, weight)
            #svm.set_train_factor(param.base_similarity)

        if param.flags["normalize_cost"]:
            norm_c_pos = param.cost / float(len([l for l in lab.get_labels() if l == 1]))
            norm_c_neg = param.cost / float(len([l for l in lab.get_labels() if l == -1]))
            svm.set_C(norm_c_neg, norm_c_pos)

        # set epsilon
        if param.flags.has_key("epsilon"):
            svm.set_epsilon(param.flags["epsilon"])

        # enable output
        svm.io.enable_progress()
        svm.io.set_loglevel(shogun.Classifier.MSG_INFO)

        svm.set_train_factor(param.flags["train_factor"])
        svm.train()

        # attach svm to node
        node.predictor = svm

        # save some information
        self.additional_information[node.name + " svm obj"] = svm.get_objective()
        self.additional_information[node.name + " svm num sv"] = svm.get_num_support_vectors()
        self.additional_information[node.name + " runtime"] = svm.get_runtime()

    #####################################################
    #    Wrap things up
    #####################################################

    # wrap up predictors for later use
    predictors = {}

    for leaf in root.get_leaves():
        predictors[leaf.name] = leaf.predictor
        assert(leaf.predictor != None)

    # make sure we have the same keys (potentially in a different order)
    sym_diff_keys = set(train_data.keys()).symmetric_difference(set(predictors.keys()))
    assert len(sym_diff_keys) == 0, "symmetric difference between keys non-empty: " + str(sym_diff_keys)

    # save graph plot
    mypath = "/fml/ag-raetsch/share/projects/multitask/graphs/"
    filename = mypath + "graph_" + str(param.id)
    root.plot(filename)#, plot_cost=True, plot_B=True)

    return predictors
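# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the top-down taxonomy
# traversal used above, isolated with a minimal stand-in node class. The real
# nodes come from param.taxonomy; only a `children` attribute is assumed here.
# --------------------------------------------------------------------------
class _Node(object):
    """ minimal stand-in for a taxonomy node """
    def __init__(self, name, children=None):
        self.name = name
        self.children = children

def breadth_first(root):
    """ yield nodes top-down, so a parent's predictor exists before its children are trained """
    grey_nodes = [root]
    while len(grey_nodes) > 0:
        node = grey_nodes.pop(0)  # pop first item
        # enqueue children so they are processed after their parent
        if node.children != None:
            grey_nodes.extend(node.children)
        yield node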
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=False)

    # create shogun data objects
    base_wdk = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    # set normalizer
    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    ########################################################
    print "creating a kernel for each node:"
    ########################################################

    # init seq handler
    task_kernel = SequencesHandlerRbf(1, param.base_similarity, data.get_task_names(), param.flags["wdk_rbf_on"])

    similarities = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))

    # convert distance to similarity
    for task_name_lhs in data.get_task_names():
        for task_name_rhs in data.get_task_names():

            # convert similarity with simple transformation
            similarity = task_kernel.get_similarity(task_name_lhs, task_name_rhs)

            print similarity
            print "similarity (%s,%s)=%f" % (task_name_lhs, task_name_rhs, similarity)

            normalizer.set_task_similarity(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity)

            # save for later
            similarities[data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs)] = similarity

    # set normalizer
    base_wdk.set_normalizer(normalizer)
    base_wdk.init_normalizer()

    # set up svm
    svm = SVMLight(param.cost, base_wdk, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    # normalize cost
    norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
    norm_c_neg = param.cost / float(len([l for l in data.labels if l == -1]))
    svm.set_C(norm_c_neg, norm_c_pos)

    # start training
    svm.train()

    # save additional information
    self.additional_information["svm objective"] = svm.get_objective()
    self.additional_information["num sv"] = svm.get_num_support_vectors()
    #self.additional_information["distances"] = distances
    self.additional_information["similarities"] = similarities

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in data.get_task_names():

        task_num = data.name_to_id(task_name)

        # save svm and task_num
        svms[task_name] = (task_num, param, svm)

    return svms
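# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the per-class cost
# normalization applied above. C is divided by the number of positive and
# negative examples so both classes contribute a comparable total slack
# penalty regardless of class imbalance; the pair is passed to svm.set_C.
# --------------------------------------------------------------------------
def normalized_costs(cost, labels):
    """ return (C_neg, C_pos) scaled by the respective class sizes """
    num_pos = float(len([l for l in labels if l == 1]))
    num_neg = float(len([l for l in labels if l == -1]))
    return cost / num_neg, cost / num_pos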
# NOTE: fragment; task_names, gammas, tree_normalizer, cost and data are
# expected to be defined in the enclosing scope. The normalizer built from
# gammas below is not the one attached to wdk_tree (tree_normalizer is).
normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

for t1_name in task_names:
    for t2_name in task_names:
        similarity = gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)]
        normalizer.set_task_similarity(data.name_to_id(t1_name), data.name_to_id(t2_name), similarity)

##################################################################
# Train SVMs
##################################################################

# create shogun objects
wdk_tree = shogun_factory.create_kernel(data.examples, param)
lab = shogun_factory.create_labels(data.labels)

wdk_tree.set_normalizer(tree_normalizer)
wdk_tree.init_normalizer()

print "--->", wdk_tree.get_normalizer().get_name()

svm_tree = SVMLight(cost, wdk_tree, lab)
svm_tree.set_linadd_enabled(False)
svm_tree.set_batch_computation_enabled(False)

svm_tree.train()

del wdk_tree
del tree_normalizer
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    # split for training weak_learners and boosting
    (train_weak, train_boosting) = split_data(train_data, 4)

    # merge data sets
    data = PreparedMultitaskData(train_weak, shuffle=True)

    # create shogun label
    lab = shogun_factory.create_labels(data.labels)

    ########################################################
    print "creating a kernel:"
    ########################################################

    # init seq handler
    pseudoseqs = SequencesHandler()

    classifiers = []

    # NOTE: pockets is expected to be defined elsewhere in this variant
    for pocket in pockets:

        print "creating normalizer"
        #import pdb
        #pdb.set_trace()
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

        print "processing pocket", pocket

        # set similarity
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():

                similarity = 0.0

                for pseudo_seq_pos in pocket:
                    similarity += float(pseudoseqs.get_similarity(task_name_lhs, task_name_rhs, pseudo_seq_pos-1))

                # normalize by pocket size
                similarity = similarity / float(len(pocket))

                print "pocket %s (%s, %s) = %f" % (str(pocket), task_name_lhs, task_name_rhs, similarity)

                normalizer.set_task_similarity(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity)

        print "creating empty kernel"
        kernel = shogun_factory.create_kernel(data.examples, param)

        print "setting normalizer"
        kernel.set_normalizer(normalizer)

        print "training SVM for pocket", pocket
        svm = self._train_single_svm(param, kernel, lab)

        classifiers.append(svm)

    print "done obtaining weak learners"

    # save additional info
    #self.additional_information["svm_objective"] = svm.get_objective()
    #self.additional_information["svm num sv"] = svm.get_num_support_vectors()
    #self.additional_information["post_weights"] = combined_kernel.get_subkernel_weights()
    #print self.additional_information

    ##################################################
    # combine weak learners for each task
    ##################################################

    # set constants
    some = 0.9
    import cvxmod

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in train_boosting.keys():

        instances = train_boosting[task_name]

        N = len(instances)
        F = len(pockets)

        examples = [inst.example for inst in instances]
        labels = [inst.label for inst in instances]

        # dim = (N x F)
        out = cvxmod.zeros((N, F))

        for i in xrange(F):
            svm = classifiers[i]
            tmp_out = self._predict_weak(svm, examples, data.name_to_id(task_name))

            out[:, i] = numpy.sign(tmp_out)
            #out[:, i] = tmp_out

        #TODO: fix (debugging only)
        helper.save("/tmp/out_sparse", (out, labels))
        pdb.set_trace()

        weights = solve_boosting(out, labels, some, solver="mosek")

        # NOTE: only the last weak learner is stored here; the boosting
        # weights computed above are not kept
        svms[task_name] = (data.name_to_id(task_name), svm)

    return svms