def load_confounders(self, confounder_filename):
    """Load confounder metadata from *confounder_filename* via FileIO."""
    return FileIO().load_metadata(confounder_filename)
def crossvalidate(self):
    """Run self.r replicates of self.v-fold cross-validation.

    For each replicate the sample set is re-shuffled and split into
    self.v partitions; each partition serves once as the test set while
    the remainder forms the training set.  Per-fold results objects are
    appended to self.replicates[replicate][fold] in the same order as
    self.test_configurations.

    Side effects (all optional, controlled by attributes):
      - self.outputFilename: dumps training/test sets and pickled SVM
        model maps under "<outputFilename>-Files/".
      - self.root_output: writes per-fold feature lists and a
        per-replicate tab-separated classification log.

    NOTE(review): a second crossvalidate() definition appears later in
    this file; if both live in the same class, the later one wins --
    confirm which is intended.
    """
    root_output = self.root_output
    if self.outputFilename != None:
        # rvf_curDate is computed but never used below -- presumably a
        # leftover from an older output-naming scheme.
        rvf_curDate = time.strftime("%Y-%m-%d-%H-%M-%S")
        # NOTE(review): outputPath is only bound on this branch, yet it
        # is read below whenever self.target_class is set -- NameError
        # if target_class is set while outputFilename is None.
        outputPath = self.outputFilename+"-Files"
        if not os.path.exists(outputPath):
            os.makedirs(outputPath)
    for replicate in xrange(self.r):
        if MPI_PARALLEL:
            # MPI mode is not implemented in this variant of the loop.
            pass
        else:
            replicate_plus_one = replicate+1
            print "Starting replicate %d"%(replicate_plus_one)
            self.replicates.append([])
            self._randomize_sample_set()
            partitions = self._split_sample_set(self.v)
            for i in xrange(self.v):
                self.replicates[replicate].append([])
                training_set, test_set = self._construct_training_and_testing_sets(partitions,i)
                print "Fold %d: training_set: %d, test set: %d"%(i,len(training_set),len(test_set))
                if ( self.target_class != None ):
                    # Persist this fold's raw training/test split.
                    fileio = FileIO()
                    trainingSetFile = outputPath+"/"+str(self.target_class)+"_R"+str(replicate)+"_F"+str(i)+"_training.set"
                    testSetFile = outputPath+"/"+str(self.target_class)+"_R"+str(replicate)+"_F"+str(i)+"_test.set"
                    print "Saving training set to: "+trainingSetFile
                    fileio.save_samples(training_set,trainingSetFile)
                    print "Saving test set to: "+testSetFile
                    fileio.save_samples(test_set,testSetFile)
                for test_configuration in self.test_configurations:
                    test_name = test_configuration.name
                    new_training_set = training_set
                    new_test_set = test_set
                    if test_configuration.feature_selector:
                        # Restrict both sets to the selected features.
                        features = test_configuration.feature_selector.select(training_set)
                        new_training_set = training_set.feature_select(features)
                        new_test_set = test_set.feature_select(features)
                    model = test_configuration.trainer.train(new_training_set)
                    if self.outputFilename != None:
                        if not hasattr(model, 'write'): # i.e. probably SVM model
                            # Save the SVM model plus pickled label/feature
                            # maps needed to reload and apply it later.
                            svmModelFile = outputPath+"/"+str(self.target_class)+"_R"+str(replicate)+"_F"+str(i)+"_svm.model"
                            model['svm_model'].save(filename=svmModelFile)
                            with open(svmModelFile+".classlabelmap",'a') as outfile:
                                pickle.dump(model["class_label_map"],outfile) #fails with model, because of SWIGpy object
                            with open(svmModelFile+".classlabelmapindex",'a') as outfile:
                                pickle.dump(model["class_label_map_index"],outfile)
                            with open(svmModelFile+".featuremapindex",'w') as outfile:
                                pickle.dump(new_training_set.get_index_to_feature(), outfile)
                    results = test_configuration.classifier.test(new_test_set,model)
                    self.replicates[replicate][i].append(results) #order of results same as order of configurations
                    if ( self.target_class != None ):
                        print results.print_classification_log()
                        print results
                    if root_output:
                        # NOTE(review): `features` is unbound here when no
                        # feature selector is configured -- NameError risk.
                        fout = open("%(root_output)s.r%(replicate_plus_one)d.v%(i)d.%(test_name)s.features"%(locals()),"w")
                        fout.write("\n".join(features))
                        fout.close()
            if root_output:
                # One tab-separated classification log per replicate:
                # sample, fold, true class, then one predicted-class
                # column per test configuration.
                fout = open("%(root_output)s.r%(replicate_plus_one)d.classification.log"%(locals()),"w")
                header_fields = ["sample","fold",self.sample_set.current_class]
                for test_configuration in self.test_configurations:
                    header_fields.append(test_configuration.name)
                # output_dictionary is never used below -- leftover.
                output_dictionary = {}
                output_lines = ["\t".join(header_fields)]
                for fold in xrange(self.v):
                    for classification_index in xrange(len(self.replicates[replicate][fold][0].classifications_list)):
                        main_sample_record = self.replicates[replicate][fold][0].classifications_list[classification_index]
                        output_line = [str(main_sample_record.who),str(fold+1),str(main_sample_record.true_class)]
                        for test_configuration_index in xrange(len(self.test_configurations)):
                            test_sample_record = self.replicates[replicate][fold][test_configuration_index].classifications_list[classification_index]
                            output_line.append(str(test_sample_record.predicted_class))
                        output_lines.append("\t".join(output_line))
                fout.write("\n".join(output_lines))
                fout.close()
            print "Finished replicate %d"%(replicate_plus_one)
def crossvalidate(self):
    """Run self.r replicates of self.v-fold cross-validation, evaluating
    every (completeness, contamination) combination on a perturbed copy
    of each fold's test set.

    Results are stored as
        self.replicates[replicate][fold][w][z] -> [results, ...]
    where w indexes self.completeness and z indexes self.contamination,
    appended per test configuration.

    Side effects mirror the earlier crossvalidate(): optional dumps of
    training/test sets, SVM model maps, feature lists and a per-replicate
    classification log, depending on self.outputFilename /
    self.target_class / self.root_output.

    NOTE(review): this redefines the crossvalidate() above -- if both are
    in the same class, this later definition wins.
    """
    root_output = self.root_output
    if self.outputFilename != None:
        # rvf_curDate is never used below -- leftover of an older
        # output-naming scheme.
        rvf_curDate = time.strftime("%Y-%m-%d-%H-%M-%S")
        # NOTE(review): outputPath is only bound on this branch but is
        # read below whenever self.target_class is set -- NameError if
        # target_class is set while outputFilename is None.
        outputPath = self.outputFilename + "-Files"
        if not os.path.exists(outputPath):
            os.makedirs(outputPath)
    for replicate in xrange(self.r):
        if MPI_PARALLEL:
            # MPI mode is not implemented in this variant of the loop.
            pass
        else:
            replicate_plus_one = replicate + 1
            print "Starting replicate %d" % (replicate_plus_one)
            self.replicates.append([])
            self._randomize_sample_set()
            # Keep the unmodified sample set aligned with the freshly
            # shuffled one.
            self.unmodified = self.unmodified._sort_by_sample_set(self.sample_set)
            partitions = self._split_sample_set(self.v)
            for i in xrange(self.v):
                self.replicates[replicate].append([])
                training_set, test_set = self._construct_training_and_testing_sets(partitions, i)
                print "Fold %d: training_set: %d, test set: %d" % (i, len(training_set), len(test_set))
                if (self.target_class != None):
                    # Persist this fold's raw training/test split.
                    fileio = FileIO()
                    trainingSetFile = outputPath + "/" + str(self.target_class) + "_R" + str(replicate) + "_F" + str(i) + "_training.set"
                    testSetFile = outputPath + "/" + str(self.target_class) + "_R" + str(replicate) + "_F" + str(i) + "_test.set"
                    print "Saving training set to: " + trainingSetFile
                    fileio.save_samples(training_set, trainingSetFile)
                    print "Saving test set to: " + testSetFile
                    fileio.save_samples(test_set, testSetFile)
                for test_configuration in self.test_configurations:
                    test_name = test_configuration.name
                    new_training_set = training_set
                    new_test_set = test_set
                    if test_configuration.feature_selector:
                        # Restrict both sets to the selected features.
                        features = test_configuration.feature_selector.select(training_set)
                        new_training_set = training_set.feature_select(features)
                        new_test_set = test_set.feature_select(features)
                    model = test_configuration.trainer.train(new_training_set)
                    if self.outputFilename != None:
                        if not hasattr(model, 'write'):  # i.e. probably SVM model
                            # Save the SVM model plus pickled label/feature
                            # maps needed to reload and apply it later.
                            svmModelFile = outputPath + "/" + str(self.target_class) + "_R" + str(replicate) + "_F" + str(i) + "_svm.model"
                            model['svm_model'].save(filename=svmModelFile)
                            with open(svmModelFile + ".classlabelmap", 'a') as outfile:
                                pickle.dump(model["class_label_map"], outfile)  #fails with model, because of SWIGpy object
                            with open(svmModelFile + ".classlabelmapindex", 'a') as outfile:
                                pickle.dump(model["class_label_map_index"], outfile)
                            with open(svmModelFile + ".featuremapindex", 'w') as outfile:
                                pickle.dump(new_training_set.get_index_to_feature(), outfile)
                    #####################################################################################
                    # completeness & contamination evaluation
                    #####################################################################################
                    # Pool the test samples' attribute index lists per class
                    # label; introduce_contamination() draws from these pools.
                    all_class_labels = new_test_set.get_class_labels()
                    sample_attribute_collection = {}
                    for index in all_class_labels:
                        sample_attribute_collection[index] = []
                    for sample in new_test_set.__iter__():
                        temp_attributes_list = list(sample.get_attributes_index_list())
                        sample_attribute_collection[sample.current_class_label].append(temp_attributes_list)
                    for w in range(0, len(self.completeness)):
                        self.replicates[replicate][i].append([])
                        incomplete_test_set = new_test_set.induce_incompleteness(self.completeness[w])
                        if len(sample_attribute_collection.keys()) != 2:
                            # Cross-contamination needs exactly two class
                            # labels; keep the result-structure shape with
                            # empty slots and skip this completeness level.
                            print(sample_attribute_collection.keys())
                            sys.stderr.write("Warning: skipping contamination of Fold %i in replicate %i: need exactly 2 different class labels\n" % (i, replicate))
                            for z in range(0, len(self.contamination)):
                                self.replicates[replicate][i][w].append([])
                            continue
                        for z in range(0, len(self.contamination)):
                            self.replicates[replicate][i][w].append([])
                            contaminated_test_set = incomplete_test_set.introduce_contamination(sample_attribute_collection, self.contamination[z])
                            # Project the perturbed test set back onto the
                            # training set's attribute space before testing.
                            contaminated_test_set = contaminated_test_set.map_test_set_attributes_to_training_set(new_training_set)
                            results = test_configuration.classifier.test(contaminated_test_set, model)
                            self.replicates[replicate][i][w][z].append(results)  #order of results same as order of configurations
                            if (self.target_class != None):
                                print results.print_classification_log()
                                print results
                            if root_output:
                                # NOTE(review): `features` is unbound here when
                                # no feature selector is configured.
                                fout = open("%(root_output)s.r%(replicate_plus_one)d.v%(i)d.%(test_name)s.features" % (locals()), "w")
                                fout.write("\n".join(features))
                                fout.close()
            if root_output:
                # One tab-separated classification log per replicate.
                # NOTE(review): this section still indexes
                # replicates[replicate][fold][k] as flat results objects, but
                # the nesting above now stores lists per (w, z) level --
                # looks stale relative to the new structure; confirm.
                fout = open("%(root_output)s.r%(replicate_plus_one)d.classification.log" % (locals()), "w")
                header_fields = ["sample", "fold", self.sample_set.current_class]
                for test_configuration in self.test_configurations:
                    header_fields.append(test_configuration.name)
                # output_dictionary is never used below -- leftover.
                output_dictionary = {}
                output_lines = ["\t".join(header_fields)]
                for fold in xrange(self.v):
                    for classification_index in xrange(len(self.replicates[replicate][fold][0].classifications_list)):
                        main_sample_record = self.replicates[replicate][fold][0].classifications_list[classification_index]
                        output_line = [str(main_sample_record.who), str(fold + 1), str(main_sample_record.true_class)]
                        for test_configuration_index in xrange(len(self.test_configurations)):
                            test_sample_record = self.replicates[replicate][fold][test_configuration_index].classifications_list[classification_index]
                            output_line.append(str(test_sample_record.predicted_class))
                        output_lines.append("\t".join(output_line))
                fout.write("\n".join(output_lines))
                fout.close()
            print "Finished replicate %d" % (replicate_plus_one)
# NOTE(review): this chunk begins mid-block -- the first error()/errorCount
# statements are almost certainly the body of a preceding
# `if not options.input_samples_filename:` that lies outside this view.
error("Please provide a genotype sample file with -s /path/to/genotype.file")
errorCount += 1
# Validate the remaining mandatory command-line options, accumulating a
# count so that every missing-argument message is printed before exiting.
if not options.input_classes_filename:
    error("Please provide a phenotype class file with -c /path/to/phenotype.file")
    errorCount += 1
if not options.target_class:
    error("Please provide the phenotype target to be predicted with -t \"TRAITNAME\"")
    errorCount += 1
if not options.output_filename:
    error("Please specify a file for the output with -o /path/to/result.file")
    errorCount += 1
if errorCount > 0:
    error("For help on usage, try calling:\n\tpython %s -h" % os.path.basename(sys.argv[0]))
    exit(1)
# Load the genotype samples.
fileio = FileIO()
samples = fileio.load_samples(options.input_samples_filename)
if options.feature_select:
    # Optionally restrict the sample set to the top-n association rules,
    # ranked by the chosen accuracy score.
    print "Selecting top %d features from %s, ordered by %s"%(options.feature_select_top_n,options.feature_select,options.feature_select_score)
    from pica.AssociationRule import load_rules,AssociationRuleSet
    selected_rules = AssociationRuleSet()
    rules = load_rules(options.feature_select)
    rules.set_target_accuracy(options.feature_select_score)
    selected_rules.extend(rules[:options.feature_select_top_n])
    samples = samples.feature_select(selected_rules)
# Attach phenotype class labels and select the target trait.
classes = fileio.load_classes(options.input_classes_filename)
samples.load_class_labels(classes)
print "Sample set has %d features."%(samples.get_number_of_features())
samples.set_current_class(options.target_class)
print "Parameters from %s"%(options.parameters)
# Trailing comma keeps the cursor on this line for the status text that
# presumably follows this chunk.
print "Compressing features...",
def replicateProcess(parametertuple): training_set, test_set, target_class, test_configurations, outputFilename, completeness, contamination, root_output, replicate, fold = parametertuple w_tot = len(completeness) z_tot = len(contamination) output=[] replicate_plus_one=replicate+1 print "Fold %d: training_set: %d, test set: %d"%(fold,len(training_set),len(test_set)) if ( target_class != None ): fileio = FileIO() trainingSetFile = outputPath+"/"+str(target_class)+"_R"+str(replicate)+"_F"+str(fold)+"_training.set" testSetFile = outputPath+"/"+str(target_class)+"_R"+str(replicate)+"_F"+str(fold)+"_test.set" print "Saving training set to: "+trainingSetFile fileio.save_samples(training_set,trainingSetFile) print "Saving test set to: "+testSetFile fileio.save_samples(test_set,testSetFile) for test_configuration_index in xrange(len(test_configurations)): test_configuration=test_configurations[test_configuration_index] test_name = test_configuration.name new_training_set = training_set new_test_set = test_set if test_configuration.feature_selector: features = test_configuration.feature_selector.select(training_set) new_training_set = training_set.feature_select(features) new_test_set = test_set.feature_select(features) model = test_configuration.trainer.train(new_training_set) if outputFilename != None: if not hasattr(model, 'write'): # i.e. 
probably SVM model svmModelFile = outputPath+"/"+str(target_class)+"_R"+str(replicate)+"_F"+str(fold)+"_svm.model" model['svm_model'].save(filename=svmModelFile) with open(svmModelFile+".classlabelmap",'a') as outfile: pickle.dump(model["class_label_map"],outfile) #fails with model, because of SWIGpy object with open(svmModelFile+".classlabelmapindex",'a') as outfile: pickle.dump(model["class_label_map_index"],outfile) with open(svmModelFile+".featuremapindex",'w') as outfile: pickle.dump(new_training_set.get_index_to_feature(), outfile) all_class_labels=new_test_set.get_class_labels() sample_attribute_collection={} for index in all_class_labels: sample_attribute_collection[index]=[] for sample in new_test_set.__iter__(): temp_attributes_list=sample.get_attributes_index_list() sample_attribute_collection[sample.current_class_label].append(temp_attributes_list) for w in xrange(w_tot): output.append([]) incomplete_test_set = new_test_set.induce_incompleteness(completeness[w]) err=0 for z in xrange(z_tot): #output[w][z].append([]) if round(contamination[z],1) == 0.0: results = test_configuration.classifier.test(incomplete_test_set.map_test_set_attributes_to_training_set(new_training_set),model) summary = ClassificationSummary(results) output[w].append(summary) elif len(sample_attribute_collection.keys())==2: #do crosscontamination if exactly 2 class labels given contaminated_test_set = incomplete_test_set.introduce_contamination(sample_attribute_collection,contamination[z]) contaminated_test_set = contaminated_test_set.map_test_set_attributes_to_training_set(new_training_set) results = test_configuration.classifier.test(contaminated_test_set,model) summary = ClassificationSummary(results) output[w].append(summary) if ( target_class != None ): print results.print_classification_log() print results if root_output: fout = open("%(root_output)s.r%(replicate_plus_one)d.v%(fold)d.%(test_name)s.features"%(locals()),"w") fout.write("\n".join(features)) fout.close() elif 
err==0: sys.stderr.write("Warning: skipping contamination of fold %i of replicate %i: exactly 2 different class labels needed!"%(fold,replicate)) err=1 #print(replicates[replicate][fold][w][z]) # if root_output: # fout = open("%(root_output)s.r%(replicate_plus_one)d.classification.log"%(locals()),"w") # header_fields = ["sample","fold",sample_set.current_class] # for test_configuration in test_configurations: # header_fields.append(test_configuration.name) # output_dictionary = {} # output_lines = ["\t".join(header_fields)] # for fold in xrange(v): # for classification_index in xrange(len(replicates[replicate][fold][0].classifications_list)): # main_sample_record = replicates[replicate][fold][0].classifications_list[classification_index] # output_line = [str(main_sample_record.who),str(fold+1),str(main_sample_record.true_class)] # for test_configuration_index in xrange(len(test_configurations)): # test_sample_record = replicates[replicate][fold][test_configuration_index].classifications_list[classification_index] # output_line.append(str(test_sample_record.predicted_class)) # output_lines.append("\t".join(output_line)) # fout.write("\n".join(output_lines)) # fout.close() print "Finished replicate %d, fold %d"%(replicate_plus_one,fold) return output