def _makeInstance(self, i, instances=None):
    """
    Builds an Instance from a mapping of attribute names to raw values.

    Parameter(s):

        'i' -- mapping (must provide iteritems()) from attribute name to
               the raw value for that attribute
        'instances' -- optional dataset; when not None the new instance is
                       attached to it via setDataset() before any value
                       is set

    Return:

        the populated Instance
    """
    result = Instance(len(i))
    if instances is not None:
        result.setDataset(instances)

    for name, raw in i.iteritems():
        # names listed in self.numericAttributes are wrapped as Double,
        # every other attribute is wrapped as String
        wrapper = Double if name in self.numericAttributes else String
        wrapped = wrapper(raw)
        result.setValue(self.attName2Obj[name], wrapped)

    return result
def build_instances(state,dataset):
    """
    Trains a J48 classifier on the given rows and returns its parsed tree.

    Every row is turned into an Instance: all columns but the last are
    stored as floats, the last column is stored as the nominal class label
    (attribute "toClassify"). Cells that cannot be converted to float are
    left unset on the instance.

    Parameter(s):

        'state' -- opaque value handed back unchanged as the first element
                   of the result
        'dataset' -- non-empty sequence of rows; each row is an indexable
                     sequence whose last element is one of the class labels
                     listed in class_attributes (presumably 7 columns
                     matching header -- confirm against the caller)

    Return:

        the tuple (state, parse_tree(str(j48))) where j48 is the trained
        classifier
    """
    class_attributes = ["Sunny", "Fog", "Rain", "Snow", "Hail", "Thunder", "Tornado"]
    header = ["state","lat", "lon", "day","temp","dewp","weather"]

    # build attributes based on the header: one plain attribute per
    # non-class column
    attributes = [Attribute(h) for h in header[:-1]]

    # add the classification attribute with its fixed set of labels
    classification_vector = FastVector(len(class_attributes))
    for c in class_attributes:
        classification_vector.addElement(c)
    attributes.append(Attribute("toClassify", classification_vector))

    fvWekaAttributes = FastVector(len(dataset[0]))
    for a in attributes:
        fvWekaAttributes.addElement(a)

    training_set = Instances("C4.5Set", fvWekaAttributes, len(dataset))
    training_set.setClassIndex(len(header)-1)

    for d in dataset:
        inst = Instance(len(d))
        for i in range(len(d)-1):
            # Best effort: a cell that is not a number is skipped and stays
            # unset. Only conversion failures are tolerated; a bare except
            # here would also hide KeyboardInterrupt/SystemExit and any
            # unrelated error.
            try:
                numeric_value = float(d[i])
            except (TypeError, ValueError):
                continue
            inst.setValue(fvWekaAttributes.elementAt(i), numeric_value)
        # the last column carries the class label
        inst.setValue(fvWekaAttributes.elementAt(len(d)-1), d[-1])
        training_set.add(inst)

    j48 = J48()
    j48.buildClassifier(training_set)
    return state,parse_tree(str(j48))
class JeroR(Classifier, JythonSerializableObject):
    """
    JeroR is a Jython implementation of the Weka classifier ZeroR

    'author' -- FracPete (fracpete at waikato dot ac dot nz)

    'version' -- $Revision$
    """

    # the documentation can be generated with HappyDoc:
    #   http://happydoc.sourceforge.net/
    # Example command:
    #   happydoc --title Weka -d ./doc ./src

    # the chosen class value
    __ClassValue = Instance.missingValue()

    # the class attribute
    __Class = None

    # the counts for each class label (stays None for a numeric class)
    __Counts = None

    def listOptions(self):
        """
        Returns an enumeration describing the available options.

        Return:

            an enumeration of all the available options.
        """
        return Classifier.listOptions(self)

    def setOptions(self, options):
        """
        Parses a given list of options.

        Parameter(s):

            'options' -- the list of options as an array of strings
        """
        Classifier.setOptions(self, options)

    def getOptions(self):
        """
        Gets the current settings of the Classifier as string array.

        Return:

            an array of strings suitable for passing to setOptions
        """
        return Classifier.getOptions(self)

    def getCapabilities(self):
        """
        returns the capabilities of this classifier

        Return:

            the capabilities of this classifier
        """
        result = Classifier.getCapabilities(self)

        # attributes
        result.enable(Capability.NOMINAL_ATTRIBUTES)
        result.enable(Capability.NUMERIC_ATTRIBUTES)
        result.enable(Capability.DATE_ATTRIBUTES)
        result.enable(Capability.STRING_ATTRIBUTES)
        result.enable(Capability.RELATIONAL_ATTRIBUTES)
        result.enable(Capability.MISSING_VALUES)

        # class
        result.enable(Capability.NOMINAL_CLASS)
        result.enable(Capability.NUMERIC_CLASS)
        result.enable(Capability.DATE_CLASS)
        result.enable(Capability.MISSING_CLASS_VALUES)

        # instances
        result.setMinimumNumberInstances(0)

        return result

    def buildClassifier(self, instances):
        """
        builds the ZeroR classifier with the given data

        For a nominal class the per-label counts start at 1 and the
        weights of the instances are added on top; the prediction is the
        index of the largest count and the counts are normalized by the
        total weight. For a numeric class the prediction is the weighted
        mean of the class values (left at 0 when the total weight is not
        greater than 0).

        Parameter(s):

            'instances' -- the data to build the classifier from
        """
        self.getCapabilities().testWithFail(instances)

        # work on a copy and remove instances with missing class
        instances = Instances(instances)
        instances.deleteWithMissingClass()

        classAttribute = instances.classAttribute()
        # the attribute type cannot change while building, so query it once
        classIsNumeric = classAttribute.isNumeric()
        classIsNominal = classAttribute.isNominal()

        sumOfWeights = 0
        self.__Class = classAttribute
        self.__ClassValue = 0
        self.__Counts = None

        if (not classIsNumeric) and classIsNominal:
            # every label starts with a count of 1, which is also
            # reflected in the initial sum of weights
            self.__Counts = jarray.zeros(instances.numClasses(), 'd')
            for i in range(len(self.__Counts)):
                self.__Counts[i] = 1
            sumOfWeights = instances.numClasses()

        enu = instances.enumerateInstances()
        while (enu.hasMoreElements()):
            instance = enu.nextElement()
            if (not instance.classIsMissing()):
                if classIsNominal:
                    self.__Counts[int(instance.classValue())] += instance.weight()
                else:
                    self.__ClassValue += instance.weight() * instance.classValue()
                sumOfWeights += instance.weight()

        if classIsNumeric:
            if (Utils.gr(sumOfWeights, 0)):
                self.__ClassValue /= sumOfWeights
        else:
            self.__ClassValue = Utils.maxIndex(self.__Counts)
            Utils.normalize(self.__Counts, sumOfWeights)

    def classifyInstance(self, instance):
        """
        returns the prediction for the given instance

        Parameter(s):

            'instance' -- the instance to predict the class value for
                          (not inspected; the prediction is constant)

        Return:

            the prediction for the given instance
        """
        return self.__ClassValue

    def distributionForInstance(self, instance):
        """
        returns the class distribution for the given instance

        Parameter(s):

            'instance' -- the instance to calculate the class distribution
                          for (not inspected)

        Return:

            the class distribution for the given instance: a copy of the
            normalized label counts, or a one-element array holding the
            predicted value when no counts are kept
        """
        # identity test: '== None' could be routed to the array's own
        # comparison instead of a plain None check
        if self.__Counts is None:
            result = jarray.zeros(1, 'd')
            result[0] = self.__ClassValue
            return result
        return self.__Counts[:]

    def toString(self):
        """
        Prints a string representation of the classifier

        Return:

            string representation of the classifier
        """
        if self.__Class is None:
            return "JeroR: No model built yet."
        if self.__Counts is None:
            return "JeroR predicts class value: " + str(self.__ClassValue)
        return "JeroR predicts class value: " + str(
            self.__Class.value(int(self.__ClassValue)))
def processThread(self, tempCursor, postChain, data, dataNum, j48, linreg):
    """
    Assigns a parent to every post of a thread and stores the links.

    The first two posts are linked directly. Each later post is paired
    with every earlier post; each pair is scored by 'linreg' and
    classified by 'j48', and one earlier post is chosen as the parent.
    Every (parent, child) pair is written with dbHandler.insert().

    Parameter(s):

        'tempCursor' -- handed through to dbHandler.insert()
        'postChain' -- sequence of posts; element [0] of a post is used as
                       its id and element [3] is fed to preprocessCorpus()
                       (presumably the post text -- confirm)
        'data' -- dataset attached to a pair before it is given to 'j48';
                  also supplies the attribute count and the nominal values
                  of attribute 6
        'dataNum' -- dataset attached to a pair before it is given to
                     'linreg'
        'j48' -- classifier; a result of 1.0 counts as a hit
        'linreg' -- classifier whose result is used as a score

    Return:

        None (also returned right away when there are fewer than two
        posts)
    """
    if len(postChain) < 2:
        return  # nothing to link: there are no replies

    threadSoFar = []
    self.threadWrapper_add(threadSoFar, postChain[0], postChain[1])
    preprocessCorpus([item[3] for item in postChain])
    dbHandler.insert(tempCursor, postChain[0][0], postChain[1][0])

    for i in xrange(2, len(postChain)):
        childPost = postChain[i]
        childId = childPost[0]

        # two highest linreg scores seen so far, each as [postid, score]
        bestLinreg = [-1, -1.0]
        secondLinreg = [-1, -1.0]
        # id of the most recent post j48 accepted, and the number of hits
        lastJ48Id = -1
        j48Hits = 0

        for j in xrange(i):
            candidateId = postChain[j][0]

            # add the pair temporarily, just long enough to get features
            self.threadWrapper_add(threadSoFar, postChain[j], childPost)
            (time, cos, quoted, newestparent, dist, pos, lastown,
             diaLen, diaDist, prevsparent, firstpost) = \
                self.computeFeatures(threadSoFar)
            threadSoFar = threadSoFar[:-1]

            postPair = Instance(data.numAttributes())
            slot = 0
            for featureValue in (time, cos, quoted, newestparent, dist, pos):
                postPair.setValue(slot, featureValue)
                slot += 1
            # attribute 6 is stored as the index of its nominal value
            postPair.setValue(6, data.attribute(6).indexOfValue(lastown))
            slot = 7
            for featureValue in (diaLen, diaDist, prevsparent, firstpost):
                postPair.setValue(slot, featureValue)
                slot += 1

            postPair.setDataset(dataNum)
            linregValue = linreg.classifyInstance(postPair)
            if linregValue > bestLinreg[1]:
                secondLinreg = bestLinreg
                bestLinreg = [candidateId, linregValue]
            elif linregValue > secondLinreg[1]:
                secondLinreg = [candidateId, linregValue]

            postPair.setDataset(data)
            if j48.classifyInstance(postPair) == 1.0:
                lastJ48Id = candidateId
                j48Hits += 1

        # NOTE(review): with no j48 hit the runner-up linreg candidate is
        # used, not the top one -- confirm this is intended. If the chosen
        # id is still the -1 placeholder the lookup below finds no post.
        if j48Hits == 0:
            parentId = secondLinreg[0]
        elif j48Hits > 1:
            parentId = bestLinreg[0]
        else:
            parentId = lastJ48Id

        parentPost = [item for item in postChain if item[0] == parentId][0]
        self.threadWrapper_add(threadSoFar, parentPost, childPost)
        dbHandler.insert(tempCursor, parentId, childId)