def __logLikelihoodContinuous(self, classLabel: str, instance: Instance) -> float:
    """
    Scores an instance against a class label using per-attribute means and deviations.

    The score starts as the logarithm of the prior probability of classLabel. For every attribute
    index of the instance, -0.5 * ((xi - mi) / si) ** 2 is added, where mi and si are the class mean
    and class deviation stored for that index. Attributes whose deviation equals zero are skipped.

    NOTE(review): no -log(si) term is added, so the result is not a fully normalised Gaussian
    log density; confirm this is intended.

    PARAMETERS
    ----------
    classLabel : str
        Class label to evaluate.
    instance : Instance
        Instance whose attribute values are scored.

    RETURNS
    -------
    float
        The accumulated log likelihood of the given class label and instance.
    """
    prior = self.priorDistribution.getProbability(classLabel)
    total = math.log(prior)
    for index in range(instance.attributeSize()):
        value = instance.getAttribute(index).getValue()
        mean = self.__classMeans[classLabel].getValue(index)
        deviation = self.__classDeviations[classLabel].getValue(index)
        if deviation == 0:
            # A zero deviation would divide by zero; such attributes contribute nothing.
            continue
        standardized = (value - mean) / deviation
        total += -0.5 * math.pow(standardized, 2)
    return total
def __checkDefinition(self, instance: Instance) -> bool:
    """
    Verifies that the attribute classes of an instance agree with the stored data definition.

    Each attribute is matched against BinaryAttribute, DiscreteIndexedAttribute, DiscreteAttribute
    and ContinuousAttribute, in that order. For the first class that matches, the attribute type
    recorded in the data definition at the same index must be the corresponding AttributeType.
    Attributes matching none of these classes are not checked.

    PARAMETERS
    ----------
    instance : Instance
        Instance whose attribute types are checked.

    RETURNS
    -------
    bool
        False as soon as one attribute disagrees with the data definition, True otherwise.
    """
    # The first matching class wins, so the order of this table is significant.
    expectedTypes = (
        (BinaryAttribute, AttributeType.BINARY),
        (DiscreteIndexedAttribute, AttributeType.DISCRETE_INDEXED),
        (DiscreteAttribute, AttributeType.DISCRETE),
        (ContinuousAttribute, AttributeType.CONTINUOUS),
    )
    for index in range(instance.attributeSize()):
        attribute = instance.getAttribute(index)
        for attributeClass, expectedType in expectedTypes:
            if not isinstance(attribute, attributeClass):
                continue
            if self.__definition.getAttributeType(index) is not expectedType:
                return False
            break
    return True
def __logLikelihoodDiscrete(self, classLabel: str, instance: Instance) -> float:
    """
    Scores an instance against a class label using per-attribute value distributions.

    The score starts as the logarithm of the prior probability of classLabel. For every attribute
    index of the instance, the logarithm of the Laplace-smoothed probability of the attribute value,
    taken from the distribution stored for that class and index, is added.

    PARAMETERS
    ----------
    classLabel : str
        Class label to evaluate.
    instance : Instance
        Instance whose attribute values are scored.

    RETURNS
    -------
    float
        The accumulated log likelihood of the given class label and instance.
    """
    prior = self.priorDistribution.getProbability(classLabel)
    total = math.log(prior)
    distributions = self.__classAttributeDistributions.get(classLabel)
    for index in range(instance.attributeSize()):
        value = instance.getAttribute(index).getValue()
        smoothed = distributions[index].getProbabilityLaplaceSmoothing(value)
        total += math.log(smoothed)
    return total
def predict(self, instance: Instance) -> str:
    """
    Returns the candidate class label with the largest metric value.

    For a CompositeInstance the candidates are its possible class labels; otherwise they are all
    items of the prior distribution. Candidates that are not contained in the prior distribution
    are ignored. If no candidate produces a metric greater than negative infinity, the default
    is returned: the first possible class label for a CompositeInstance, or the most frequent
    item of the prior distribution otherwise.

    PARAMETERS
    ----------
    instance : Instance
        Instance to predict.

    RETURNS
    -------
    str
        The class which has the maximum value of metric.
    """
    if isinstance(instance, CompositeInstance):
        candidates = instance.getPossibleClassLabels()
        predictedClass = candidates[0]
    else:
        predictedClass = self.priorDistribution.getMaxItem()
        candidates = [self.priorDistribution.getItem(i)
                      for i in range(len(self.priorDistribution))]
    # Start from negative infinity rather than a fixed finite floor, so that a class whose
    # metric is a very large negative number can still be selected.
    maxMetric = float("-inf")
    for Ci in candidates:
        if not self.priorDistribution.containsItem(Ci):
            continue
        metric = self.calculateMetric(instance, Ci)
        if metric > maxMetric:
            maxMetric = metric
            predictedClass = Ci
    return predictedClass
def distance(self, instance1: Instance, instance2: Instance) -> float:
    """
    Accumulates a per-attribute dissimilarity between two instances.

    For each attribute index of instance1: when both attributes are DiscreteAttribute, 1 is added
    if the value of the first is not None and differs from the value of the second; otherwise,
    when both attributes are ContinuousAttribute, the squared difference of their values is added.
    Other attribute combinations contribute nothing. No square root is taken of the sum.

    PARAMETERS
    ----------
    instance1 : Instance
        First instance; its attribute count drives the iteration.
    instance2 : Instance
        Second instance.

    RETURNS
    -------
    float
        Sum of the per-attribute contributions described above.
    """
    total = 0
    for index in range(instance1.attributeSize()):
        first = instance1.getAttribute(index)
        bothDiscrete = isinstance(first, DiscreteAttribute) and \
            isinstance(instance2.getAttribute(index), DiscreteAttribute)
        if bothDiscrete:
            firstValue = first.getValue()
            if firstValue is not None and firstValue != instance2.getAttribute(index).getValue():
                total += 1
            continue
        bothContinuous = isinstance(first, ContinuousAttribute) and \
            isinstance(instance2.getAttribute(index), ContinuousAttribute)
        if bothContinuous:
            gap = first.getValue() - instance2.getAttribute(index).getValue()
            total += math.pow(gap, 2)
    return total
def generateInstanceFromSentence(self, sentence: Sentence, wordIndex: int) -> Instance:
    """
    Generates a single classification instance for the word at the given index of a sentence,
    labelled with the string form of the word's named entity type.

    PARAMETERS
    ----------
    sentence : Sentence
        Input sentence.
    wordIndex : int
        The index of the word in the sentence.

    RETURNS
    -------
    Instance
        Instance whose class label is the named entity string of the word and whose attributes
        are filled in by addAttributes; None when the word is not an AnnotatedWord.
    """
    word = sentence.getWord(wordIndex)
    if not isinstance(word, AnnotatedWord):
        # Only annotated words carry a named entity type; state the fallback explicitly
        # instead of relying on an implicit fall-through.
        return None
    classLabel = NamedEntityType.getNamedEntityString(word.getNamedEntityType())
    current = Instance(classLabel)
    self.addAttributes(current, sentence, wordIndex)
    return current
def calculateRMinusY(self, instance: Instance, inputVector: Vector, weights: Matrix) -> Vector:
    """
    Computes the difference between the target vector of an instance and the normalized output.

    The target vector r has self.K entries, all zero except a 1.0 at the position of the
    instance's class label within self.classLabels. The output is obtained by multiplying the
    weights matrix with the input vector from the right and passing the result through
    normalizeOutput.

    PARAMETERS
    ----------
    instance : Instance
        Instance is used to get class labels.
    inputVector : Vector
        Vector to multiply weights.
    weights : Matrix
        Matrix of weights

    RETURNS
    -------
    Vector
        Difference between the target vector and the normalized output.
    """
    target = Vector()
    labelPosition = self.classLabels.index(instance.getClassLabel())
    target.initAllZerosExceptOne(self.K, labelPosition, 1.0)
    rawOutput = weights.multiplyWithVectorFromRight(inputVector)
    normalized = self.normalizeOutput(rawOutput)
    return target.difference(normalized)
def nearestNeighbors(self, instance: Instance) -> InstanceList:
    """
    Collects the k stored instances closest to the given instance.

    Every stored instance is paired with its distance to the given instance in a KnnInstance.
    When the given instance is a CompositeInstance, only stored instances whose class label is
    among its possible class labels are considered. The candidates are sorted with the comparator
    returned by makeComparator and the first k of them (or fewer, if fewer exist) are returned.

    PARAMETERS
    ----------
    instance : Instance
        Instance to find nearest neighbors

    RETURNS
    -------
    InstanceList
        The first k instances of the sorted candidate list as an InstanceList.
    """
    isComposite = isinstance(instance, CompositeInstance)
    allowedLabels = instance.getPossibleClassLabels() if isComposite else []
    candidates = []
    for index in range(self.__data.size()):
        stored = self.__data.get(index)
        if isComposite and stored.getClassLabel() not in allowedLabels:
            continue
        separation = self.__distanceMetric.distance(stored, instance)
        candidates.append(KnnInstance(stored, separation))
    candidates.sort(key=cmp_to_key(self.makeComparator()))
    neighbors = InstanceList()
    count = min(self.__k, len(candidates))
    for index in range(count):
        neighbors.add(candidates[index].instance)
    return neighbors
def generateInstanceFromSentence(self, sentence: Sentence, wordIndex: int) -> Instance:
    """
    Generates a single classification instance of the morphological disambiguation problem for
    the given word of the given sentence. The class label is the transition list of the word's
    parse. Attributes are added for each of the windowSize positions preceding the word (using
    the "<s>" placeholder for positions before the start of the sentence) and then for the word
    itself.

    NOTE(review): earlier documentation mentioned an InstanceNotGenerated error for words without
    a morphological parse; nothing in this method raises it explicitly.

    PARAMETERS
    ----------
    sentence : Sentence
        Input sentence.
    wordIndex : int
        The index of the word in the sentence.

    RETURNS
    -------
    Instance
        Classification instance; None when the word is not an AnnotatedWord.
    """
    word = sentence.getWord(wordIndex)
    if not isinstance(word, AnnotatedWord):
        # Only annotated words carry a parse; state the fallback explicitly instead of
        # relying on an implicit fall-through.
        return None
    current = Instance(word.getParse().getTransitionList())
    for position in range(wordIndex - self.windowSize, wordIndex):
        if position >= 0:
            self.addAttributesForPreviousWords(current, sentence, position)
        else:
            # The window reaches before the first word of the sentence.
            self.addAttributesForEmptyWords(current, "<s>")
    self.addAttributesForPreviousWords(current, sentence, wordIndex)
    return current
def convertInstance(self, instance: Instance):
    """
    Replaces the attributes of an instance with its projections onto the stored eigenvectors.

    A Vector is built from the continuous attributes of the instance, all attributes are removed
    from the instance, and then one ContinuousAttribute per eigenvector is added whose value is
    the dot product of that Vector with the eigenvector.

    PARAMETERS
    ----------
    instance : Instance
        Instance that will be converted to ContinuousAttribute by using eigenvectors.
    """
    original = Vector(instance.continuousAttributes())
    instance.removeAllAttributes()
    for eigenvector in self.__eigenvectors:
        projection = original.dotProduct(eigenvector)
        instance.addAttribute(ContinuousAttribute(projection))
def convertInstance(self, instance: Instance):
    """
    Normalizes the continuous attributes of a single instance in place.
    For every continuous attribute i, new x_i = (x_i - m_i) / s_i, where m_i and s_i are read
    from the stored average instance and standard deviation instance at the same index.
    Attributes that are not ContinuousAttribute are left untouched.

    PARAMETERS
    ----------
    instance : Instance
        Instance whose attributes will be normalized.
    """
    for index in range(instance.attributeSize()):
        attribute = instance.getAttribute(index)
        if not isinstance(attribute, ContinuousAttribute):
            continue
        mean = self.__averageInstance.getAttribute(index).getValue()
        deviation = self.__standardDeviationInstance.getAttribute(index).getValue()
        # NOTE(review): there is no guard for a zero deviation; with plain Python numbers
        # the division below raises ZeroDivisionError in that case.
        attribute.setValue((attribute.getValue() - mean) / deviation)
def discreteCheck(self, instance: Instance) -> bool:
    """
    Checks that every discrete attribute of the given instance is a discrete indexed attribute.

    PARAMETERS
    ----------
    instance : Instance
        Instance to check.

    RETURNS
    -------
    bool
        False if any attribute is a DiscreteAttribute without being a DiscreteIndexedAttribute,
        True otherwise (including when the instance has no attributes).
    """
    attributes = (instance.getAttribute(index) for index in range(instance.attributeSize()))
    return all(
        not isinstance(attribute, DiscreteAttribute) or isinstance(attribute, DiscreteIndexedAttribute)
        for attribute in attributes)
def removeDiscreteAttributesFromInstance(self, instance: Instance, size: int):
    """
    Removes from the instance every attribute whose original index has a non-empty attribute
    distribution.

    A running position is kept because each removal shifts the following attributes: the
    position only advances when the attribute at the current original index is kept.

    PARAMETERS
    ----------
    instance : Instance
        Instance to remove attributes from.
    size : int
        Number of original attribute indexes to walk over.
    """
    position = 0
    for originalIndex in range(size):
        if len(self.attributeDistributions[originalIndex]) == 0:
            # Attribute is kept, so the next candidate sits one slot further.
            position += 1
            continue
        instance.removeAttribute(position)
def convertInstance(self, instance: Instance):
    """
    Converts discrete attributes of a single instance to indexed version.

    For every original attribute index with a non-empty attribute distribution, a
    DiscreteIndexedAttribute is appended, built from the attribute's string form, the index of
    that string in the distribution, and the size of the distribution. The original attributes at
    those indexes are then removed through removeDiscreteAttributesFromInstance.

    PARAMETERS
    ----------
    instance : Instance
        The instance to be converted.
    """
    originalSize = instance.attributeSize()
    for position in range(originalSize):
        distribution = self.attributeDistributions[position]
        valueCount = len(distribution)
        if valueCount == 0:
            continue
        text = instance.getAttribute(position).__str__()
        indexed = DiscreteIndexedAttribute(text, distribution.getIndex(text), valueCount)
        instance.addAttribute(indexed)
    self.removeDiscreteAttributesFromInstance(instance, originalSize)
def createInputVector(self, instance: Instance):
    """
    Stores the vector form of the given instance in self.x with 1.0 inserted at position 0.

    PARAMETERS
    ----------
    instance : Instance
        Instance to convert to a vector.
    """
    inputVector = instance.toVector()
    self.x = inputVector
    # Position 0 receives the constant 1.0 element.
    inputVector.insert(0, 1.0)
def distance(self, instance1: Instance, instance2: Instance) -> float:
    """
    Calculates Mahalanobis distance between two instances as
    (x^(1) - x^(2)) S (x^(1) - x^(2))^T, where S is the stored inverse covariance matrix.

    PARAMETERS
    ----------
    instance1 : Instance
        First instance.
    instance2 : Instance
        Second instance.

    RETURNS
    -------
    float
        Mahalanobis distance between two instances.
    """
    difference = instance1.toVector()
    other = instance2.toVector()
    # The return value of subtract is not used, so it is presumably an in-place update of
    # difference -- verify against the Vector implementation.
    difference.subtract(other)
    transformed = self.__covarianceInverse.multiplyWithVectorFromLeft(difference)
    return transformed.dotProduct(difference)
def __setDefinition(self, instance: Instance):
    """
    Builds the data definition from the attribute classes of the given instance.

    Each attribute is matched against BinaryAttribute, DiscreteIndexedAttribute,
    DiscreteAttribute and ContinuousAttribute, in that order, and the AttributeType of the first
    match is recorded. Attributes matching none of these classes add no entry.

    PARAMETERS
    ----------
    instance : Instance
        Instance input.
    """
    # The first matching class wins, so the order of this table is significant.
    typeByClass = (
        (BinaryAttribute, AttributeType.BINARY),
        (DiscreteIndexedAttribute, AttributeType.DISCRETE_INDEXED),
        (DiscreteAttribute, AttributeType.DISCRETE),
        (ContinuousAttribute, AttributeType.CONTINUOUS),
    )
    attributeTypes = []
    for index in range(instance.attributeSize()):
        attribute = instance.getAttribute(index)
        for attributeClass, attributeType in typeByClass:
            if isinstance(attribute, attributeClass):
                attributeTypes.append(attributeType)
                break
    self.__definition = DataDefinition(attributeTypes)
def satisfy(self, instance: Instance):
    """
    Tests whether the attribute of the instance at the stored attribute index satisfies this
    condition.

    - Stored value is a DiscreteIndexedAttribute: satisfied when the stored index is -1, or
      when the instance attribute's index equals the stored index.
    - Stored value is a DiscreteAttribute: satisfied when the two values are equal.
    - Stored value is a ContinuousAttribute: when the comparison is "<" the instance value must
      be less than or equal to the stored value; for any other comparison it must be strictly
      greater.
    - Any other stored value type: not satisfied.

    PARAMETERS
    ----------
    instance : Instance
        Instance to compare.

    RETURNS
    -------
    bool
        True if given instance satisfies the condition.
    """
    expected = self.__value
    if isinstance(expected, DiscreteIndexedAttribute):
        if expected.getIndex() == -1:
            # A stored index of -1 accepts every instance.
            return True
        return instance.getAttribute(self.__attributeIndex).getIndex() == expected.getIndex()
    if isinstance(expected, DiscreteAttribute):
        return instance.getAttribute(self.__attributeIndex).getValue() == expected.getValue()
    if isinstance(expected, ContinuousAttribute):
        if self.__comparison == "<":
            return instance.getAttribute(self.__attributeIndex).getValue() <= expected.getValue()
        return instance.getAttribute(self.__attributeIndex).getValue() > expected.getValue()
    return False
def initWithFile(self, fileName: str):
    """
    Constructor for generating a new DataSet from given File.

    Every line is split on commas; the last field is the class label and the others are
    attribute values. The first line also determines the data definition: a field that parses
    as a float yields a CONTINUOUS attribute, any other field a DISCRETE attribute. Later
    lines whose field count does not match the definition are skipped. A class label containing
    ";" produces a CompositeInstance whose possible labels are the ";"-separated parts.

    The line terminator is removed before splitting so that it does not end up inside the class
    label, and the file is closed once reading finishes.

    PARAMETERS
    ----------
    fileName : str
        File to generate DataSet from.
    """
    self.__instances = InstanceList()
    self.__definition = DataDefinition()
    with open(fileName, 'r', encoding='utf8') as inputFile:
        for lineNumber, line in enumerate(inputFile):
            attributes = line.rstrip("\r\n").split(",")
            valueTexts = attributes[:-1]
            labelText = attributes[-1]
            if lineNumber == 0:
                # The first line defines the attribute types and is also read as data below.
                for text in valueTexts:
                    try:
                        float(text)
                    except ValueError:
                        self.__definition.addAttribute(AttributeType.DISCRETE)
                    else:
                        self.__definition.addAttribute(AttributeType.CONTINUOUS)
            elif len(attributes) != self.__definition.attributeCount() + 1:
                continue
            if ";" not in labelText:
                instance = Instance(labelText)
            else:
                labels = labelText.split(";")
                instance = CompositeInstance(labels[0], None, labels)
            for j, text in enumerate(valueTexts):
                attributeType = self.__definition.getAttributeType(j)
                if attributeType is AttributeType.CONTINUOUS:
                    instance.addAttribute(ContinuousAttribute(float(text)))
                elif attributeType is AttributeType.DISCRETE:
                    instance.addAttribute(DiscreteAttribute(text))
            if instance.attributeSize() == self.__definition.attributeCount():
                self.__instances.add(instance)
def predict(self, instance: Instance) -> str:
    """
    Predicts the class label of an instance by delegating to the root node. When the root node
    yields no prediction and the instance is a CompositeInstance with at least one possible
    class label, the first possible class label is returned instead.

    PARAMETERS
    ----------
    instance : Instance
        Instance to make prediction.

    RETURNS
    -------
    str
        The predicted class label; None when the root node yields no prediction and no fallback
        label is available.
    """
    predictedClass = self.__root.predict(instance)
    if predictedClass is None and isinstance(instance, CompositeInstance):
        possibleClassLabels = instance.getPossibleClassLabels()
        if possibleClassLabels:
            # Return a single label (not the whole list) to honour the declared str result.
            predictedClass = possibleClassLabels[0]
    return predictedClass
def predict(self, instance: Instance) -> str:
    """
    Returns the entry of the stored distribution which has the maximum value. For a
    CompositeInstance the search is restricted to its possible class labels.

    PARAMETERS
    ----------
    instance : Instance
        Instance to make prediction.

    RETURNS
    -------
    str
        The entry of distribution which has the maximum value.
    """
    if not isinstance(instance, CompositeInstance):
        return self.distribution.getMaxItem()
    allowedLabels = instance.getPossibleClassLabels()
    return self.distribution.getMaxItemIncludeTheseOnly(allowedLabels)
def calculateMetric(self, instance: Instance, Ci: str) -> float:
    """
    Evaluates the linear metric of an instance for class Ci: the dot product of the class
    entry of self.w with the instance vector, plus the class entry of self.w0.

    PARAMETERS
    ----------
    instance : Instance
        Instance input.
    Ci : str
        Class label whose entries of self.w and self.w0 are used.

    RETURNS
    -------
    float
        The dot product of given Instance and wi plus w0i.
    """
    features = instance.toVector()
    weightVector = self.w[Ci]
    bias = self.w0[Ci]
    linearTerm = weightVector.dotProduct(features)
    return linearTerm + bias
def predict(self, instance: Instance) -> str:
    """
    Builds the input vector for the instance, computes the output, and returns the predicted
    class label. For a CompositeInstance the choice is delegated to
    predictWithCompositeInstance with its possible class labels; otherwise the class label at
    the index of the maximum element of self.y is returned.

    PARAMETERS
    ----------
    instance : Instance
        Instance to predict.

    RETURNS
    -------
    str
        The class label which has the maximum y.
    """
    self.createInputVector(instance)
    self.calculateOutput()
    if not isinstance(instance, CompositeInstance):
        bestIndex = self.y.maxIndex()
        return self.classLabels[bestIndex]
    allowedLabels = instance.getPossibleClassLabels()
    return self.predictWithCompositeInstance(allowedLabels)
def generateInstanceFromSentence(self, sentence: Sentence, wordIndex: int) -> Instance:
    """
    Generates a single classification instance of the Shallow Parse problem for the given word
    of the given sentence. The class label is the shallow parse tag of the word.

    NOTE(review): a word without a shallow parse tag is not treated specially here; whatever
    getShallowParse returns is used as the class label.

    PARAMETERS
    ----------
    sentence : Sentence
        Input sentence.
    wordIndex : int
        The index of the word in the sentence.

    RETURNS
    -------
    Instance
        Classification instance; None when the word is not an AnnotatedWord.
    """
    word = sentence.getWord(wordIndex)
    if not isinstance(word, AnnotatedWord):
        # Only annotated words carry a shallow parse tag; state the fallback explicitly
        # instead of relying on an implicit fall-through.
        return None
    current = Instance(word.getShallowParse())
    self.addAttributes(current, sentence, wordIndex)
    return current
def predict(self, instance: Instance) -> str:
    """
    Predicts the class of an instance from its nearest neighbors. The result is
    Model.getMaximum applied to the class labels of the neighbors, except when the instance is a
    CompositeInstance and no neighbors were found, in which case its first possible class label
    is returned.

    PARAMETERS
    ----------
    instance : Instance
        Instance to make prediction.

    RETURNS
    -------
    str
        The predicted class label.
    """
    neighbors = self.nearestNeighbors(instance)
    withoutNeighbors = isinstance(instance, CompositeInstance) and neighbors.size() == 0
    if withoutNeighbors:
        return instance.getPossibleClassLabels()[0]
    return Model.getMaximum(neighbors.getClassLabels())
def predict(self, instance: Instance) -> str:
    """
    Returns a uniformly chosen random class label. For a CompositeInstance the label is drawn
    from its possible class labels; otherwise it is drawn from the stored class labels.

    PARAMETERS
    ----------
    instance : Instance
        Instance to make prediction.

    RETURNS
    -------
    str
        The class label at the randomly selected index.
    """
    if isinstance(instance, CompositeInstance):
        candidates = instance.getPossibleClassLabels()
    else:
        candidates = self.__classLabels
    # randrange excludes its upper bound, so the index is always valid; randint(0, size)
    # could return size itself and index one past the end of the list.
    index = random.randrange(len(candidates))
    return candidates[index]
def calculateMetric(self, instance: Instance, Ci: str) -> float:
    """
    Evaluates the quadratic metric of an instance for class Ci. With xi the instance vector and
    Wi, wi, w0i the entries stored for Ci, the result is
    Wi.multiplyWithVectorFromLeft(xi).dotProduct(xi) + wi.dotProduct(xi) + w0i.

    PARAMETERS
    ----------
    instance : Instance
        Instance input.
    Ci : str
        Class label whose stored entries are used.

    RETURNS
    -------
    float
        The sum of the quadratic term, the linear term and the constant term.
    """
    features = instance.toVector()
    quadraticWeights = self.__W[Ci]
    linearWeights = self.w[Ci]
    bias = self.w0[Ci]
    quadraticTerm = quadraticWeights.multiplyWithVectorFromLeft(features).dotProduct(features)
    linearTerm = linearWeights.dotProduct(features)
    return quadraticTerm + linearTerm + bias
def convertInstance(self, instance: Instance):
    """
    Converts discrete attributes of a single instance to continuous version using 1-of-L
    encoding. For example, if an attribute has values red, green, blue; this attribute will be
    converted to 3 continuous attributes where red will have the value 100, green will have the
    value 010, and blue will have the value 001.

    For every original attribute index with a non-empty attribute distribution, one
    ContinuousAttribute per distribution entry is appended (1 at the index of the attribute's
    string form, 0 elsewhere). The original attributes at those indexes are then removed through
    removeDiscreteAttributesFromInstance.

    PARAMETERS
    ----------
    instance : Instance
        The instance to be converted.
    """
    originalSize = instance.attributeSize()
    for position in range(originalSize):
        distribution = self.attributeDistributions[position]
        valueCount = len(distribution)
        if valueCount == 0:
            continue
        hotIndex = distribution.getIndex(instance.getAttribute(position).__str__())
        for slot in range(valueCount):
            instance.addAttribute(ContinuousAttribute(1 if slot == hotIndex else 0))
    self.removeDiscreteAttributesFromInstance(instance, originalSize)