def __logLikelihoodContinuous(self, classLabel: str, instance: Instance) -> float:
    """
    Scores an Instance against a class label using the per-class means and deviations.

    The score starts at the natural logarithm of the prior probability of the class label. For every attribute
    position whose class deviation is non-zero, -0.5 * ((xi - mi) / si) ** 2 is added, where xi is the attribute
    value of the instance, mi the class mean and si the class deviation at that position. Positions whose
    deviation equals zero contribute nothing to the score.

    PARAMETERS
    ----------
    classLabel : str
        Class label to score the instance against.
    instance : Instance
        Instance to be scored.

    RETURNS
    -------
    float
        The accumulated log likelihood of the given class label and Instance.
    """
    score = math.log(self.priorDistribution.getProbability(classLabel))
    for position in range(instance.attributeSize()):
        value = instance.getAttribute(position).getValue()
        mean = self.__classMeans[classLabel].getValue(position)
        deviation = self.__classDeviations[classLabel].getValue(position)
        if deviation == 0:
            # A zero deviation would make the standardized distance undefined; skip this position.
            continue
        score -= 0.5 * math.pow((value - mean) / deviation, 2)
    return score
def __checkDefinition(self, instance: Instance) -> bool:
    """
    Verifies that the attributes of an Instance agree with the stored data definition.

    Each attribute is matched against the known attribute classes (binary, discrete indexed, discrete,
    continuous). For the first class that matches, the attribute type recorded in the data definition at the same
    position must be the corresponding AttributeType; otherwise the check fails. Attributes matching none of the
    known classes are not checked.

    PARAMETERS
    ----------
    instance : Instance
        Instance whose attribute types are checked.

    RETURNS
    -------
    bool
        True if every recognized attribute matches the data definition, False at the first mismatch.
    """
    # Only the first matching class decides the outcome, so the order of these pairs is significant.
    expectedTypes = (
        (BinaryAttribute, AttributeType.BINARY),
        (DiscreteIndexedAttribute, AttributeType.DISCRETE_INDEXED),
        (DiscreteAttribute, AttributeType.DISCRETE),
        (ContinuousAttribute, AttributeType.CONTINUOUS),
    )
    for position in range(instance.attributeSize()):
        attribute = instance.getAttribute(position)
        for attributeClass, expectedType in expectedTypes:
            if isinstance(attribute, attributeClass):
                if self.__definition.getAttributeType(position) is not expectedType:
                    return False
                break
    return True
def __logLikelihoodDiscrete(self, classLabel: str, instance: Instance) -> float:
    """
    Scores an Instance against a class label using the per-class attribute distributions.

    The score starts at the natural logarithm of the prior probability of the class label. Then, for every
    attribute position of the instance, the logarithm of the Laplace-smoothed probability of the attribute value
    under the class's distribution for that position is added.

    PARAMETERS
    ----------
    classLabel : str
        Class label to score the instance against.
    instance : Instance
        Instance to be scored.

    RETURNS
    -------
    float
        The accumulated log likelihood of the given class label and Instance.
    """
    score = math.log(self.priorDistribution.getProbability(classLabel))
    distributions = self.__classAttributeDistributions.get(classLabel)
    for position in range(instance.attributeSize()):
        value = instance.getAttribute(position).getValue()
        smoothed = distributions[position].getProbabilityLaplaceSmoothing(value)
        score += math.log(smoothed)
    return score
def distance(self, instance1: Instance, instance2: Instance) -> float:
    """
    Computes a dissimilarity score between two instances, attribute by attribute.

    For every attribute position of instance1: when both attributes are discrete, 1 is added if the value of the
    first is not None and differs from the value of the second; otherwise, when both attributes are continuous,
    the squared difference of their values is added. Any other pairing contributes nothing. No square root is
    applied to the accumulated sum.

    PARAMETERS
    ----------
    instance1 : Instance
        First instance; its attribute count drives the loop.
    instance2 : Instance
        Second instance.

    RETURNS
    -------
    float
        The accumulated dissimilarity of the two instances.
    """
    total = 0
    for position in range(instance1.attributeSize()):
        first = instance1.getAttribute(position)
        if isinstance(first, DiscreteAttribute):
            second = instance2.getAttribute(position)
            if isinstance(second, DiscreteAttribute):
                firstValue = first.getValue()
                if firstValue is not None and firstValue != second.getValue():
                    total += 1
                continue
        # Reached when the pair is not discrete/discrete.
        if isinstance(first, ContinuousAttribute):
            second = instance2.getAttribute(position)
            if isinstance(second, ContinuousAttribute):
                total += math.pow(first.getValue() - second.getValue(), 2)
    return total
def convertInstance(self, instance: Instance):
    """
    Normalizes the continuous attributes of a single instance in place. For all continuous i,
    new x_i = (x_i - m_i) / s_i, where m_i and s_i are read from the stored average and standard deviation
    instances. When s_i is zero the value is only centered (new x_i = x_i - m_i), so that a zero deviation
    does not raise ZeroDivisionError. Non-continuous attributes are left untouched.

    PARAMETERS
    ----------
    instance : Instance
        Instance whose attributes will be normalized.
    """
    for i in range(instance.attributeSize()):
        attribute = instance.getAttribute(i)
        if not isinstance(attribute, ContinuousAttribute):
            continue
        mean = self.__averageInstance.getAttribute(i).getValue()
        deviation = self.__standardDeviationInstance.getAttribute(i).getValue()
        centered = attribute.getValue() - mean
        if deviation != 0:
            attribute.setValue(centered / deviation)
        else:
            # Unit variance is unreachable with a zero deviation; use a scale factor of 1.
            attribute.setValue(centered)
def discreteCheck(self, instance: Instance) -> bool:
    """
    Checks that the given instance contains no discrete attribute that is not a discrete indexed attribute.

    PARAMETERS
    ----------
    instance : Instance
        Instance to check.

    RETURNS
    -------
    bool
        False if any attribute is a DiscreteAttribute without being a DiscreteIndexedAttribute, True otherwise
        (including when the instance has no attributes).
    """
    return all(
        not isinstance(instance.getAttribute(position), DiscreteAttribute)
        or isinstance(instance.getAttribute(position), DiscreteIndexedAttribute)
        for position in range(instance.attributeSize())
    )
def convertInstance(self, instance: Instance):
    """
    Converts the discrete attributes of a single instance to their indexed versions.

    For every original attribute position whose stored attribute distribution is non-empty, a
    DiscreteIndexedAttribute built from the attribute's string form, its index in the distribution and the size
    of the distribution is appended to the instance. Afterwards the original discrete attributes are removed via
    removeDiscreteAttributesFromInstance.

    PARAMETERS
    ----------
    instance : Instance
        The instance to be converted.
    """
    originalSize = instance.attributeSize()
    for position in range(originalSize):
        distribution = self.attributeDistributions[position]
        if len(distribution) == 0:
            # An empty distribution means there is nothing to index at this position.
            continue
        text = instance.getAttribute(position).__str__()
        index = distribution.getIndex(text)
        instance.addAttribute(DiscreteIndexedAttribute(text, index, len(distribution)))
    self.removeDiscreteAttributesFromInstance(instance, originalSize)
def initWithFile(self, fileName: str):
    """
    Constructor for generating a new DataSet from given File.

    The file is read as comma separated text in utf8. In every line the last field carries the class
    information and the preceding fields are attribute values. The first line determines the data definition:
    a field that parses as a float yields a CONTINUOUS attribute, any other field a DISCRETE attribute. Later
    lines whose field count does not match the definition are skipped. A class field containing ";" produces a
    CompositeInstance from the ";"-separated labels, otherwise a plain Instance. The line terminator is removed
    before splitting so that class labels do not end with a newline character.

    PARAMETERS
    ----------
    fileName : str
        File to generate DataSet from.
    """
    self.__instances = InstanceList()
    self.__definition = DataDefinition()
    # The context manager closes the file even when parsing a line raises.
    with open(fileName, 'r', encoding='utf8') as inputFile:
        for lineNumber, line in enumerate(inputFile):
            attributes = line.rstrip("\n").split(",")
            valueCount = len(attributes) - 1
            if lineNumber == 0:
                # Infer the attribute types from the first line.
                for j in range(valueCount):
                    try:
                        float(attributes[j])
                    except ValueError:
                        self.__definition.addAttribute(AttributeType.DISCRETE)
                    else:
                        self.__definition.addAttribute(AttributeType.CONTINUOUS)
            elif len(attributes) != self.__definition.attributeCount() + 1:
                # Malformed line: wrong number of fields.
                continue
            classInfo = attributes[valueCount]
            if ";" not in classInfo:
                instance = Instance(classInfo)
            else:
                labels = classInfo.split(";")
                instance = CompositeInstance(labels[0], None, labels)
            for j in range(valueCount):
                attributeType = self.__definition.getAttributeType(j)
                if attributeType is AttributeType.CONTINUOUS:
                    instance.addAttribute(ContinuousAttribute(float(attributes[j])))
                elif attributeType is AttributeType.DISCRETE:
                    instance.addAttribute(DiscreteAttribute(attributes[j]))
            # Only keep instances that received one attribute per defined attribute.
            if instance.attributeSize() == self.__definition.attributeCount():
                self.__instances.add(instance)
def __setDefinition(self, instance: Instance):
    """
    Builds the data definition from the attribute classes of the given Instance.

    Each attribute is matched against the known attribute classes (binary, discrete indexed, discrete,
    continuous) and the AttributeType of the first matching class is appended to the list of attribute types.
    Attributes matching none of the known classes add nothing. The resulting list is stored as a new
    DataDefinition.

    PARAMETERS
    ----------
    instance : Instance
        Instance whose attributes determine the definition.
    """
    # Only the first matching class is used, so the order of these pairs is significant.
    typeByClass = (
        (BinaryAttribute, AttributeType.BINARY),
        (DiscreteIndexedAttribute, AttributeType.DISCRETE_INDEXED),
        (DiscreteAttribute, AttributeType.DISCRETE),
        (ContinuousAttribute, AttributeType.CONTINUOUS),
    )
    collectedTypes = []
    for position in range(instance.attributeSize()):
        attribute = instance.getAttribute(position)
        for attributeClass, attributeType in typeByClass:
            if isinstance(attribute, attributeClass):
                collectedTypes.append(attributeType)
                break
    self.__definition = DataDefinition(collectedTypes)
def convertInstance(self, instance: Instance):
    """
    Converts the discrete attributes of a single instance to continuous ones using 1-of-L encoding.

    For every original attribute position whose stored attribute distribution is non-empty, one continuous
    attribute per entry of the distribution is appended: the entry at the index of the attribute's value gets 1,
    every other entry gets 0. For example, an attribute with values red, green, blue becomes 3 continuous
    attributes, where red is encoded as 100, green as 010 and blue as 001. Afterwards the original discrete
    attributes are removed via removeDiscreteAttributesFromInstance.

    PARAMETERS
    ----------
    instance : Instance
        The instance to be converted.
    """
    originalSize = instance.attributeSize()
    for position in range(originalSize):
        distribution = self.attributeDistributions[position]
        if len(distribution) == 0:
            # An empty distribution means there is nothing to encode at this position.
            continue
        hotIndex = distribution.getIndex(instance.getAttribute(position).__str__())
        for slot in range(len(distribution)):
            instance.addAttribute(ContinuousAttribute(1 if slot == hotIndex else 0))
    self.removeDiscreteAttributesFromInstance(instance, originalSize)