def train(self, trainSet: InstanceList, parameters: Parameter):
    """
    Training algorithm for the linear discriminant analysis classifier
    (Introduction to Machine Learning, Alpaydin, 2015).

    PARAMETERS
    ----------
    trainSet : InstanceList
        Training data given to the algorithm.
    parameters : Parameter
        Parameter of the Lda algorithm.
    """
    w0 = {}
    w = {}
    priorDistribution = trainSet.classDistribution()
    classLists = Partition(trainSet)
    covariance = Matrix(trainSet.get(0).continuousAttributeSize(),
                        trainSet.get(0).continuousAttributeSize())
    for i in range(classLists.size()):
        averageVector = Vector(classLists.get(i).continuousAverage())
        classCovariance = classLists.get(i).covariance(averageVector)
        classCovariance.multiplyWithConstant(classLists.get(i).size() - 1)
        covariance.add(classCovariance)
    covariance.divideByConstant(trainSet.size() - classLists.size())
    covariance.inverse()
    for i in range(classLists.size()):
        Ci = classLists.get(i).getClassLabel()
        averageVector = Vector(classLists.get(i).continuousAverage())
        wi = covariance.multiplyWithVectorFromRight(averageVector)
        w[Ci] = wi
        w0i = -0.5 * wi.dotProduct(averageVector) + math.log(priorDistribution.getProbability(Ci))
        w0[Ci] = w0i
    self.model = LdaModel(priorDistribution, w, w0)
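# Usage sketch, not from the library source: trains the Lda classifier that
# owns the train() method above and scores it on a held-out set. The import
# paths and the Parameter(seed) constructor signature are assumptions based on
# the Starlang Classification package; train() and test() are the methods
# shown in this section, and trainSet/testSet are assumed to be InstanceLists
# prepared elsewhere.
from Classification.Classifier.Lda import Lda
from Classification.Parameter.Parameter import Parameter

lda = Lda()
lda.train(trainSet, Parameter(1))       # seed = 1 is an assumed parameter
print(lda.test(testSet).getAccuracy())  # test() returns a Performance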
def runExperiment(self, classifier: Classifier, parameter: Parameter,
                  experimentPerformance: ExperimentPerformance, crossValidation: CrossValidation):
    """
    Runs a K-fold cross-validated experiment: trains the classifier on each train fold and accumulates the
    test-fold performance in the given ExperimentPerformance.
    """
    for i in range(self.K):
        trainSet = InstanceList(crossValidation.getTrainFold(i))
        testSet = InstanceList(crossValidation.getTestFold(i))
        classifier.train(trainSet, parameter)
        experimentPerformance.add(classifier.test(testSet))
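# Usage sketch, not from the library source: drives runExperiment above. The
# KFoldCrossValidation(instanceList, K, seed) signature mirrors the
# StratifiedKFoldCrossValidation call shown later in this section; the import
# paths are assumptions based on the Starlang Sampling/Classification packages.
from Sampling.KFoldCrossValidation import KFoldCrossValidation
from Classification.Performance.ExperimentPerformance import ExperimentPerformance

def executeKFold(run, classifier, parameter, dataSet, K):
    # Build K folds over the raw instances, then accumulate one test
    # Performance per fold into a single ExperimentPerformance.
    crossValidation = KFoldCrossValidation(dataSet.getInstances(), K, parameter.getSeed())
    result = ExperimentPerformance()
    run.runExperiment(classifier, parameter, result, crossValidation)
    return result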
def nearestNeighbors(self, instance: Instance) -> InstanceList:
    """
    The nearestNeighbors method takes an Instance as an input. First it gets the possible class labels; then
    it loops through the data InstanceList, pairing each stored instance with its distance to the given
    instance as a KnnInstance. After sorting this list by distance, it returns the first k instances as an
    InstanceList.

    PARAMETERS
    ----------
    instance : Instance
        Instance to find the nearest neighbors of.

    RETURNS
    -------
    InstanceList
        The k instances nearest to the given instance as an InstanceList.
    """
    result = InstanceList()
    instances = []
    possibleClassLabels = []
    if isinstance(instance, CompositeInstance):
        possibleClassLabels = instance.getPossibleClassLabels()
    for i in range(self.__data.size()):
        if not isinstance(instance, CompositeInstance) or \
                self.__data.get(i).getClassLabel() in possibleClassLabels:
            instances.append(KnnInstance(self.__data.get(i),
                                         self.__distanceMetric.distance(self.__data.get(i), instance)))
    instances.sort(key=cmp_to_key(self.makeComparator()))
    for i in range(min(self.__k, len(instances))):
        result.add(instances[i].instance)
    return result
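# Usage sketch, not from the library source: trains the k-NN classifier that
# owns nearestNeighbors above. The KnnParameter(seed, k, distanceMetric)
# signature and the import paths are assumptions based on the Starlang
# Classification package.
from Classification.Classifier.Knn import Knn
from Classification.Parameter.KnnParameter import KnnParameter
from Classification.DistanceMetric.EuclidianDistance import EuclidianDistance

knn = Knn()
knn.train(trainSet, KnnParameter(1, 3, EuclidianDistance()))  # k = 3 neighbors
print(knn.test(testSet).getAccuracy())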
def train(self, trainSet: InstanceList, parameters: KMeansParameter):
    """
    Training algorithm for the K-Means classifier. The mean of each class partition is computed and stored
    as that class's representative.

    PARAMETERS
    ----------
    trainSet : InstanceList
        Training data given to the algorithm.
    parameters : KMeansParameter
        Parameter of the KMeans algorithm; getDistanceMetric() supplies the distance metric stored in the
        model.
    """
    priorDistribution = trainSet.classDistribution()
    classMeans = InstanceList()
    classLists = Partition(trainSet)
    for i in range(classLists.size()):
        classMeans.add(classLists.get(i).average())
    self.model = KMeansModel(priorDistribution, classMeans, parameters.getDistanceMetric())
def __init__(self, trainSet: InstanceList):
    """
    Constructor that sets the class labels, their sizes as K and the size of the continuous attributes as d.

    PARAMETERS
    ----------
    trainSet : InstanceList
        InstanceList to use as train set.
    """
    self.classLabels = trainSet.getDistinctClassLabels()
    self.K = len(self.classLabels)
    self.d = trainSet.get(0).continuousAttributeSize()
def __init__(self, trainSet: InstanceList, validationSet: InstanceList,
             parameters: MultiLayerPerceptronParameter):
    """
    A constructor that takes InstanceLists as trainSet and validationSet. It sets the NeuralNetworkModel
    nodes with the given InstanceList, then iterates over the trainSet, building an input vector for each
    instance and computing the error. Using the validationSet it measures classification performance, and
    finally reassigns the weight Matrix W and the Matrix V with the pair that achieved the best validation
    accuracy.

    PARAMETERS
    ----------
    trainSet : InstanceList
        InstanceList that is used to train.
    validationSet : InstanceList
        InstanceList that is used to validate.
    parameters : MultiLayerPerceptronParameter
        Multi layer perceptron parameters; seed, learningRate, etaDecrease, crossValidationRatio, epoch,
        hiddenNodes.
    """
    super().initWithTrainSet(trainSet)
    self.__allocateWeights(parameters.getHiddenNodes(), parameters.getSeed())
    bestW = copy.deepcopy(self.W)
    bestV = copy.deepcopy(self.__V)
    bestClassificationPerformance = ClassificationPerformance(0.0)
    epoch = parameters.getEpoch()
    learningRate = parameters.getLearningRate()
    for i in range(epoch):
        trainSet.shuffle(parameters.getSeed())
        for j in range(trainSet.size()):
            self.createInputVector(trainSet.get(j))
            hidden = self.calculateHidden(self.x, self.W)
            hiddenBiased = hidden.biased()
            rMinusY = self.calculateRMinusY(trainSet.get(j), hiddenBiased, self.__V)
            deltaV = Matrix(rMinusY, hiddenBiased)
            oneMinusHidden = self.calculateOneMinusHidden(hidden)
            tmph = self.__V.multiplyWithVectorFromLeft(rMinusY)
            tmph.remove(0)
            tmpHidden = oneMinusHidden.elementProduct(hidden.elementProduct(tmph))
            deltaW = Matrix(tmpHidden, self.x)
            deltaV.multiplyWithConstant(learningRate)
            self.__V.add(deltaV)
            deltaW.multiplyWithConstant(learningRate)
            self.W.add(deltaW)
        currentClassificationPerformance = self.testClassifier(validationSet)
        if currentClassificationPerformance.getAccuracy() > bestClassificationPerformance.getAccuracy():
            bestClassificationPerformance = currentClassificationPerformance
            bestW = copy.deepcopy(self.W)
            bestV = copy.deepcopy(self.__V)
        learningRate *= parameters.getEtaDecrease()
    self.W = bestW
    self.__V = bestV
def train(self, trainSet: InstanceList, parameters: Parameter):
    """
    Training algorithm for the quadratic discriminant analysis classifier
    (Introduction to Machine Learning, Alpaydin, 2015).

    PARAMETERS
    ----------
    trainSet : InstanceList
        Training data given to the algorithm.
    parameters : Parameter
        Parameter of the Qda algorithm.
    """
    w0 = {}
    w = {}
    W = {}
    classLists = Partition(trainSet)
    priorDistribution = trainSet.classDistribution()
    for i in range(classLists.size()):
        Ci = classLists.get(i).getClassLabel()
        averageVector = Vector(classLists.get(i).continuousAverage())
        classCovariance = classLists.get(i).covariance(averageVector)
        determinant = classCovariance.determinant()
        classCovariance.inverse()
        Wi = deepcopy(classCovariance)
        Wi.multiplyWithConstant(-0.5)
        W[Ci] = Wi
        wi = classCovariance.multiplyWithVectorFromLeft(averageVector)
        w[Ci] = wi
        w0i = -0.5 * (wi.dotProduct(averageVector) + math.log(determinant)) + \
            math.log(priorDistribution.getProbability(Ci))
        w0[Ci] = w0i
    self.model = QdaModel(priorDistribution, W, w, w0)
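# Illustrative sketch, not the library's QdaModel: shows how the trained W, w,
# and w0 dictionaries define the quadratic discriminant
# g_i(x) = x^T W_i x + w_i^T x + w0_i, with the predicted label being the
# argmax over classes. The Vector/Matrix method names follow the calls used in
# the train() method above.
def quadraticDiscriminant(x, W, w, w0, Ci):
    # W[Ci] is -0.5 times the inverse class covariance; w[Ci] folds in the
    # class mean; w0[Ci] already includes the log determinant and log prior.
    return W[Ci].multiplyWithVectorFromLeft(x).dotProduct(x) + w[Ci].dotProduct(x) + w0[Ci]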
def __init__(self, trainSet: InstanceList):
    """
    Constructor which sets the distribution using the given InstanceList.

    PARAMETERS
    ----------
    trainSet : InstanceList
        InstanceList which is used to get the class distribution.
    """
    self.distribution = trainSet.classDistribution()
def train(self, trainSet: InstanceList, parameters: Parameter):
    """
    Training algorithm for the random classifier.

    PARAMETERS
    ----------
    trainSet : InstanceList
        Training data given to the algorithm.
    parameters : Parameter
        Parameter of the random classifier; its seed initializes the random number generator.
    """
    self.model = RandomModel(list(trainSet.classDistribution().keys()), parameters.getSeed())
def test(self, testSet: InstanceList) -> Performance:
    """
    Tests an instance list with the current model.

    PARAMETERS
    ----------
    testSet : InstanceList
        Test data (list of instances) to be tested.

    RETURNS
    -------
    Performance
        The accuracy (and error) of the model as an instance of the Performance class.
    """
    classLabels = testSet.getUnionOfPossibleClassLabels()
    confusion = ConfusionMatrix(classLabels)
    for i in range(testSet.size()):
        instance = testSet.get(i)
        confusion.classify(instance.getClassLabel(), self.model.predict(instance))
    return DetailedClassificationPerformance(confusion)
def __init__(self, trainSet: InstanceList, validationSet: InstanceList,
             parameters: LinearPerceptronParameter):
    """
    Constructor that takes InstanceLists as trainSet and validationSet. Initially it allocates the layer
    weights, then iterates over the trainSet, building an input vector for each instance and computing the
    error. Using the validationSet it measures classification performance, and at the end it reassigns the
    weight Matrix with the matrix that achieved the best validation accuracy.

    PARAMETERS
    ----------
    trainSet : InstanceList
        InstanceList that is used to train.
    validationSet : InstanceList
        InstanceList that is used to validate.
    parameters : LinearPerceptronParameter
        Linear perceptron parameters; learningRate, etaDecrease, crossValidationRatio, epoch.
    """
    super().__init__(trainSet)
    self.W = self.allocateLayerWeights(self.K, self.d + 1, parameters.getSeed())
    bestW = copy.deepcopy(self.W)
    bestClassificationPerformance = ClassificationPerformance(0.0)
    epoch = parameters.getEpoch()
    learningRate = parameters.getLearningRate()
    for i in range(epoch):
        trainSet.shuffle(parameters.getSeed())
        for j in range(trainSet.size()):
            self.createInputVector(trainSet.get(j))
            rMinusY = self.calculateRMinusY(trainSet.get(j), self.x, self.W)
            deltaW = Matrix(rMinusY, self.x)
            deltaW.multiplyWithConstant(learningRate)
            self.W.add(deltaW)
        currentClassificationPerformance = self.testClassifier(validationSet)
        if currentClassificationPerformance.getAccuracy() > bestClassificationPerformance.getAccuracy():
            bestClassificationPerformance = currentClassificationPerformance
            bestW = copy.deepcopy(self.W)
        learningRate *= parameters.getEtaDecrease()
    self.W = bestW
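# Usage sketch, not from the library source: trains a linear perceptron with a
# validation split. The LinearPerceptronParameter(seed, learningRate,
# etaDecrease, crossValidationRatio, epoch) signature and the import paths are
# assumptions based on the Starlang Classification package.
from Classification.Classifier.LinearPerceptron import LinearPerceptron
from Classification.Parameter.LinearPerceptronParameter import LinearPerceptronParameter

linearPerceptron = LinearPerceptron()
linearPerceptron.train(trainSet, LinearPerceptronParameter(1, 0.1, 0.99, 0.2, 100))
print(linearPerceptron.test(testSet).getAccuracy())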
def train(self, trainSet: InstanceList, parameters: RandomForestParameter):
    """
    Training algorithm for the random forest classifier. Basically the algorithm creates K distinct decision
    trees from K bootstrap samples of the original training set.

    PARAMETERS
    ----------
    trainSet : InstanceList
        Training data given to the algorithm.
    parameters : RandomForestParameter
        Parameters of the random forest algorithm. ensembleSize returns the number of trees in the forest.
    """
    forestSize = parameters.getEnsembleSize()
    forest = []
    for i in range(forestSize):
        bootstrap = trainSet.bootstrap(i)
        tree = DecisionTree(DecisionNode(InstanceList(bootstrap.getSample()), None, parameters, False))
        forest.append(tree)
    self.model = TreeEnsembleModel(forest)
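# Usage sketch, not from the library source: the RandomForestParameter(seed,
# ensembleSize, attributeSubsetSize) signature and the import paths are
# assumptions based on the Starlang Classification package.
from Classification.Classifier.RandomForest import RandomForest
from Classification.Parameter.RandomForestParameter import RandomForestParameter

randomForest = RandomForest()
randomForest.train(trainSet, RandomForestParameter(1, 100, 35))  # 100 trees
print(randomForest.test(testSet).getAccuracy())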
def testClassifier(self, data: InstanceList) -> ClassificationPerformance:
    """
    The testClassifier method takes an InstanceList as an input and returns an accuracy value as a
    ClassificationPerformance.

    PARAMETERS
    ----------
    data : InstanceList
        InstanceList to test.

    RETURNS
    -------
    ClassificationPerformance
        Accuracy value as ClassificationPerformance.
    """
    total = data.size()
    count = 0
    for i in range(data.size()):
        if data.get(i).getClassLabel() == self.predict(data.get(i)):
            count = count + 1
    return ClassificationPerformance(count / total)
def execute(self, experiment: Experiment) -> Performance:
    """
    Execute a stratified single run of K-fold cross-validation with the given classifier on the given data
    set using the given parameters: only the first fold is trained and tested.

    PARAMETERS
    ----------
    experiment : Experiment
        Experiment to be run.

    RETURNS
    -------
    Performance
        A Performance instance.
    """
    crossValidation = StratifiedKFoldCrossValidation(experiment.getDataSet().getClassInstances(),
                                                     self.__K, experiment.getParameter().getSeed())
    trainSet = InstanceList(crossValidation.getTrainFold(0))
    testSet = InstanceList(crossValidation.getTestFold(0))
    return experiment.getClassifier().singleRun(experiment.getParameter(), trainSet, testSet)
def train(self, trainSet: InstanceList, parameters: BaggingParameter):
    """
    Bagging is a bootstrap ensemble method that creates individuals for its ensemble by training each
    classifier on a random redistribution of the training set. This training method is for a bagged decision
    tree classifier: 20 percent of the instances are left aside for pruning the trees, and 80 percent are
    used for training them. The number of trees (forestSize) is a parameter; the method learns an ensemble
    of trees as its model.

    PARAMETERS
    ----------
    trainSet : InstanceList
        Training data given to the algorithm.
    parameters : BaggingParameter
        Parameters of the bagging trees algorithm. ensembleSize returns the number of trees in the bagged
        forest.
    """
    forestSize = parameters.getEnsembleSize()
    forest = []
    for i in range(forestSize):
        bootstrap = trainSet.bootstrap(i)
        tree = DecisionTree(DecisionNode(InstanceList(bootstrap.getSample())))
        forest.append(tree)
    self.model = TreeEnsembleModel(forest)
def testAutoEncoder(self, data: InstanceList) -> Performance:
    """
    The testAutoEncoder method takes an InstanceList as an input and, for each item, predicts an output and
    measures its difference from the actual value. At the end, it returns the error rate computed as the
    mean of the per-instance squared errors.

    PARAMETERS
    ----------
    data : InstanceList
        InstanceList to use as validation set.

    RETURNS
    -------
    Performance
        Error rate computed as the mean of the total errors.
    """
    total = data.size()
    error = 0.0
    for i in range(total):
        self.y = self.__predictInput(data.get(i))
        self.r = data.get(i).toVector()
        error += self.r.difference(self.y).dotProductWithSelf()
    return Performance(error / total)
def __init__(self, trainSet: InstanceList, validationSet: InstanceList,
             parameters: MultiLayerPerceptronParameter):
    """
    The AutoEncoderModel method takes two InstanceLists as inputs: a train set and a validation set. First
    it allocates the weight matrices W and V using the given MultiLayerPerceptronParameter and clones them
    as bestW and bestV. Then it gets the epoch count and starts to iterate. In each epoch it shuffles the
    train set and updates the W and V matrices. At the end of the epoch it tests the autoencoder on the
    given validation set, and if the performance is better than the previous best, it reassigns bestW and
    bestV. Iteration continues with a lower learning rate until the end of the epochs.

    PARAMETERS
    ----------
    trainSet : InstanceList
        InstanceList to use as train set.
    validationSet : InstanceList
        InstanceList to use as validation set.
    parameters : MultiLayerPerceptronParameter
        MultiLayerPerceptronParameter is used to get the parameters.
    """
    super().__init__(trainSet)
    self.K = trainSet.get(0).continuousAttributeSize()
    self.__allocateWeights(parameters.getHiddenNodes(), parameters.getSeed())
    bestW = copy.deepcopy(self.__W)
    bestV = copy.deepcopy(self.__V)
    bestPerformance = Performance(1000000000)
    epoch = parameters.getEpoch()
    learningRate = parameters.getLearningRate()
    for i in range(epoch):
        trainSet.shuffle(parameters.getSeed())
        for j in range(trainSet.size()):
            self.createInputVector(trainSet.get(j))
            self.r = trainSet.get(j).toVector()
            hidden = self.calculateHidden(self.x, self.__W)
            hiddenBiased = hidden.biased()
            self.y = self.__V.multiplyWithVectorFromRight(hiddenBiased)
            rMinusY = self.r.difference(self.y)
            deltaV = Matrix(rMinusY, hiddenBiased)
            oneMinusHidden = self.calculateOneMinusHidden(hidden)
            tmph = self.__V.multiplyWithVectorFromLeft(rMinusY)
            tmph.remove(0)
            tmpHidden = oneMinusHidden.elementProduct(hidden.elementProduct(tmph))
            deltaW = Matrix(tmpHidden, self.x)
            deltaV.multiplyWithConstant(learningRate)
            self.__V.add(deltaV)
            deltaW.multiplyWithConstant(learningRate)
            self.__W.add(deltaW)
        currentPerformance = self.testAutoEncoder(validationSet)
        if currentPerformance.getErrorRate() < bestPerformance.getErrorRate():
            bestPerformance = currentPerformance
            bestW = copy.deepcopy(self.__W)
            bestV = copy.deepcopy(self.__V)
        # Decay the learning rate each epoch, as the docstring describes.
        learningRate *= parameters.getEtaDecrease()
    self.__W = bestW
    self.__V = bestV
def execute(self, experiment: Experiment) -> ExperimentPerformance:
    """
    Execute the bootstrap run with the given classifier on the given data set using the given parameters.

    PARAMETERS
    ----------
    experiment : Experiment
        Experiment to be run.

    RETURNS
    -------
    ExperimentPerformance
        An ExperimentPerformance instance.
    """
    result = ExperimentPerformance()
    for i in range(self.__numberOfBootstraps):
        bootstrap = Bootstrap(experiment.getDataSet().getInstances(),
                              i + experiment.getParameter().getSeed())
        bootstrapSample = InstanceList(bootstrap.getSample())
        experiment.getClassifier().train(bootstrapSample, experiment.getParameter())
        result.add(experiment.getClassifier().test(experiment.getDataSet().getInstanceList()))
    return result
def __init__(self, trainSet: InstanceList, validationSet: InstanceList,
             parameters: DeepNetworkParameter):
    """
    Constructor that takes two InstanceLists, a train set and a validation set, and a DeepNetworkParameter
    as inputs. First it sets the class labels, their count as K, and the number of continuous attributes as
    d of the given train set, then allocates the weights and stores them as the best weights. At each epoch,
    it shuffles the train set and loops through its items; for each item it multiplies the weight Matrix
    with the input Vector, applies the sigmoid function, and stores the result, with a bias added, as the
    hidden layer. It then updates the weights and, at the end of the epoch, compares the performance of the
    current weights on the validation set, updating bestClassificationPerformance and bestWeights
    accordingly. Finally it decays the learning rate via the etaDecrease value and finishes by reinstating
    the best weights.

    PARAMETERS
    ----------
    trainSet : InstanceList
        InstanceList to be used as trainSet.
    validationSet : InstanceList
        InstanceList to be used as validationSet.
    parameters : DeepNetworkParameter
        DeepNetworkParameter input.
    """
    super().__init__(trainSet)
    deltaWeights = []
    hidden = []
    hiddenBiased = []
    self.__allocateWeights(parameters)
    bestWeights = self.__setBestWeights()
    bestClassificationPerformance = ClassificationPerformance(0.0)
    epoch = parameters.getEpoch()
    learningRate = parameters.getLearningRate()
    for i in range(epoch):
        trainSet.shuffle(parameters.getSeed())
        for j in range(trainSet.size()):
            self.createInputVector(trainSet.get(j))
            hidden.clear()
            hiddenBiased.clear()
            deltaWeights.clear()
            for k in range(self.__hiddenLayerSize):
                if k == 0:
                    hidden.append(self.calculateHidden(self.x, self.__weights[k]))
                else:
                    hidden.append(self.calculateHidden(hiddenBiased[k - 1], self.__weights[k]))
                hiddenBiased.append(hidden[k].biased())
            rMinusY = self.calculateRMinusY(trainSet.get(j), hiddenBiased[self.__hiddenLayerSize - 1],
                                            self.__weights[len(self.__weights) - 1])
            deltaWeights.insert(0, Matrix(rMinusY, hiddenBiased[self.__hiddenLayerSize - 1]))
            for k in range(len(self.__weights) - 2, -1, -1):
                oneMinusHidden = self.calculateOneMinusHidden(hidden[k])
                tmph = deltaWeights[0].elementProduct(self.__weights[k + 1]).sumOfRows()
                tmph.remove(0)
                tmpHidden = oneMinusHidden.elementProduct(tmph)
                if k == 0:
                    deltaWeights.insert(0, Matrix(tmpHidden, self.x))
                else:
                    deltaWeights.insert(0, Matrix(tmpHidden, hiddenBiased[k - 1]))
            for k in range(len(self.__weights)):
                deltaWeights[k].multiplyWithConstant(learningRate)
                self.__weights[k].add(deltaWeights[k])
        currentClassificationPerformance = self.testClassifier(validationSet)
        if currentClassificationPerformance.getAccuracy() > bestClassificationPerformance.getAccuracy():
            bestClassificationPerformance = currentClassificationPerformance
            bestWeights = self.__setBestWeights()
        learningRate *= parameters.getEtaDecrease()
    self.__weights.clear()
    for m in bestWeights:
        self.__weights.append(m)
def __init__(self, instanceList: InstanceList = None, ratio=None, seed=None, stratified: bool = None):
    """
    Overloaded constructor. With no ratio, it divides the instances in the instance list into partitions so
    that all instances of a class are grouped in a single partition. With a float ratio, it divides the
    instances into two groups, optionally stratified by class. With an int ratio, that value is treated as
    an attribute index and the instances are partitioned by the values of that attribute.

    PARAMETERS
    ----------
    ratio
        Ratio of the stratified partition, between 0 and 1. If the ratio is 0.2, then 20 percent of the
        instances are put in the first group and 80 percent in the second group.
    seed
        Seed used for shuffling. In the attribute-index overloads it is reinterpreted as the attribute value
        (int) or the split value (float).
    """
    self.__multilist = []
    if instanceList is not None:
        if ratio is None:
            classLabels = instanceList.getDistinctClassLabels()
            for classLabel in classLabels:
                self.add(InstanceListOfSameClass(classLabel))
            for instance in instanceList.getInstances():
                self.get(classLabels.index(instance.getClassLabel())).add(instance)
        else:
            if isinstance(ratio, float):
                self.add(InstanceList())
                self.add(InstanceList())
                if stratified:
                    distribution = instanceList.classDistribution()
                    counts = [0] * len(distribution)
                    randomArray = [i for i in range(instanceList.size())]
                    random.seed(seed)
                    random.shuffle(randomArray)
                    for i in range(instanceList.size()):
                        instance = instanceList.get(randomArray[i])
                        classIndex = distribution.getIndex(instance.getClassLabel())
                        if counts[classIndex] < instanceList.size() * ratio * \
                                distribution.getProbability(instance.getClassLabel()):
                            self.get(0).add(instance)
                        else:
                            self.get(1).add(instance)
                        counts[classIndex] = counts[classIndex] + 1
                else:
                    instanceList.shuffle(seed)
                    # Iterate over the whole instance list, not over the two partitions created above.
                    for i in range(instanceList.size()):
                        instance = instanceList.get(i)
                        if i < instanceList.size() * ratio:
                            self.get(0).add(instance)
                        else:
                            self.get(1).add(instance)
            elif isinstance(ratio, int):
                attributeIndex = ratio
                if seed is None:
                    valueList = instanceList.getAttributeValueList(attributeIndex)
                    for _ in valueList:
                        self.add(InstanceList())
                    for instance in instanceList.getInstances():
                        self.get(valueList.index(instance.getAttribute(attributeIndex).getValue())).add(instance)
                elif isinstance(seed, int):
                    attributeValue = seed
                    self.add(InstanceList())
                    self.add(InstanceList())
                    for instance in instanceList.getInstances():
                        if instance.getAttribute(attributeIndex).getIndex() == attributeValue:
                            self.get(0).add(instance)
                        else:
                            self.get(1).add(instance)
                elif isinstance(seed, float):
                    splitValue = seed
                    self.add(InstanceList())
                    self.add(InstanceList())
                    for instance in instanceList.getInstances():
                        if instance.getAttribute(attributeIndex).getValue() < splitValue:
                            self.get(0).add(instance)
                        else:
                            self.get(1).add(instance)
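# Usage sketch, not from the library source: an 80/20 stratified split using
# the float-ratio overload above. Partition's (instanceList, ratio, seed,
# stratified) signature is taken from this section; the import path is an
# assumption based on the Starlang Classification package.
from Classification.InstanceList.Partition import Partition

partition = Partition(dataSet.getInstanceList(), 0.2, 1, True)
testSet = partition.get(0)   # first group receives the ratio (20 percent)
trainSet = partition.get(1)  # second group receives the remaining 80 percent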
def runExperiment(self, classifier: Classifier, parameter: Parameter,
                  crossValidation: CrossValidation):
    """
    Runs a single train/test experiment on the first fold of the given cross-validation.
    """
    trainSet = InstanceList(crossValidation.getTrainFold(0))
    testSet = InstanceList(crossValidation.getTestFold(0))
    return classifier.singleRun(parameter, trainSet, testSet)
class DataSet(object):

    __instances: InstanceList
    __definition: DataDefinition

    def __init__(self, definition: DataDefinition = None, separator: str = None, fileName: str = None):
        """
        Constructor for generating a new DataSet with given DataDefinition.

        PARAMETERS
        ----------
        definition : DataDefinition
            Data definition of the data set.
        separator : str
            Separator character which separates the attribute values in the data file.
        fileName : str
            Name of the data set file.
        """
        self.__definition = definition
        if separator is None:
            self.__instances = InstanceList()
        else:
            self.__instances = InstanceList(definition, separator, fileName)

    def initWithFile(self, fileName: str):
        """
        Constructor for generating a new DataSet from a given file.

        PARAMETERS
        ----------
        fileName : str
            File to generate DataSet from.
        """
        self.__instances = InstanceList()
        self.__definition = DataDefinition()
        # Use a context manager so the file is closed after reading.
        with open(fileName, 'r', encoding='utf8') as inputFile:
            lines = inputFile.readlines()
        i = 0
        for line in lines:
            # Strip the trailing newline so class labels do not carry it.
            attributes = line.strip().split(",")
            if i == 0:
                # Infer the attribute types from the first line: values parseable
                # as float are CONTINUOUS, everything else is DISCRETE.
                for j in range(len(attributes) - 1):
                    try:
                        float(attributes[j])
                        self.__definition.addAttribute(AttributeType.CONTINUOUS)
                    except ValueError:
                        self.__definition.addAttribute(AttributeType.DISCRETE)
            else:
                if len(attributes) != self.__definition.attributeCount() + 1:
                    continue
            if ";" not in attributes[len(attributes) - 1]:
                instance = Instance(attributes[len(attributes) - 1])
            else:
                labels = attributes[len(attributes) - 1].split(";")
                instance = CompositeInstance(labels[0], None, labels)
            for j in range(len(attributes) - 1):
                if self.__definition.getAttributeType(j) is AttributeType.CONTINUOUS:
                    instance.addAttribute(ContinuousAttribute(float(attributes[j])))
                elif self.__definition.getAttributeType(j) is AttributeType.DISCRETE:
                    instance.addAttribute(DiscreteAttribute(attributes[j]))
            if instance.attributeSize() == self.__definition.attributeCount():
                self.__instances.add(instance)
            i = i + 1

    def __checkDefinition(self, instance: Instance) -> bool:
        """
        Checks the correctness of the attribute types: for instance, if the attribute of the given instance
        is a Binary attribute and the attribute type of the corresponding item of the data definition is
        also a Binary attribute, it returns True, and False otherwise.

        PARAMETERS
        ----------
        instance : Instance
            Instance whose attribute types are checked.

        RETURNS
        -------
        bool
            True if the attribute types of the given Instance and the data definition match.
        """
        for i in range(instance.attributeSize()):
            if isinstance(instance.getAttribute(i), BinaryAttribute):
                if self.__definition.getAttributeType(i) is not AttributeType.BINARY:
                    return False
            elif isinstance(instance.getAttribute(i), DiscreteIndexedAttribute):
                if self.__definition.getAttributeType(i) is not AttributeType.DISCRETE_INDEXED:
                    return False
            elif isinstance(instance.getAttribute(i), DiscreteAttribute):
                if self.__definition.getAttributeType(i) is not AttributeType.DISCRETE:
                    return False
            elif isinstance(instance.getAttribute(i), ContinuousAttribute):
                if self.__definition.getAttributeType(i) is not AttributeType.CONTINUOUS:
                    return False
        return True

    def __setDefinition(self, instance: Instance):
        """
        Adds the attribute types according to the given Instance. For instance, if the attribute type of the
        given Instance is a Discrete type, it then adds a discrete attribute type to the list of attribute
        types.

        PARAMETERS
        ----------
        instance : Instance
            Instance input.
""" attributeTypes = [] for i in range(instance.attributeSize()): if isinstance(instance.getAttribute(i), BinaryAttribute): attributeTypes.append(AttributeType.BINARY) elif isinstance(instance.getAttribute(i), DiscreteIndexedAttribute): attributeTypes.append(AttributeType.DISCRETE_INDEXED) elif isinstance(instance.getAttribute(i), DiscreteAttribute): attributeTypes.append(AttributeType.DISCRETE) elif isinstance(instance.getAttribute(i), ContinuousAttribute): attributeTypes.append(AttributeType.CONTINUOUS) self.__definition = DataDefinition(attributeTypes) def sampleSize(self) -> int: """ Returns the size of the InstanceList. RETURNS ------- int Size of the InstanceList. """ return self.__instances.size() def classCount(self) -> int: """ Returns the size of the class label distribution of InstanceList. RETURNS ------- int Size of the class label distribution of InstanceList. """ return len(self.__instances.classDistribution()) def attributeCount(self) -> int: """ Returns the number of attribute types at DataDefinition list. RETURNS ------- int The number of attribute types at DataDefinition list. """ return self.__definition.attributeCount() def discreteAttributeCount(self) -> int: """ Returns the number of discrete attribute types at DataDefinition list. RETURNS ------- int The number of discrete attribute types at DataDefinition list. """ return self.__definition.discreteAttributeCount() def continuousAttributeCount(self) -> int: """ Returns the number of continuous attribute types at DataDefinition list. RETURNS ------- int The number of continuous attribute types at DataDefinition list. """ return self.__definition.continuousAttributeCount() def getClasses(self) -> str: """ Returns the accumulated String of class labels of the InstanceList. RETURNS ------- str The accumulated String of class labels of the InstanceList. """ classLabels = self.__instances.getDistinctClassLabels() result = classLabels[0] for i in range(1, len(classLabels)): result = result + ";" + classLabels[i] return result def info(self, dataSetName: str) -> str: """ Returns the general information about the given data set such as the number of instances, distinct class labels, attributes, discrete and continuous attributes. PARAMETERS ---------- dataSetName : str Data set name. RETURNS ------- str General information about the given data set. """ result = "DATASET: " + dataSetName + "\n" result = result + "Number of instances: " + self.sampleSize().__str__( ) + "\n" result = result + "Number of distinct class labels: " + self.classCount( ).__str__() + "\n" result = result + "Number of attributes: " + self.attributeCount( ).__str__() + "\n" result = result + "Number of discrete attributes: " + self.discreteAttributeCount( ).__str__() + "\n" result = result + "Number of continuous attributes: " + self.continuousAttributeCount( ).__str__() + "\n" result = result + "Class labels: " + self.getClasses() return result def addInstance(self, current: Instance): """ Adds a new instance to the InstanceList. PARAMETERS ---------- current : Instance Instance to add. """ if self.__definition is None: self.__setDefinition(current) self.__instances.add(current) elif self.__checkDefinition(current): self.__instances.add(current) def addInstanceList(self, instanceList: list): """ Adds all the instances of given instance list to the InstanceList. PARAMETERS ---------- instanceList : list InstanceList to add instances from. 
""" for instance in instanceList: self.addInstance(instance) def getInstances(self) -> list: """ Returns the instances of InstanceList. RETURNS ------- list The instances of InstanceList. """ return self.__instances.getInstances() def getClassInstances(self) -> list: """ Returns instances of the items at the list of instance lists from the partitions. RETURNS ------- list Instances of the items at the list of instance lists from the partitions. """ return Partition(self.__instances).getLists() def getInstanceList(self) -> InstanceList: """ Accessor for the InstanceList. RETURNS ------- InstanceList The InstanceList. """ return self.__instances def getDataDefinition(self) -> DataDefinition: """ Accessor for the data definition. RETURNS ------- DataDefinition The data definition. """ return self.__definition def getSubSetOfFeatures(self, featureSubSet: FeatureSubSet) -> DataSet: """ Return a subset generated via the given FeatureSubSet. PARAMETERS ---------- featureSubSet : FeatureSubSet FeatureSubSet input. RETURNS ------- FeatureSubSet Subset generated via the given FeatureSubSet. """ result = DataSet(self.__definition.getSubSetOfFeatures(featureSubSet)) for i in range(self.__instances.size()): result.addInstance( self.__instances.get(i).getSubSetOfFeatures(featureSubSet)) return result def writeToFile(self, outFileName: str): """ Print out the instances of InstanceList as a String. PARAMETERS ---------- outFileName : str File name to write the output. """ outfile = open(outFileName, "w") for i in range(self.__instances.size()): outfile.write(self.__instances.get(i).__str__() + "\n") outfile.close()