class DataSet(object):
    """
    Couples an InstanceList with the DataDefinition that describes the attribute types of its instances.
    """

    __instances: InstanceList
    __definition: DataDefinition

    def __init__(self, definition: DataDefinition = None, separator: str = None, fileName: str = None):
        """
        Constructor for generating a new DataSet with given DataDefinition. When no separator is given the data set
        starts with an empty InstanceList; otherwise the InstanceList is constructed from the definition, the
        separator and the file name.

        PARAMETERS
        ----------
        definition : DataDefinition
            Data definition of the data set.
        separator : str
            Separator character which separates the attribute values in the data file.
        fileName : str
            Name of the data set file.
        """
        self.__definition = definition
        if separator is None:
            self.__instances = InstanceList()
        else:
            self.__instances = InstanceList(definition, separator, fileName)

    def initWithFile(self, fileName: str):
        """
        Constructor for generating a new DataSet from given File. The file is read as utf8, one instance per line,
        with comma separated values whose last value is the class label (several labels separated by ';' produce a
        CompositeInstance). The attribute types are inferred from the first line: a value that parses as a float
        is CONTINUOUS, anything else is DISCRETE. The first line itself is also loaded as an instance. Later lines
        whose number of values does not match the inferred definition are skipped.

        PARAMETERS
        ----------
        fileName : str
            File to generate DataSet from.

        RAISES
        ------
        ValueError
            If a later line holds a non-numeric value in a column that was inferred as CONTINUOUS.
        """
        self.__instances = InstanceList()
        self.__definition = DataDefinition()
        # The context manager closes the file even when reading fails.
        with open(fileName, 'r', encoding='utf8') as inputFile:
            lines = inputFile.readlines()
        isFirstLine = True
        for line in lines:
            # Drop the line terminator, otherwise it ends up inside the class label.
            attributes = line.rstrip("\r\n").split(",")
            if isFirstLine:
                isFirstLine = False
                for value in attributes[:-1]:
                    try:
                        float(value)
                    except ValueError:
                        self.__definition.addAttribute(AttributeType.DISCRETE)
                    else:
                        self.__definition.addAttribute(AttributeType.CONTINUOUS)
            elif len(attributes) != self.__definition.attributeCount() + 1:
                continue
            classField = attributes[-1]
            if ";" not in classField:
                instance = Instance(classField)
            else:
                labels = classField.split(";")
                instance = CompositeInstance(labels[0], None, labels)
            for j, value in enumerate(attributes[:-1]):
                attributeType = self.__definition.getAttributeType(j)
                if attributeType is AttributeType.CONTINUOUS:
                    instance.addAttribute(ContinuousAttribute(float(value)))
                elif attributeType is AttributeType.DISCRETE:
                    instance.addAttribute(DiscreteAttribute(value))
            if instance.attributeSize() == self.__definition.attributeCount():
                self.__instances.add(instance)

    @staticmethod
    def __attributeTypeOf(attribute):
        """
        Maps an attribute object to its AttributeType. DiscreteIndexedAttribute is tested before
        DiscreteAttribute, in the same order the checks were originally written.

        PARAMETERS
        ----------
        attribute
            Attribute object to classify.

        RETURNS
        -------
        AttributeType
            The matching attribute type, or None when the attribute is of none of the four known classes.
        """
        if isinstance(attribute, BinaryAttribute):
            return AttributeType.BINARY
        if isinstance(attribute, DiscreteIndexedAttribute):
            return AttributeType.DISCRETE_INDEXED
        if isinstance(attribute, DiscreteAttribute):
            return AttributeType.DISCRETE
        if isinstance(attribute, ContinuousAttribute):
            return AttributeType.CONTINUOUS
        return None

    def __checkDefinition(self, instance: Instance) -> bool:
        """
        Checks the correctness of the attribute type, for instance, if the attribute of given instance is a Binary
        attribute, and the attribute type of the corresponding item of the data definition is also a Binary
        attribute, it then returns true, and false otherwise. Attributes of an unrecognised class are not checked.

        PARAMETERS
        ----------
        instance : Instance
            Instance whose attribute types are checked.

        RETURNS
        -------
        bool
            true if attribute types of given Instance and data definition matches.
        """
        for i in range(instance.attributeSize()):
            expected = self.__attributeTypeOf(instance.getAttribute(i))
            if expected is not None and self.__definition.getAttributeType(i) is not expected:
                return False
        return True

    def __setDefinition(self, instance: Instance):
        """
        Builds the data definition from the attribute types of the given Instance. For instance, if an attribute of
        the given Instance is a Discrete attribute, a discrete attribute type is added to the list of attribute
        types. Attributes of an unrecognised class contribute no type.

        PARAMETERS
        ----------
        instance : Instance
            Instance input.
        """
        attributeTypes = []
        for i in range(instance.attributeSize()):
            attributeType = self.__attributeTypeOf(instance.getAttribute(i))
            if attributeType is not None:
                attributeTypes.append(attributeType)
        self.__definition = DataDefinition(attributeTypes)

    def sampleSize(self) -> int:
        """
        Returns the size of the InstanceList.

        RETURNS
        -------
        int
            Size of the InstanceList.
        """
        return self.__instances.size()

    def classCount(self) -> int:
        """
        Returns the size of the class label distribution of InstanceList.

        RETURNS
        -------
        int
            Size of the class label distribution of InstanceList.
        """
        return len(self.__instances.classDistribution())

    def attributeCount(self) -> int:
        """
        Returns the number of attribute types at DataDefinition list.

        RETURNS
        -------
        int
            The number of attribute types at DataDefinition list.
        """
        return self.__definition.attributeCount()

    def discreteAttributeCount(self) -> int:
        """
        Returns the number of discrete attribute types at DataDefinition list.

        RETURNS
        -------
        int
            The number of discrete attribute types at DataDefinition list.
        """
        return self.__definition.discreteAttributeCount()

    def continuousAttributeCount(self) -> int:
        """
        Returns the number of continuous attribute types at DataDefinition list.

        RETURNS
        -------
        int
            The number of continuous attribute types at DataDefinition list.
        """
        return self.__definition.continuousAttributeCount()

    def getClasses(self) -> str:
        """
        Returns the accumulated String of class labels of the InstanceList.

        RETURNS
        -------
        str
            The distinct class labels joined with ';'. An empty string when there are no class labels.
        """
        return ";".join(self.__instances.getDistinctClassLabels())

    def info(self, dataSetName: str) -> str:
        """
        Returns the general information about the given data set such as the number of instances, distinct class
        labels, attributes, discrete and continuous attributes.

        PARAMETERS
        ----------
        dataSetName : str
            Data set name.

        RETURNS
        -------
        str
            General information about the given data set.
        """
        return (
            f"DATASET: {dataSetName}\n"
            f"Number of instances: {self.sampleSize()}\n"
            f"Number of distinct class labels: {self.classCount()}\n"
            f"Number of attributes: {self.attributeCount()}\n"
            f"Number of discrete attributes: {self.discreteAttributeCount()}\n"
            f"Number of continuous attributes: {self.continuousAttributeCount()}\n"
            f"Class labels: {self.getClasses()}"
        )

    def addInstance(self, current: Instance):
        """
        Adds a new instance to the InstanceList. If the data set has no definition yet, the definition is derived
        from this instance. Otherwise the instance is added only when its attribute types agree with the
        definition; a mismatching instance is silently ignored.

        PARAMETERS
        ----------
        current : Instance
            Instance to add.
        """
        if self.__definition is None:
            self.__setDefinition(current)
            self.__instances.add(current)
        elif self.__checkDefinition(current):
            self.__instances.add(current)

    def addInstanceList(self, instanceList: list):
        """
        Adds all the instances of given instance list to the InstanceList, each one through addInstance.

        PARAMETERS
        ----------
        instanceList : list
            InstanceList to add instances from.
        """
        for instance in instanceList:
            self.addInstance(instance)

    def getInstances(self) -> list:
        """
        Returns the instances of InstanceList.

        RETURNS
        -------
        list
            The instances of InstanceList.
        """
        return self.__instances.getInstances()

    def getClassInstances(self) -> list:
        """
        Returns instances of the items at the list of instance lists from the partitions.

        RETURNS
        -------
        list
            Instances of the items at the list of instance lists from the partitions.
        """
        return Partition(self.__instances).getLists()

    def getInstanceList(self) -> InstanceList:
        """
        Accessor for the InstanceList.

        RETURNS
        -------
        InstanceList
            The InstanceList.
        """
        return self.__instances

    def getDataDefinition(self) -> DataDefinition:
        """
        Accessor for the data definition.

        RETURNS
        -------
        DataDefinition
            The data definition.
        """
        return self.__definition

    def getSubSetOfFeatures(self, featureSubSet: FeatureSubSet) -> DataSet:
        """
        Return a subset generated via the given FeatureSubSet.

        PARAMETERS
        ----------
        featureSubSet : FeatureSubSet
            FeatureSubSet input.

        RETURNS
        -------
        DataSet
            New data set whose definition and instances are restricted to the given FeatureSubSet.
        """
        result = DataSet(self.__definition.getSubSetOfFeatures(featureSubSet))
        for i in range(self.__instances.size()):
            result.addInstance(self.__instances.get(i).getSubSetOfFeatures(featureSubSet))
        return result

    def writeToFile(self, outFileName: str):
        """
        Writes the string form of every instance of the InstanceList to the given file, one instance per line.
        The file is written as utf8, the same encoding initWithFile reads.

        PARAMETERS
        ----------
        outFileName : str
            File name to write the output.
        """
        # The context manager closes the file even when a write fails.
        with open(outFileName, "w", encoding="utf8") as outfile:
            for i in range(self.__instances.size()):
                outfile.write(str(self.__instances.get(i)) + "\n")
def __init__(self, instanceList: InstanceList = None, ratio=None, seed=None, stratified: bool = None):
    """
    Divides the instances in the instance list into partitions. The kind of partition depends on the types of the
    arguments:

    - instanceList is None: the partition stays empty.
    - ratio is None: one partition per distinct class label, so that all instances of a class are grouped in a
      single partition.
    - ratio is a float: two partitions; roughly the given ratio of the instances goes to the first one and the
      rest to the second one. With stratified set, the ratio is applied per class.
    - ratio is an int: ratio is used as an attribute index and seed selects the split:
        - seed is None: one partition per value of that attribute.
        - seed is an int: instances whose attribute index equals seed go to the first partition, the others to
          the second.
        - seed is a float: instances whose attribute value is less than seed go to the first partition, the
          others to the second.

    PARAMETERS
    ----------
    instanceList : InstanceList
        Instance list to partition.
    ratio
        Ratio of the stratified partition. Ratio is between 0 and 1. If the ratio is 0.2, then 20 percent of the
        instances are put in the first group, 80 percent of the instances are put in the second group. When it is
        an int, it is the index of the attribute to split on.
    seed
        seed is used as a random number seed for the ratio based split. For an attribute based split it is the
        attribute index value (int) or the split threshold (float).
    stratified : bool
        Whether the ratio based split keeps the ratio within every class.
    """
    self.__multilist = []
    if instanceList is None:
        return
    if ratio is None:
        classLabels = instanceList.getDistinctClassLabels()
        for classLabel in classLabels:
            self.add(InstanceListOfSameClass(classLabel))
        for instance in instanceList.getInstances():
            self.get(classLabels.index(instance.getClassLabel())).add(instance)
    elif isinstance(ratio, float):
        self.add(InstanceList())
        self.add(InstanceList())
        instanceCount = instanceList.size()
        if stratified:
            distribution = instanceList.classDistribution()
            counts = [0] * len(distribution)
            randomArray = list(range(instanceCount))
            random.seed(seed)
            random.shuffle(randomArray)
            for index in randomArray:
                instance = instanceList.get(index)
                classLabel = instance.getClassLabel()
                classIndex = distribution.getIndex(classLabel)
                # Each class fills the first partition until it reaches its own share of the ratio.
                if counts[classIndex] < instanceCount * ratio * distribution.getProbability(classLabel):
                    self.get(0).add(instance)
                else:
                    self.get(1).add(instance)
                counts[classIndex] += 1
        else:
            instanceList.shuffle(seed)
            # Walk over every instance; iterating over the number of partitions would distribute only the
            # first two instances.
            for i in range(instanceCount):
                instance = instanceList.get(i)
                if i < instanceCount * ratio:
                    self.get(0).add(instance)
                else:
                    self.get(1).add(instance)
    elif isinstance(ratio, int):
        attributeIndex = ratio
        if seed is None:
            valueList = instanceList.getAttributeValueList(attributeIndex)
            for _ in valueList:
                self.add(InstanceList())
            for instance in instanceList.getInstances():
                self.get(valueList.index(instance.getAttribute(attributeIndex).getValue())).add(instance)
        elif isinstance(seed, int):
            attributeValue = seed
            self.add(InstanceList())
            self.add(InstanceList())
            for instance in instanceList.getInstances():
                if instance.getAttribute(attributeIndex).getIndex() == attributeValue:
                    self.get(0).add(instance)
                else:
                    self.get(1).add(instance)
        elif isinstance(seed, float):
            splitValue = seed
            self.add(InstanceList())
            self.add(InstanceList())
            for instance in instanceList.getInstances():
                if instance.getAttribute(attributeIndex).getValue() < splitValue:
                    self.get(0).add(instance)
                else:
                    self.get(1).add(instance)