def constructFromFile(self, fileName, **args) : if 'data' not in args : raise ValueError, 'missing data object' self._data = args['data'] patternIDdict = misc.list2dict(self._data.labels.patternID, range(len(self._data))) labels = Labels(fileName) patterns = [] pairs = [] for i in range(len(labels)) : p1,p2 = labels.patternID[i].split('_') # add only pairs for which we have kernel data: if p1 in patternIDdict and p2 in patternIDdict : pairs.append((patternIDdict[p1],patternIDdict[p2])) patterns.append(i) else : print p1, ' or ', p2, 'not found' labels = labels.__class__(labels, patterns = patterns) self.pairs = pairs first = [pair[0] for pair in pairs] second = [pair[1] for pair in pairs] firstVector = arrayWrap.intVector([pair[0] for pair in pairs]) secondVector = arrayWrap.intVector([pair[1] for pair in pairs]) self.callConstructor(firstVector, secondVector) WrapperDataSet.attachLabels(self, labels)
def constructFromFile(self, fileName): patternIDdict = misc.list2dict(self._data.labels.patternID, range(len(self._data))) labels = Labels(fileName) patterns = [] pairs = [] for i in range(len(labels)): p1, p2 = labels.patternID[i].split('_') # add only pairs for which we have kernel data: if p1 in patternIDdict and p2 in patternIDdict: pairs.append((patternIDdict[p1], patternIDdict[p2])) patterns.append(i) else: print p1, ' or ', p2, 'not found' labels = labels.__class__(labels, patterns=patterns) self.pairs = pairs first = [pair[0] for pair in pairs] second = [pair[1] for pair in pairs] firstVector = arrayWrap.intVector([pair[0] for pair in pairs]) secondVector = arrayWrap.intVector([pair[1] for pair in pairs]) self.callConstructor(firstVector, secondVector) WrapperDataSet.attachLabels(self, labels)
def fromArray(self, X, **args): L = None patternID = None self.featureID = None if 'labels' in args: L = args['labels'].L[:] patternID = args['labels'].patternID[:] if 'L' in args: L = args['L'] if 'patternID' in args: patternID = args['patternID'][:] if 'featureID' in args: if self.__class__.__name__ == 'SparseDataSet': raise vluaeError, 'cannot set feature ID for SparseDataSet' self.featureID = args['featureID'][:] if L is not None: assert len(X) == len(L) if self.featureID is None: self.featureID = [str(i) for i in range(len(X[0]))] if patternID is None: patternID = [str(i) for i in range(1, len(X) + 1)] self.fromArrayAdd(X) self.updateFeatureDict() self.featureIDcompute() if 'labelsFile' in args: self.attachLabels(Labels(args['labelsFile'], **args)) else: args['patternID'] = patternID self.attachLabels(Labels(L, **args))
def constructFromFile(self, fileName): patternIDdict = misc.list2dict(self._data.labels.patternID, range(len(self._data))) labels = Labels(fileName) patterns = [] pairs = [] for i in range(len(labels)): p1, p2 = labels.patternID[i].split("_") # add only pairs for which we have kernel data: if p1 in patternIDdict and p2 in patternIDdict: pairs.append((patternIDdict[p1], patternIDdict[p2])) patterns.append(i) else: print p1, " or ", p2, "not found" labels = labels.__class__(labels, patterns=patterns) self.pairs = pairs first = [pair[0] for pair in pairs] second = [pair[1] for pair in pairs] firstVector = arrayWrap.intVector([pair[0] for pair in pairs]) secondVector = arrayWrap.intVector([pair[1] for pair in pairs]) self.callConstructor(firstVector, secondVector) WrapperDataSet.attachLabels(self, labels)
def constructFromFile(self, fileName): delim = ',' if self.data is not None: patternIDdict = misc.list2dict(self.data.labels.patternID, range(len(self.data))) else: patternIDdict = {} L = [] patternID = [] pairs = [] file = open(fileName) for line in file: tokens = line[:-1].split(delim) #patternID.append(tokens[0]) p1, p2 = tokens[0].split('_') if p1 > p2: p1, p2 = p2, p1 # add only pairs for which we have kernel data: if p1 in patternIDdict and p2 in patternIDdict or self.data is None: pairs.append((p1, p2)) L.append(tokens[1]) patternID.append('_'.join([p1, p2])) else: print p1, ' or ', p2, 'not found' self.pairs = pairs self.labels = Labels(L, patternID=patternID)
def pick(event): global data global X global Y global numpy_container if event.key == 'q': if len(X) == 0: return if not numpy_container: data = VectorDataSet(X) else: data = PyVectorDataSet(numpy.array(X)) data.attachLabels(Labels(Y)) X = [] Y = [] print 'done creating data. close this window and use the decisionSurface function' pylab.disconnect(binding_id) if event.key == '1' or event.key == '2': if event.inaxes is not None: print 'data coords', event.xdata, event.ydata X.append([event.xdata, event.ydata]) Y.append(event.key) pylab.plot([event.xdata], [event.ydata], plotStr[int(event.key) - 1]) pylab.draw()
def copyConstruct(self, other, **args):
    """Copy-construct this dataset from 'other', optionally keeping
    only a subset of its patterns.

    Keyword arguments (mutually exclusive; first match wins):
    patterns - a list of indices, or of pattern-ID strings, to copy
    classes  - keep patterns whose label (labels.L) is in this list;
               class labels are forgotten in the copy
    classID  - keep patterns whose numeric class (labels.Y) is in
               this list; class labels are forgotten in the copy

    With no keyword, all patterns are copied.  Also re-attaches the
    kernel, copies training/testing functions, and copies any
    registered attributes, subsetting them where appropriate.
    Raises ValueError when a list-valued registered attribute does
    not have one entry per pattern.
    """
    forgetClassLabels = False
    if "patterns" in args:
        patterns = args['patterns']
        # if the patterns are ids (strings) convert them to indices:
        if type(patterns[0]) == type(''):
            idDict = misc.list2dict(patterns)
            patternsToCopy = [
                i for i in range(len(other))
                if other.labels.patternID[i] in idDict
            ]
        else:
            patternsToCopy = patterns
    elif "classes" in args:
        # select by class label string:
        patternsToCopy = [
            i for i in range(len(other))
            if other.labels.L[i] in args["classes"]
        ]
        forgetClassLabels = True
    elif "classID" in args:
        # select by numeric class id:
        patternsToCopy = [
            i for i in range(len(other))
            if other.labels.Y[i] in args["classID"]
        ]
        forgetClassLabels = True
    else:
        # no selection -- copy every pattern:
        patternsToCopy = range(len(other))
    self.setTrainingFunc(other.trainingFunc)
    self.setTestingFunc(other.testingFunc)
    # class dependent copying of data:
    self.copy(other, patternsToCopy)
    self.attachKernel(other)
    self.attachLabels(
        Labels(other.labels,
               patterns=patternsToCopy,
               forgetClassLabels=forgetClassLabels))
    # copy the registered attribute:
    if hasattr(other, '_registeredAttributes'):
        self._registeredAttributes = other._registeredAttributes[:]
        self._actions = copy.deepcopy(other._actions)
        for attr in self._registeredAttributes:
            a = getattr(other, attr)
            if type(a) == type([]):
                # per-pattern list attribute: subset it to the copied
                # patterns (must be one entry per pattern of 'other'):
                if len(a) != len(other):
                    raise ValueError, 'attribute has bad length'
                #BaseDataSet.__setattr__(self, attr,
                #                        [a[i] for i in patternsToCopy])
                setattr(self, attr, [a[i] for i in patternsToCopy])
            elif hasattr(a, 'type') and a.type == 'dataset' and len(
                    a) == len(self):
                # nested dataset attribute: copy-construct it with the
                # same pattern subset:
                acopy = a.__class__(a, patterns=patternsToCopy)
                setattr(self, attr, acopy)
            else:
                # scalar / shared attribute: copy the reference as-is:
                setattr(self, attr, a)
def load_libsvm_format(file_name, **args): """ Load a dataset from a file in libsvm format returns an instance of PyVectorDataSet If you want to use the data with a SparseDataSet, you can directly do it using the SparseDataSet constructor. """ regression = False if 'regression' in args: regression = args['regression'] # first extract labels and check how many features there are: labels = [] num_features = 0 if not os.path.exists(file_name): raise ValueError, "file doesn't exist at %s" % file_name file_handle = myio.myopen(file_name) for line in file_handle: tokens = line.split() if regression: labels.append(float(tokens[0])) else: labels.append(str(int(float(tokens[0])))) for token in tokens[1:]: id, value = token.split(':') num_features = max(num_features, int(id)) X = numpy.zeros((len(labels), num_features), numpy.float) # fill in the array: i = 0 for line in open(file_name): tokens = line.split() for token in tokens[1:]: id, value = token.split(':') id = int(id) - 1 X[i][id] = float(value) i += 1 data = PyVectorDataSet(X) if regression: labels = Labels(labels, numericLabels=True) else: labels = Labels(labels) data.attachLabels(labels) return data
def attachLabels(self, labels): if labels.__class__.__name__ == 'Labels': pass elif type(labels) == type(''): labels = Labels(labels) else: raise ValueError, 'wrong type of labels object' if len(self) != len(labels): raise ValueError, 'length of labels not equal length of self' self.labels = labels
def constructFromFile(self, fileName, **args): parser = parsers.parserDispatcher(fileName, **args) # the DataSet container can only be used with a csv type file: if parser.__class__.__name__ == 'SparseParser' and \ self.__class__.__name__ == 'DataSet' : raise ValueError, \ 'cannot use a DataSet container with a sparse file' parser.scan() self.initializeDataMatrix(len(parser), len(parser._featureID)) # read the patterns : i = 0 for x in parser: self.addPattern(x, i) i += 1 # if i % 100 == 0 : # print 'read',i,'patterns' # postprocessing: L = parser._labels patternID = parser._patternID if patternID is None or len(patternID) == 0: patternID = [str(i) for i in range(1, len(self) + 1)] self.featureID, featureKey, featureKeyDict = parser.postProcess() if self.__class__.__name__ == 'PySparseDataSet': self.featureKey = featureKey self.featureKeyDict = featureKeyDict self.updateFeatureDict() self.featureIDcompute() # print 'read', len(self), 'patterns' if 'labelsFile' in args: self.attachLabels(Labels(args['labelsFile'], **args)) else: self.attachLabels(Labels(L, patternID=patternID, **args))
def makeEmpty(self, size, **args):
    """Create an empty dataset with 'size' patterns and no features.

    Keyword arguments:
    labels     - a labels object whose L and patternID are copied
    L          - a list of labels (overrides 'labels')
    patternID  - a list of pattern IDs (overrides 'labels')
    labelsFile - a file from which to read the labels instead
    """
    L = None
    patternID = None
    if 'labels' in args:
        labelsObj = args['labels']
        L = labelsObj.L[:]
        patternID = labelsObj.patternID[:]
    L = args.get('L', L)
    if 'patternID' in args:
        patternID = args['patternID'][:]
    if L is not None:
        assert size == len(L)
    # default pattern IDs are '1' .. str(size):
    if patternID is None:
        patternID = [str(num) for num in range(1, size + 1)]
    self.initializeDataMatrix(size, 0)
    if 'labelsFile' in args:
        self.attachLabels(Labels(args['labelsFile'], **args))
    else:
        args['patternID'] = patternID
        self.attachLabels(Labels(L, **args))
def constructFromFile(self, fileName, **args): print 'reading from', fileName headerHandler = fastaHeaderHandler if 'headerHandler' in args: headerHandler = args['headerHandler'] numPatterns = fasta.fasta_count(fileName) self.container.__init__(self, numPatterns) patternIDs = [] L = [] for record in fasta.fasta_itr(fileName): self.addPattern(record.sequence) patternID, label = headerHandler(record.header) patternIDs.append(patternID) if label is not None: L.append(label) self.attachLabels(Labels(L, patternID=patternIDs, **args))
def constructFromFile(self, file_name, **args) : if 'data' not in args : raise ValueError, 'missing data object' self._data = args['data'] id_dict = misc.list2dict(self._data.labels.patternID, range(len(self._data))) file_handle = open(file_name) L = [] sets = [] for line in file_handle : tokens = line.split() sets.append([id_dict[token] for token in tokens[:-1] ]) L.append(tokens[-1]) self.n = len(sets) self.callConstructor(len(sets)) for s in sets : self.add(tuple(s)) labels = Labels(L) WrapperDataSet.attachLabels(self, labels)
def copyConstruct(self, other, patterns):
    """Copy constructor keeping only the pairs at the given indices.

    The underlying data object is shared with 'other', not copied;
    the labels are subset to the same indices.
    """
    self.data = other.data
    self.labels = Labels(other.labels, patterns=patterns)
    self.pairs = [other.pairs[idx] for idx in patterns]