Example #1
    def __init__(self, data) :

        if type(data) == type('') :
            print 'file name:', data            
            data = datafunc.PyVectorDataSet(data, idColumn = 0, headerRow = True, hint = 'csv')

        self.data = data
        self.idDict = misc.list2dict(data.labels.patternID,
                                     range(len(data)))

        print numpy.shape(data.X)
        self.mean = numpy.mean(data.X, 1)
        self.std = numpy.std(data.X, 1)
        eps = 1e-5
        I = numpy.nonzero(numpy.less(self.std, eps))[0]
        print 'num zeros:',len(I)
        numpy.put(self.std, I, 1)
        
        self.numCorrelations = 10000
        correlations = numpy.zeros(self.numCorrelations, numpy.float)
        
        for i in range(self.numCorrelations) :
            i1 = random.randrange(0, len(data))
            i2 = random.randrange(0, len(data))
            correlations[i] = self._corrcoef(i1, i2)
        self.meanCorrelation = numpy.mean(correlations)
        self.numCorrelations = 1000        
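
All of these examples revolve around misc.list2dict from PyML's misc module, whose source is not shown on this page. Judging from the call sites, an equivalent helper could be sketched as follows: the two-argument form maps each element of the first list to the corresponding element of the second (e.g. pattern ID to row index), while the one-argument form builds a dict used only for membership tests. The name list2dict_sketch is ours, not PyML's.

def list2dict_sketch(keys, values=None):
    """Rough stand-in for misc.list2dict, inferred from how it is called."""
    if values is None:
        # one-argument form: a dict used purely as a set of keys
        return dict((k, None) for k in keys)
    # two-argument form, e.g. list2dict(patternIDs, range(len(data)))
    return dict(zip(keys, values))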
Example #2
def sortKernel2(kernelInFile, kernelOutFile, ids, format = 'gist', **args) :
  """
  sort a kernel matrix according to the given list of ids

  :Parameters:
    - `kernelInFile` - the kernel input file name
    - `kernelOutFile` - the output file name
    - `format` - the output format; 'gist' adds a GIST-style header row (default = 'gist')

  :Keywords:
    - `delim` - the field delimiter (default = tab)
  """
  
  from PyML.containers import KernelData
  kdata = KernelData(kernelInFile)
  K = kdata.getKernelMatrix()
  idDict = misc.list2dict(ids, range(len(ids)))

  delim = '\t'
  if 'delim' in args :
      delim = args['delim']
  kernelFile = open(kernelOutFile, 'w')
  if format == 'gist' :
    kernelFile.write(kernelOutFile + delim + delim.join(ids) + '\n')

  for id1 in ids :
    kernelFile.write(id1 + delim)
    tokens = [str(K[idDict[id1]][idDict[id2]]) for id2 in ids]
    kernelFile.write(delim.join(tokens) + '\n')
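
A hypothetical call for the function above, with placeholder file names and pattern IDs; delim falls back to a tab when the keyword is not supplied.

# File names and IDs below are illustrative only.
ids = ['pat_03', 'pat_01', 'pat_02']
sortKernel2('kernel.txt', 'kernel.sorted.txt', ids, format='gist', delim='\t')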
Example #3
    def __init__(self, data):

        if type(data) == type(''):
            print 'file name:', data
            data = datafunc.PyVectorDataSet(data,
                                            idColumn=0,
                                            headerRow=True,
                                            hint='csv')

        self.data = data
        self.idDict = misc.list2dict(data.labels.patternID, range(len(data)))

        print numpy.shape(data.X)
        self.mean = numpy.mean(data.X, 1)
        self.std = numpy.std(data.X, 1)
        eps = 1e-5
        I = numpy.nonzero(numpy.less(self.std, eps))[0]
        print 'num zeros:', len(I)
        numpy.put(self.std, I, 1)

        self.numCorrelations = 10000
        correlations = numpy.zeros(self.numCorrelations, numpy.float)

        for i in range(self.numCorrelations):
            i1 = random.randrange(0, len(data))
            i2 = random.randrange(0, len(data))
            correlations[i] = self._corrcoef(i1, i2)
        self.meanCorrelation = numpy.mean(correlations)
        self.numCorrelations = 1000
Example #4
    def constructFromFile(self, fileName):

        patternIDdict = misc.list2dict(self._data.labels.patternID,
                                       range(len(self._data)))

        labels = Labels(fileName)
        patterns = []
        pairs = []
        for i in range(len(labels)):
            p1, p2 = labels.patternID[i].split('_')
            # add only pairs for which we have kernel data:
            if p1 in patternIDdict and p2 in patternIDdict:
                pairs.append((patternIDdict[p1], patternIDdict[p2]))
                patterns.append(i)
            else:
                print p1, ' or ', p2, 'not found'
        labels = labels.__class__(labels, patterns=patterns)

        self.pairs = pairs

        first = [pair[0] for pair in pairs]
        second = [pair[1] for pair in pairs]
        firstVector = arrayWrap.intVector([pair[0] for pair in pairs])
        secondVector = arrayWrap.intVector([pair[1] for pair in pairs])
        self.callConstructor(firstVector, secondVector)

        WrapperDataSet.attachLabels(self, labels)
Example #5
    def copyConstruct(self, other, **args) :

        if not hasattr(other, 'decisionFunc') :
            raise AttributeError, 'not a valid results object'

        if 'patterns' in args :
            p = args['patterns']
            idDict = misc.list2dict(other.patternID, range(len(other.patternID)))
            patterns = [idDict[pattern] for pattern in p
                        if pattern in idDict]
        else :
            patterns = range(len(other.Y))

        self.patternID = [other.patternID[p] for p in patterns]
        self.L = [other.L[p] for p in patterns]
        self.Y = [other.Y[p] for p in patterns]
        self.decisionFunc = [other.decisionFunc[p] for p in patterns]
        self.givenY = [other.givenY[p] for p in patterns]
        self.givenL = [other.givenL[p] for p in patterns]        
        self.rocN = 50
        self.classLabels = copy.deepcopy(other.classLabels)
        self.numClasses = len(self.classLabels)
        self.info = other.info
        try :
            self.log = other.log
        except :
            pass
        self.computeStats()
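
The 'patterns' branch above shows a pattern that recurs throughout these examples: build an ID-to-index dict once, then translate a list of requested IDs into indices, silently skipping IDs that are not present. A self-contained sketch with made-up IDs:

patternID = ['a', 'b', 'c', 'd']
requested = ['d', 'b', 'x']                                  # 'x' is unknown
idDict = dict(zip(patternID, range(len(patternID))))
patterns = [idDict[p] for p in requested if p in idDict]     # -> [3, 1]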
Example #6
    def constructFromFile(self, fileName, **args) :

        if 'data' not in args :
            raise ValueError, 'missing data object'
        self._data = args['data']
        patternIDdict = misc.list2dict(self._data.labels.patternID,
                                       range(len(self._data)))
        labels = Labels(fileName)
        patterns = []
        pairs = []
        for i in range(len(labels)) :
            p1,p2 = labels.patternID[i].split('_')
            # add only pairs for which we have kernel data:
            if p1 in patternIDdict and p2 in patternIDdict :
                pairs.append((patternIDdict[p1],patternIDdict[p2]))
                patterns.append(i)
            else :
                print p1, ' or ', p2, 'not found'
        labels = labels.__class__(labels, patterns = patterns)

        self.pairs = pairs

        first = [pair[0] for pair in pairs]
        second = [pair[1] for pair in pairs]
        firstVector = arrayWrap.intVector([pair[0] for pair in pairs])
        secondVector = arrayWrap.intVector([pair[1] for pair in pairs])            
        self.callConstructor(firstVector, secondVector)

        WrapperDataSet.attachLabels(self, labels)
Example #7
def sortKernel(kernelInFile, kernelOutFile, format = 'gist', **args) :
  """
  sort a kernel matrix according to its pattern ID

  :Parameters:
    - `kernelInFile` - the kernel input file name
    - `kernelOutFile` - the output file name
    - `format` - the output format; 'gist' adds a GIST-style header row (default = 'gist')

  :Keywords:
    - `delim` - the field delimiter (default = tab)
  """
  
  from PyML.containers import KernelData
  kdata = KernelData(kernelInFile)
  idDict = misc.list2dict(kdata.labels.patternID, range(len(kdata)))
  ids = kdata.labels.patternID[:]
  ids.sort()
  delim = '\t'
  if 'delim' in args :
    delim = args['delim']
  kernelFile = open(kernelOutFile, 'w')
  if format == 'gist' :
    kernelFile.write(kernelOutFile + delim + delim.join(ids) + '\n')

  for id1 in ids :
    kernelFile.write(id1 + delim)
    tokens = [str(kdata.kernel.eval(kdata, idDict[id1], idDict[id2]))
              for id2 in ids]
    kernelFile.write(delim.join(tokens) + '\n')
Example #8
    def copyConstruct(self, other, **args) :

        if not hasattr(other, 'decisionFunc') :
            raise AttributeError, 'not a valid results object'

        if 'patterns' in args :
            p = args['patterns']
            idDict = misc.list2dict(other.patternID, range(len(other.patternID)))
            patterns = [idDict[pattern] for pattern in p
                        if pattern in idDict]
        else :
            patterns = range(len(other.Y))

        self.patternID = [other.patternID[p] for p in patterns]
        self.L = [other.L[p] for p in patterns]
        self.Y = [other.Y[p] for p in patterns]
        self.decisionFunc = [other.decisionFunc[p] for p in patterns]
        self.givenY = [other.givenY[p] for p in patterns]
        self.givenL = [other.givenL[p] for p in patterns]        
        self.rocN = 50
        self.classLabels = copy.deepcopy(other.classLabels)
        self.numClasses = len(self.classLabels)
        self.info = other.info
        try :
            self.log = other.log
        except :
            pass
        self.computeStats()
Example #9
    def constructFromFile(self, fileName) :

        delim = ','
        if self.data is not None :
            patternIDdict = misc.list2dict(self.data.labels.patternID,
                                           range(len(self.data)))
        else :
            patternIDdict = {}
            
        L = []
        patternID = []
        pairs = []
        file = open(fileName)
        for line in file :
            tokens = line[:-1].split(delim)
            #patternID.append(tokens[0])
            p1,p2 = tokens[0].split('_')
            if p1 > p2 : p1,p2 = p2,p1
            # add only pairs for which we have kernel data:
            if p1 in patternIDdict and p2 in patternIDdict or self.data is None :
                pairs.append((p1,p2))
                L.append(tokens[1])
                patternID.append('_'.join([p1,p2]))
            else :
                print p1, ' or ', p2, 'not found'
        self.pairs = pairs
        self.labels = Labels(L, patternID = patternID)
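
The `if p1 > p2` swap above canonicalizes each pair ID so that 'B_A' and 'A_B' name the same pair. A minimal standalone illustration with made-up IDs:

raw = ['P2_P1', 'P1_P2', 'P3_P1']
pairs = []
for token in raw:
    p1, p2 = token.split('_')
    if p1 > p2:
        p1, p2 = p2, p1                  # enforce a canonical order
    pairs.append((p1, p2))
# pairs -> [('P1', 'P2'), ('P1', 'P2'), ('P1', 'P3')]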
Example #10
    def constructFromFile(self, fileName):

        patternIDdict = misc.list2dict(self._data.labels.patternID, range(len(self._data)))

        labels = Labels(fileName)
        patterns = []
        pairs = []
        for i in range(len(labels)):
            p1, p2 = labels.patternID[i].split("_")
            # add only pairs for which we have kernel data:
            if p1 in patternIDdict and p2 in patternIDdict:
                pairs.append((patternIDdict[p1], patternIDdict[p2]))
                patterns.append(i)
            else:
                print p1, " or ", p2, "not found"
        labels = labels.__class__(labels, patterns=patterns)

        self.pairs = pairs

        first = [pair[0] for pair in pairs]
        second = [pair[1] for pair in pairs]
        firstVector = arrayWrap.intVector([pair[0] for pair in pairs])
        secondVector = arrayWrap.intVector([pair[1] for pair in pairs])
        self.callConstructor(firstVector, secondVector)

        WrapperDataSet.attachLabels(self, labels)
Example #11
    def constructFromFile(self, fileName):

        delim = ','
        if self.data is not None:
            patternIDdict = misc.list2dict(self.data.labels.patternID,
                                           range(len(self.data)))
        else:
            patternIDdict = {}

        L = []
        patternID = []
        pairs = []
        file = open(fileName)
        for line in file:
            tokens = line[:-1].split(delim)
            #patternID.append(tokens[0])
            p1, p2 = tokens[0].split('_')
            if p1 > p2: p1, p2 = p2, p1
            # add only pairs for which we have kernel data:
            if p1 in patternIDdict and p2 in patternIDdict or self.data is None:
                pairs.append((p1, p2))
                L.append(tokens[1])
                patternID.append('_'.join([p1, p2]))
            else:
                print p1, ' or ', p2, 'not found'
        self.pairs = pairs
        self.labels = Labels(L, patternID=patternID)
Example #12
    def copyConstruct(self, other, **args):

        forgetClassLabels = False
        if "patterns" in args:
            patterns = args['patterns']
            # if the patterns are ids (strings) convert them to indices:
            if type(patterns[0]) == type(''):
                idDict = misc.list2dict(patterns)
                patternsToCopy = [
                    i for i in range(len(other))
                    if other.labels.patternID[i] in idDict
                ]
            else:
                patternsToCopy = patterns
        elif "classes" in args:
            patternsToCopy = [
                i for i in range(len(other))
                if other.labels.L[i] in args["classes"]
            ]
            forgetClassLabels = True
        elif "classID" in args:
            patternsToCopy = [
                i for i in range(len(other))
                if other.labels.Y[i] in args["classID"]
            ]
            forgetClassLabels = True
        else:
            patternsToCopy = range(len(other))

        self.setTrainingFunc(other.trainingFunc)
        self.setTestingFunc(other.testingFunc)

        # class dependent copying of data:
        self.copy(other, patternsToCopy)

        self.attachKernel(other)
        self.attachLabels(
            Labels(other.labels,
                   patterns=patternsToCopy,
                   forgetClassLabels=forgetClassLabels))

        # copy the registered attribute:
        if hasattr(other, '_registeredAttributes'):
            self._registeredAttributes = other._registeredAttributes[:]
            self._actions = copy.deepcopy(other._actions)
            for attr in self._registeredAttributes:
                a = getattr(other, attr)
                if type(a) == type([]):
                    if len(a) != len(other):
                        raise ValueError, 'attribute has bad length'
                    #BaseDataSet.__setattr__(self, attr,
                    #                        [a[i] for i in patternsToCopy])
                    setattr(self, attr, [a[i] for i in patternsToCopy])
                elif hasattr(a, 'type') and a.type == 'dataset' and len(
                        a) == len(self):
                    acopy = a.__class__(a, patterns=patternsToCopy)
                    setattr(self, attr, acopy)
                else:
                    setattr(self, attr, a)
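
copyConstruct above selects which patterns to copy in one of three ways: by an explicit index or ID list ('patterns'), by class label ('classes'), or by numeric class index ('classID'). The selection itself is plain list filtering, as in this standalone sketch with made-up labels:

L = ['pos', 'neg', 'pos', 'neg', 'pos']          # class labels (illustrative)
patternID = ['a', 'b', 'c', 'd', 'e']
idDict = dict((p, None) for p in ['b', 'e'])
byID = [i for i in range(len(L)) if patternID[i] in idDict]   # -> [1, 4]
byClass = [i for i in range(len(L)) if L[i] in ['pos']]       # -> [0, 2, 4]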
Example #13
    def copyConstruct(self, other, **args) :

        forgetClassLabels = False
        if "patterns" in args:
            patterns = args['patterns']
            # if the patterns are ids (strings) convert them to indices:
            if type(patterns[0]) == type('') :
                idDict = misc.list2dict(patterns)
                patternsToCopy = [i for i in range(len(other))
                                  if other.labels.patternID[i] in idDict]
            else :
                patternsToCopy = patterns
        elif "classes" in args :
            patternsToCopy = [i for i in range(len(other))
                              if other.labels.L[i] in args["classes"]]
            forgetClassLabels = True
        elif "classID" in args :
            patternsToCopy = [i for i in range(len(other))
                              if other.labels.Y[i] in args["classID"]]
            forgetClassLabels = True
        else :
            patternsToCopy = range(len(other))

        self.setTrainingFunc(other.trainingFunc)
        self.setTestingFunc(other.testingFunc)

        deepcopy = True
        if 'deepcopy' in args : deepcopy = args['deepcopy']
        # class dependent copying of data:
        self.copy(other, patternsToCopy, deepcopy)

        self.attachKernel(other)
        self.attachLabels(Labels(other.labels,
                                 patterns = patternsToCopy,
                                 forgetClassLabels = forgetClassLabels))

        # copy the registered attribute:
        if hasattr(other, '_registeredAttributes') :
            self._registeredAttributes = other._registeredAttributes[:]
            self._actions = copy.deepcopy(other._actions)
            for attr in self._registeredAttributes :
                a = getattr(other, attr)
                if type(a) == type([]) :
                    if len(a) != len(other) :
                        raise ValueError, 'attribute has bad length'
                    #BaseDataSet.__setattr__(self, attr,
                    #                        [a[i] for i in patternsToCopy])
                    setattr(self, attr, [a[i] for i in patternsToCopy])
                elif hasattr(a, 'type') and a.type == 'dataset' and len(a) == len(self) :
                    acopy = a.__class__(a, patterns = patternsToCopy)
                    setattr(self, attr, acopy)
                else :
                    setattr(self, attr, a)
Example #14
    def addFeature(self, id, values) :

        hashID = hash(id)
        if hashID in self.featureKeyDict :
            raise ValueError, 'feature already exists, or hash problem'
        for i in range(len(self)) :
            if values[i] != 0 :
                self.X[i][hashID] = values[i]
            
        # update the featureKey, featureID attributes:
        pos = numpy.searchsorted(self.featureKey, hashID)
        self.featureKey.insert(pos, hashID)
        self.featureID.insert(pos, id)
        self.featureKeyDict = misc.list2dict(self.featureKey, range(len(self.featureKey)))
Example #15
    def addFeature(self, id, values):

        hashID = hash(id)
        if hashID in self.featureKeyDict:
            raise ValueError, 'feature already exists, or hash problem'
        for i in range(len(self)):
            if values[i] != 0:
                self.X[i][hashID] = values[i]

        # update the featureKey, featureID attributes:
        pos = numpy.searchsorted(self.featureKey, hashID)
        self.featureKey.insert(pos, hashID)
        self.featureID.insert(pos, id)
        self.featureKeyDict = misc.list2dict(self.featureKey,
                                             range(len(self.featureKey)))
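
Both addFeature variants keep featureKey sorted by feature hash, locating the insertion point with numpy.searchsorted so later lookups can use binary search. A self-contained sketch of the same insertion, with made-up hash values:

import numpy

featureKey = [11, 42, 97]             # sorted feature hashes (illustrative)
featureID = ['f11', 'f42', 'f97']
newKey, newID = 57, 'f57'
pos = int(numpy.searchsorted(featureKey, newKey))    # -> 2
featureKey.insert(pos, newKey)
featureID.insert(pos, newID)
# featureKey -> [11, 42, 57, 97]; featureID -> ['f11', 'f42', 'f57', 'f97']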
Example #16
def expandKernel(inKernelFile, referenceKernelFile, outKernelFile, **args) :

    """
    Given a kernel matrix that may be missing entries, fill the missing values
    with 0, using the patterns of a reference kernel (the reference kernel is
    checked to be sorted by pattern ID).
    
    :Parameters:
      - `inKernelFile` - input kernel file name
      - `referenceKernelFile` - file name for the reference kernel
      - `outKernelFile` - file name to output expanded kernel
    """

    if 'format' in args :
        format = args['format']
    else :
        format = 'gist'
    delim = '\t'

    from datafunc import KernelData
    import misc
    import numpy

    inKernel = KernelData(inKernelFile)
    refKernel = KernelData(referenceKernelFile)
    print 'loaded data'
    ids = refKernel.labels.patternID[:]
    ids.sort()
    if ids != refKernel.labels.patternID :
        raise ValueError, 'reference kernel not sorted'
    
    idDict = misc.list2dict(inKernel.labels.patternID, range(len(inKernel)))
    outKernel = open(outKernelFile, 'w')
    if format == 'gist' :
        outKernel.write(outKernelFile + delim)
        outKernel.write(delim.join(ids) + '\n')
    
    for i in range(len(refKernel)) :
        # one row per reference pattern; entries missing from the input kernel stay 0
        outKernel.write(ids[i] + delim)
        values = numpy.zeros(len(refKernel), numpy.float_)
        for j in range(len(refKernel)) :
            if ids[i] in idDict and ids[j] in idDict :
                values[j] = inKernel.kernel.eval(inKernel,
                                                 idDict[ids[i]], idDict[ids[j]])
        tokens = [str(value) for value in values]
        outKernel.write(delim.join(tokens) + '\n')
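
A hypothetical call for expandKernel, with placeholder file names; the expanded kernel is written in GIST format unless a different `format` keyword is passed.

expandKernel('kernel.partial.txt', 'kernel.reference.txt',
             'kernel.expanded.txt', format='gist')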
Example #17
    def constructFromFile(self, file_name, **args) :

        if 'data' not in args :
            raise ValueError, 'missing data object'
        self._data = args['data']

        id_dict = misc.list2dict(self._data.labels.patternID,
                                 range(len(self._data)))
        file_handle = open(file_name)
        L = []
        sets = []
        for line in file_handle :
            tokens = line.split()
            sets.append([id_dict[token] for token in tokens[:-1] ])
            L.append(tokens[-1])
        self.n = len(sets)
        self.callConstructor(len(sets))
        for s in sets :
            self.add(tuple(s))
        labels = Labels(L)
        WrapperDataSet.attachLabels(self, labels)
Example #18
    def updateFeatureDict(self, arg = None) :

        if arg.__class__ == self.__class__ :   
            # features were extended with those in another dataset
            other = arg
            self.featureID.extend(other.featureID)
        elif type(arg) == list :
            #features were eliminated:
            eliminated = misc.list2dict(arg)
            self.featureID = [self.featureID[i] for i in range(len(self.featureID))
                              if i not in eliminated]
        elif type(arg) == type(1) or type(arg) == type('') :
            # a feature was added
            id = arg
            self.featureID.append(id)
            self.featureDict[id] = self.numFeatures - 1
            return

        self.featureDict = {}
        for i in range(len(self.featureID)) :
            self.featureDict[self.featureID[i]] = i
Example #19
    def updateFeatureDict(self, arg = None) :
        
        if arg.__class__ == self.__class__ :
            other = arg
            self.featureID.extend(other.featureID)
            self.featureID.sort(cmp = lambda x,y : cmp(hash(x), hash(y)))
        elif type(arg) == list :
            #features were eliminated:
            eliminated = misc.list2dict(arg)
            self.featureID = [self.featureID[i] for i in range(len(self.featureID))
                              if i not in eliminated]
        elif type(arg) == type(1) or type(arg) == type('') :
            # a feature was added:
            id = arg
            self.featureID.append(id)
            self.featureID.sort(cmp = lambda x,y : cmp(hash(x), hash(y)))

        self.featureDict = {}
        self.featureKeyDict = {}
        for i in range(len(self.featureID)) :
            self.featureDict[self.featureID[i]] = i
            self.featureKeyDict[hash(self.featureID[i])] = i
Example #20
    def score2(self, data, **args):

        featuresPerForest = int(math.ceil(
            float(self.maxSize) / len(data))) - 10
        numForests = int(math.ceil(
            float(data.numFeatures) / featuresPerForest))
        perm = range(data.numFeatures)
        random.shuffle(perm)
        scores = numpy.zeros(data.numFeatures, numpy.float_)
        pvals = numpy.zeros(data.numFeatures, numpy.float_)
        featureIDdict = misc.list2dict(data.featureID, range(data.numFeatures))
        print 'numForests', numForests, featuresPerForest, data.numFeatures
        for i in range(numForests):
            print 'forest number', i + 1
            if i < numForests - 1:
                features = perm[featuresPerForest * i:featuresPerForest *
                                (i + 1)]
            else:
                features = perm[featuresPerForest * i:]
            subdata = data.__class__(data, 'deepcopy')
            subdata.keepFeatures(features)
            subscores = self.score(subdata, **args)
            for j in range(subdata.numFeatures):
                scores[featureIDdict[subdata.featureID[j]]] = subscores[j]
                pvals[featureIDdict[subdata.featureID[j]]] = self.pvals[j]
        # re-rank all the best features together:
        bestFeatures = numpy.argsort(scores)[:featuresPerForest]
        print featuresPerForest
        print 'length of best Features', len(bestFeatures)
        subdata = data.__class__(data, 'deepcopy')
        subdata.keepFeatures(bestFeatures)
        subscores = self.score(subdata, **args)
        for j in range(subdata.numFeatures):
            scores[featureIDdict[subdata.featureID[j]]] = subscores[j]
            pvals[featureIDdict[subdata.featureID[j]]] = self.pvals[j]

        self.pvals = pvals
        return scores
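
score2 above scores features in chunks ('forests') so that each sub-dataset stays within self.maxSize. The slicing arithmetic can be checked in isolation with made-up sizes:

import math
import random

numFeatures = 10
featuresPerForest = 4
numForests = int(math.ceil(float(numFeatures) / featuresPerForest))   # -> 3
perm = list(range(numFeatures))
random.shuffle(perm)
forests = []
for i in range(numForests):
    if i < numForests - 1:
        forests.append(perm[featuresPerForest * i:featuresPerForest * (i + 1)])
    else:
        forests.append(perm[featuresPerForest * i:])   # last forest takes the remainder
# forest sizes -> [4, 4, 2]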
Example #21
    def updateFeatureDict(self, arg=None):

        if arg.__class__ == self.__class__:
            # features were extended with those in another dataset
            other = arg
            self.featureID.extend(other.featureID)
        elif type(arg) == list:
            #features were eliminated:
            eliminated = misc.list2dict(arg)
            self.featureID = [
                self.featureID[i] for i in range(len(self.featureID))
                if i not in eliminated
            ]
        elif type(arg) == type(1) or type(arg) == type(''):
            # a feature was added
            id = arg
            self.featureID.append(id)
            self.featureDict[id] = self.numFeatures - 1
            return

        self.featureDict = {}
        for i in range(len(self.featureID)):
            self.featureDict[self.featureID[i]] = i
Example #22
    def score2(self, data, **args) :

        featuresPerForest = int(math.ceil(float(self.maxSize) / len(data))) - 10
        numForests = int(math.ceil(float(data.numFeatures) /
                                   featuresPerForest))
        perm = range(data.numFeatures)
        random.shuffle(perm)
        scores = numpy.zeros(data.numFeatures, numpy.float_)
        pvals = numpy.zeros(data.numFeatures, numpy.float_)
        featureIDdict = misc.list2dict(data.featureID, range(data.numFeatures))
        print 'numForests', numForests, featuresPerForest, data.numFeatures
        for i in range(numForests) :
            print 'forest number', i + 1
            if i < numForests - 1 :
                features = perm[featuresPerForest * i :
                                featuresPerForest * (i + 1)]
            else :
                features = perm[featuresPerForest * i : ]
            subdata = data.__class__(data, 'deepcopy')
            subdata.keepFeatures(features)
            subscores = self.score(subdata, **args)
            for j in range(subdata.numFeatures) :
                scores[featureIDdict[subdata.featureID[j]]] = subscores[j]
                pvals[featureIDdict[subdata.featureID[j]]] = self.pvals[j]
        # re-rank all the best features together:
        bestFeatures = numpy.argsort(scores)[:featuresPerForest]
        print featuresPerForest
        print 'length of best Features', len(bestFeatures)
        subdata = data.__class__(data, 'deepcopy')
        subdata.keepFeatures(bestFeatures)
        subscores = self.score(subdata, **args)
        for j in range(subdata.numFeatures) :
            scores[featureIDdict[subdata.featureID[j]]] = subscores[j]
            pvals[featureIDdict[subdata.featureID[j]]] = self.pvals[j]

        self.pvals = pvals
        return scores
Example #23
def commonKernel(kernelFile1, kernelFile2, kernelOutFileName1, kernelOutFileName2) :
    
    delim = ' '
    from datafunc import KernelData
    import misc
    kdata1 = KernelData(kernelFile1)
    kdata2 = KernelData(kernelFile2)
    print 'loaded data'
    ids = misc.intersect(kdata1.labels.patternID, kdata2.labels.patternID)
    ids.sort()
    idDict1 = misc.list2dict(ids)

    if len(ids) != len(kdata1) :
        kernelOutFile1 = open(kernelOutFileName1, 'w')
        idDict = {}
        for i in range(len(kdata1)) :
            if kdata1.labels.patternID[i] in idDict1 :
                idDict[kdata1.labels.patternID[i]] = i
        for id1 in ids :
            print id1
            kernelOutFile1.write(id1 + delim)
            tokens = [str(kdata1.kernel.eval(kdata1, idDict[id1], idDict[id2]))
                      for id2 in ids]
            kernelOutFile1.write(delim.join(tokens) + '\n')
            
    if len(ids) != len(kdata2) :
        kernelOutFile2 = open(kernelOutFileName2, 'w')
        idDict = {}
        for i in range(len(kdata2)) :
            if kdata2.labels.patternID[i] in idDict1 :
                idDict[kdata2.labels.patternID[i]] = i
        for id1 in ids :
            print id1
            kernelOutFile2.write(id1 + delim)
            tokens = [str(kdata2.kernel.eval(kdata2, idDict[id1], idDict[id2]))
                      for id2 in ids]
            kernelOutFile2.write(delim.join(tokens) + '\n')
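
commonKernel relies on misc.intersect to find the pattern IDs shared by the two kernels; a stand-in built from Python sets (an assumption about what intersect returns) yields the same sorted list of shared IDs:

ids1 = ['p1', 'p2', 'p3', 'p5']
ids2 = ['p2', 'p5', 'p7']
ids = sorted(set(ids1) & set(ids2))             # -> ['p2', 'p5']
idDict1 = dict((i, None) for i in ids)          # membership dict, as list2dict is used above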
Example #24
    def updateFeatureDict(self, arg=None):

        if arg.__class__ == self.__class__:
            other = arg
            self.featureID.extend(other.featureID)
            self.featureID.sort(cmp=lambda x, y: cmp(hash(x), hash(y)))
        elif type(arg) == list:
            #features were eliminated:
            eliminated = misc.list2dict(arg)
            self.featureID = [
                self.featureID[i] for i in range(len(self.featureID))
                if i not in eliminated
            ]
        elif type(arg) == type(1) or type(arg) == type(''):
            # a feature was added:
            id = arg
            self.featureID.append(id)
            self.featureID.sort(cmp=lambda x, y: cmp(hash(x), hash(y)))

        self.featureDict = {}
        self.featureKeyDict = {}
        for i in range(len(self.featureID)):
            self.featureDict[self.featureID[i]] = i
            self.featureKeyDict[hash(self.featureID[i])] = i
Example #25
def stratifiedCV(classifier, data, numFolds = 5, **args) :
    """perform k-fold stratified cross-validation; in each fold the number of
    patterns from each class is proportional to the relative fraction of the
    class in the dataset

    :Parameters:
      - `classifier` - a classifier template
      - `data` - a dataset
      - `numFolds` - number of cross validation folds (default = 5)
      
    :Returns:
      a Results object.

    :Keywords:
      - `numFolds` - number of cross-validation folds -- overrides the numFolds parameter
      - `seed` - random number generator seed
      - `trainingAllFolds` - a list of patterns that are to be used as training
        examples in all CV folds.
      - `intermediateFile` - a file name to save intermediate results under;
        if this argument is not given, no intermediate results are saved
      - `foldsToPerform` - number of folds to actually perform (useful when doing
        n-fold CV and you want to save time by running only some of the folds)
    """

    if 'numFolds' in args :
        numFolds = args['numFolds']
    if 'seed' in args :
        random.seed(args['seed'])
    if 'trainingAllFolds' in args :
        trainingAllFolds = args['trainingAllFolds']
    else :
        trainingAllFolds = []
    foldsToPerform = numFolds
    if 'foldsToPerform' in args :
        foldsToPerform = args['foldsToPerform']
    if foldsToPerform > numFolds :
        raise ValueError, 'foldsToPerform > numFolds'

    trainingAllFoldsDict = misc.list2dict(trainingAllFolds)

    labels = data.labels
    p = [[] for i in range(labels.numClasses)] 
    classFoldSize = [int(labels.classSize[k] / numFolds) for k in range(labels.numClasses)]

    for i in range(len(data)):
        if i not in trainingAllFoldsDict :
            p[labels.Y[i]].append(i)
    for k in range(labels.numClasses):
        random.shuffle(p[k])

    trainingPatterns = [[] for i in range(foldsToPerform)]
    testingPatterns = [[] for i in range(foldsToPerform)]
    for fold in range(foldsToPerform) :
        for k in range(labels.numClasses) :
            classFoldStart = classFoldSize[k] * fold
            if fold < numFolds-1:
                classFoldEnd = classFoldSize[k] * (fold + 1)
            else:
                classFoldEnd = labels.classSize[k]
            testingPatterns[fold].extend(p[k][classFoldStart:classFoldEnd])
            if fold > 0:
                trainingPatterns[fold].extend(p[k][0:classFoldStart] +
                                              p[k][classFoldEnd:labels.classSize[k]])
            else:
                trainingPatterns[fold].extend(p[k][classFoldEnd:labels.classSize[k]])

    if len(trainingPatterns) > 0 :
        for fold in range(len(trainingPatterns)) :
            trainingPatterns[fold].extend(trainingAllFolds)
        
    return cvFromFolds(classifier, data, trainingPatterns, testingPatterns, **args)
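
A hypothetical call, assuming a PyML dataset `data` and a classifier template `clf` have been constructed elsewhere; only keywords documented above are used.

# 'clf' and 'data' are placeholders for objects built with the PyML API.
results = stratifiedCV(clf, data, numFolds=10, seed=1,
                       foldsToPerform=3)    # run only the first 3 of the 10 folds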
Example #26
    def save(self, fileName, **args):
        """save a dataset to a file (does not use pickle!)

        :Parameters:
          - `fileName` - a file name or a file handle

        :Keywords:
          - `format` - 'csv' or 'sparse'; by default format is chosen by the
            type of the dataset -- sparse containers save in sparse format
            and non-sparse containers in csv format.
          - `delimiter` - which delimiter to use when saving in csv format
          - `patterns` - save only those patterns whose indices are given
          - `ids` - save only those patterns whose pattern IDs are given
          - `sortByID` - whether to sort the lines according to the pattern ID
            (default = False)
          - `sortByLabel` - whether to sort the lines according to the class label
            (default = False)
        """

        print 'saving to ', fileName
        if type(fileName) == type(''):
            fileHandle = open(fileName, 'w')
        else:
            fileHandle = fileName

        L = self.labels.L

        if self.__class__.__name__.lower().find('sparse') >= 0:
            format = 'sparse'
        else:
            format = 'csv'
        print 'detected file format as:', format
        if 'format' in args:
            format = args['format']
        if 'delimiter' in args:
            delim = args['delimiter']
        else:
            delim = ','
        if 'patterns' in args:
            patterns = args['patterns']
        else:
            patterns = range(len(self))
        if 'ids' in args:
            idDict = misc.list2dict(args['ids'])
            patterns = [
                i for i in range(len(self))
                if self.labels.patternID[i] in idDict
            ]
        if 'sortByID' in args and args['sortByID']:
            ids = self.labels.patternID[:]
            ids.sort()
            idMap = misc.list2dict(self.labels.patternID, range(len(self)))
            idDict = misc.list2dict(patterns)
            patterns = [idMap[id] for id in ids if idMap[id] in idDict]
        if 'sortByLabel' in args and args['sortByLabel']:
            y = self.labels.Y[:]
            patterns = numpy.argsort(self.labels.Y)

        if format == 'csv':
            if L is None:
                labels = ''
            else:
                labels = 'labels' + delim
            fileHandle.write('#' + 'patternID' + delim + labels +
                             delim.join(self.featureID) + '\n')
        for i in patterns:
            x = self.getPattern(i)
            if format == 'sparse':
                if self.labels.patternID is not None:
                    fileHandle.write(str(self.labels.patternID[i]) + ',')
                if L is not None:
                    if type(L[i]) == type([]):
                        fileHandle.write(';'.join(L[i]) + ' ')
                    else:
                        fileHandle.write(str(L[i]) + ' ')
                if type(x) == type({}):
                    tokens = [
                        self.featureID[self.featureKeyDict[key]] + ':' +
                        str(x[key]) for key in x
                    ]
                else:
                    tokens = [
                        self.featureID[i] + ':' + str(x[i])
                        for i in range(self.numFeatures) if x[i] != 0
                    ]
                fileHandle.write(' '.join(tokens) + '\n')
            else:
                if self.labels.patternID is not None:
                    fileHandle.write(str(self.labels.patternID[i]) + delim)
                if L is not None:
                    if type(L[i]) == type([]):
                        fileHandle.write(';'.join(L[i]) + delim)
                    else:
                        fileHandle.write(L[i] + delim)
                if type(x) == type({}):
                    tokens = [
                        str(x.get(self.featureKey[i], 0))
                        for i in range(self.numFeatures)
                    ]
                else:
                    tokens = [str(val) for val in x]
                fileHandle.write(delim.join(tokens) + '\n')
        fileHandle.close()
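
A hypothetical call using only the keywords documented in the docstring above, assuming `data` is a PyML dataset container with pattern IDs attached.

# 'data' is assumed to exist; the file name is illustrative.
data.save('dataset_sorted.csv', format='csv', delimiter=',', sortByID=True)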
Example #27
def stratifiedCV(classifier, data, numFolds=5, **args):
    """perform k-fold stratified cross-validation; in each fold the number of
    patterns from each class is proportional to the relative fraction of the
    class in the dataset

    :Parameters:
      - `classifier` - a classifier template
      - `data` - a dataset
      - `numFolds` - number of cross validation folds (default = 5)
      
    :Returns:
      a Results object.

    :Keywords:
      - `numFolds` - number of cross-validation folds -- overrides the numFolds parameter
      - `seed` - random number generator seed
      - `trainingAllFolds` - a list of patterns that are to be used as training
        examples in all CV folds.
      - `intermediateFile` - a file name to save intermediate results under;
        if this argument is not given, no intermediate results are saved
      - `foldsToPerform` - number of folds to actually perform (useful when doing
        n-fold CV and you want to save time by running only some of the folds)
    """

    if 'numFolds' in args:
        numFolds = args['numFolds']
    if 'seed' in args:
        random.seed(args['seed'])
    if 'trainingAllFolds' in args:
        trainingAllFolds = args['trainingAllFolds']
    else:
        trainingAllFolds = []
    foldsToPerform = numFolds
    if 'foldsToPerform' in args:
        foldsToPerform = args['foldsToPerform']
    if foldsToPerform > numFolds:
        raise ValueError, 'foldsToPerform > numFolds'

    trainingAllFoldsDict = misc.list2dict(trainingAllFolds)

    labels = data.labels
    p = [[] for i in range(labels.numClasses)]
    classFoldSize = [
        int(labels.classSize[k] / numFolds) for k in range(labels.numClasses)
    ]

    for i in range(len(data)):
        if i not in trainingAllFoldsDict:
            p[labels.Y[i]].append(i)
    for k in range(labels.numClasses):
        random.shuffle(p[k])

    trainingPatterns = [[] for i in range(foldsToPerform)]
    testingPatterns = [[] for i in range(foldsToPerform)]
    for fold in range(foldsToPerform):
        for k in range(labels.numClasses):
            classFoldStart = classFoldSize[k] * fold
            if fold < numFolds - 1:
                classFoldEnd = classFoldSize[k] * (fold + 1)
            else:
                classFoldEnd = labels.classSize[k]
            testingPatterns[fold].extend(p[k][classFoldStart:classFoldEnd])
            if fold > 0:
                trainingPatterns[fold].extend(
                    p[k][0:classFoldStart] +
                    p[k][classFoldEnd:labels.classSize[k]])
            else:
                trainingPatterns[fold].extend(
                    p[k][classFoldEnd:labels.classSize[k]])

    if len(trainingPatterns) > 0:
        for fold in range(len(trainingPatterns)):
            trainingPatterns[fold].extend(trainingAllFolds)

    return cvFromFolds(classifier, data, trainingPatterns, testingPatterns,
                       **args)
Example #28
    def save(self, fileName, **args) :
        """save a dataset to a file (does not use pickle!)

        :Parameters:
          - `fileName` - a file name or a file handle

        :Keywords:
          - `format` - 'csv' or 'sparse'; by default format is chosen by the
            type of the dataset -- sparse containers save in sparse format
            and non-sparse containers in csv format.
          - `delimiter` - which delimiter to use when saving in csv format
          - `patterns` - save only those patterns whose indices are given
          - `ids` - save only those patterns whose pattern IDs are given
          - `sortByID` - whether to sort the lines according to the pattern ID
            (default = False)
          - `sortByLabel` - whether to sort the lines according to the class label
            (default = False)
        """

        print 'saving to ', fileName
        if type(fileName) == type('') :
            fileHandle = open(fileName, 'w')
        else :
            fileHandle = fileName

        L = self.labels.L

        if self.__class__.__name__.lower().find('sparse') >= 0 :
            format = 'sparse'
        else :
            format = 'csv'
        print 'detected file format as:', format
        if 'format' in args :
            format = args['format']
        if 'delimiter' in args :
            delim = args['delimiter']
        else :
            delim = ','
        if 'patterns' in args :
            patterns = args['patterns']
        else :
            patterns = range(len(self))
        if 'ids' in args :
            idDict = misc.list2dict(args['ids'])
            patterns = [i for i in range(len(self))
                        if self.labels.patternID[i] in idDict]
        if 'sortByID' in args and args['sortByID'] :
            ids = self.labels.patternID[:]
            ids.sort()
            idMap = misc.list2dict(self.labels.patternID, range(len(self)))
            idDict = misc.list2dict(patterns)
            patterns = [idMap[id] for id in ids
                        if idMap[id] in idDict]
        if 'sortByLabel' in args and args['sortByLabel'] :
            y = self.labels.Y[:]
            patterns = numpy.argsort(self.labels.Y)

        if format == 'csv' :
            if L is None :
                labels = ''
            else :
                labels = 'labels' + delim
            fileHandle.write('#' + 'patternID' + delim + labels + 
                             delim.join(self.featureID) + '\n')
        for i in patterns :
            x = self.getPattern(i)
            if format == 'sparse' :
                if self.labels.patternID is not None :
                    fileHandle.write(str(self.labels.patternID[i]) + ',')
                if L is not None :
                    if type(L[i]) == type([]) :
                        fileHandle.write(';'.join(L[i]) + ' ')
                    else :
                        fileHandle.write(str(L[i]) + ' ')
                if type(x) == type({}) :
                    tokens = [self.featureID[self.featureKeyDict[key]]+':'+
                              str(x[key]) for key in x]
                else :
                    tokens = [self.featureID[i] + ':' + str(x[i])
                              for i in range(self.numFeatures)
                              if x[i] != 0]
                fileHandle.write(' '.join(tokens) + '\n')
            else :
                if self.labels.patternID is not None :
                    fileHandle.write(str(self.labels.patternID[i]) + delim)
                if L is not None :
                    if type(L[i]) == type([]) :
                        fileHandle.write(';'.join(L[i]) + delim)
                    else :
                        fileHandle.write(L[i] + delim)
                if type(x) == type({}) :
                    tokens = [str(x.get(self.featureKey[i],0))
                              for i in range(self.numFeatures)]
                else :
                    tokens = [str(val) for val in x]
                fileHandle.write(delim.join(tokens) + '\n')
        fileHandle.close()