Exemplo n.º 1
0
 def nodeAsClf(self, term):
     clf = loadClf(self.ontology[term]['name'], self.fold, self.clfName)
     #clf.predict_proba = lambda *a: self.predictions[term] #HACK
     #clf.__dict__['predict_proba'] = lambda *a: self.predictions[term] #HACK
     class FakeClassifier:
         def predict_proba(*a): 
             return self.predictions[term]
     for a in ('X_test','y_test',):
     #    if a != 'predict_proba':
         setattr(FakeClassifier, a, getattr(clf, a))
     return FakeClassifier()
Exemplo n.º 2
0
    def nodeAsClf(self, term):
        clf = loadClf(self.ontology[term]['name'], self.fold, self.clfName)

        #clf.predict_proba = lambda *a: self.predictions[term] #HACK
        #clf.__dict__['predict_proba'] = lambda *a: self.predictions[term] #HACK
        class FakeClassifier:
            def predict_proba(*a):
                return self.predictions[term]

        for a in (
                'X_test',
                'y_test',
        ):
            #    if a != 'predict_proba':
            setattr(FakeClassifier, a, getattr(clf, a))
        return FakeClassifier()
Exemplo n.º 3
0
def plotRoc(term, clfName, title, clfs=None):

    mean_tpr = 0.0
    mean_fpr = numpy.linspace(0, 1, 100)
    all_tpr = []
    #plt.clf()
    if clfs is None:
        clfs = (loadClf(term, fold, clfName) for fold in range(NUM_FOLDS))
    for i, clf in enumerate(clfs):
        print("roc", clf)
        #for i, (clf, X_train, y_train, X_test, y_test, X_validation, y_validation,_,_,_) in enumerate(folds):

        probabs = clf.predict_proba(clf.X_test)
        try:
            fpr, tpr, _ = roc_curve(clf.y_test, probabs[:, 1])
        except IndexError:
            fpr, tpr, _ = roc_curve(clf.y_test,
                                    probabs[:, 0],
                                    pos_label=POSTIVE_LABEL)
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        fpr = numpy.nan_to_num(fpr)
        try:
            roc_auc = auc(fpr, tpr)
        except ValueError:  # root node
            roc_auc = 1.

        clf.roc_auc = roc_auc
        plt.plot(fpr, tpr, lw=1, label='Fold %d, AUC = %0.2f' % (i, roc_auc))

    mean_tpr /= NUM_FOLDS
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr,
             mean_tpr,
             'k--',
             label='Mean, AUC = %0.2f' % mean_auc,
             lw=2)

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic: ' + title)
    plt.legend(loc="lower right", prop=legendprop)
Exemplo n.º 4
0
def plotRoc(term, clfName, title, clfs = None):

    mean_tpr = 0.0
    mean_fpr = numpy.linspace(0, 1, 100)
    all_tpr = []
    #plt.clf()
    if clfs is None:
        clfs = (loadClf(term, fold, clfName) for fold in range(NUM_FOLDS))
    for i,clf in enumerate(clfs):
        print("roc", clf)
    #for i, (clf, X_train, y_train, X_test, y_test, X_validation, y_validation,_,_,_) in enumerate(folds):

        probabs = clf.predict_proba(clf.X_test)
        try:
            fpr, tpr, _ = roc_curve(clf.y_test, probabs[:, 1])
        except IndexError:
            fpr, tpr, _ = roc_curve(clf.y_test, probabs[:, 0], pos_label=POSTIVE_LABEL)
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        fpr = numpy.nan_to_num(fpr)
        try:
            roc_auc = auc(fpr, tpr)
        except ValueError: # root node
            roc_auc = 1.

        clf.roc_auc = roc_auc
        plt.plot(fpr, tpr, lw=1, label='Fold %d, AUC = %0.2f' % (i, roc_auc))

    mean_tpr /= NUM_FOLDS
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--',
            label='Mean, AUC = %0.2f' % mean_auc, lw=2)

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic: '+title)
    plt.legend(loc="lower right", prop=legendprop)
Exemplo n.º 5
0
    def generateCPD(self, term): #, X_train, y_train, X_test, y_test, X_validation, y_validation, g_train, g_test, g_validation):
        
        clf = loadClf(self.ontology[term]['name'], self.fold, self.clfName)
        posTrain = sum(clf.y_train == POSTIVE_LABEL)
        negTrain = sum(clf.y_train == NEGATIVE_LABEL)
        totalTrain = posTrain + negTrain

        children = sorted(self.ontology[term]['children'])
        parents = sorted(self.ontology[term]['parents'])


        labels = {l : PRIOR for l in product(*((POSTIVE_LABEL,NEGATIVE_LABEL),)*(len(children)+1))}
        if children:
            childNodes = [self.ontology[child]['node'][self.fold][self.clfName] for child in children]
            for gene,y in zip(clf.g_train, clf.y_train):
                event = []
                for child in children:
                    event.append(POSTIVE_LABEL if gene in self.ontology.associations[child] else NEGATIVE_LABEL)
                event.append(POSTIVE_LABEL if gene in self.ontology.associations[term] else NEGATIVE_LABEL)
                assert (gene in self.ontology.associations[term]) == (y == POSTIVE_LABEL)
                event = tuple(event)

                labels[event] += 1
            def countBoth(event):
                return labels[event[:-1]+(POSTIVE_LABEL,)] + labels[event[:-1]+(NEGATIVE_LABEL,)]
            cprior = PRIOR * (2 ** len(children))
            
            types = [Mixture]*(len(children)-1) + [Categorical]
            mixparams = [i for s in zip(childNodes, types) for i in s]
            cpd = numpy.empty((2,)*(len(children)+1))
            for event, counted in labels.items():
                v=cpd
                for b in event[:-1]:
                    v = v[b]
                    
                hid = event[-1]
                print("Event: ", event)
                if POSTIVE_LABEL not in event[:-1]: # Všichni potomci označeni "ne"
                    v[hid] = counted/countBoth(event)
                    print("Stored %d / %d" % (counted,countBoth(event)))
                else:
                    v[hid] = {POSTIVE_LABEL: 0.99, NEGATIVE_LABEL:0.01}[hid]
                    print("Stored %d : %d" % (hid, v[hid]))


            #print(term,"<-",",".join(children))
            print(cpd)
            #print(labels)

            hidden = Mixture(*mixparams, cpd)
            hidden.params = cpd
            

        else: #No children
            #hidden = DiscreteDistribution({'0': posTrain / totalTrain, '1': negTrain / totalTrain})
            params = (posTrain / totalTrain, negTrain / totalTrain)
            hidden = Categorical(params)
            hidden.params = params

        #print("Hidden node %s:" % term)
        #print(repr(hidden))
        #print([p for p in hidden.parents if isinstance(p, Stochastic)])
        #print(hidden.get_moments())

        conf = clf.conf + PRIOR
        #posTest, negTest = numpy.sum(conf, 1) 
        posTest, negTest = numpy.sum(conf, 0) 
        #print("Confusion matrix:")
        #print(conf)
       
        try:
            assert term != self.ontology.root
            pos_decisions = clf.decision_function(clf.X_test[clf.y_test==POSTIVE_LABEL])
            neg_decisions = clf.decision_function(clf.X_test[clf.y_test==NEGATIVE_LABEL])
            means = [numpy.mean(pos_decisions)], [numpy.mean(neg_decisions)]
            maxprec = 100.0
            precs = [[numpy.min((1/numpy.var(pos_decisions), maxprec))]], [[numpy.min((1/numpy.var(neg_decisions), maxprec))]]
        #else:
        except (ValueError, AssertionError):
            means = [-1.], [1.]
            precs = [[1.]], [[1.]]
        print("Gaussian params:", term, self.ontology[term]['name'], means, precs)
        observed = Mixture(hidden, Gaussian, means, precs)
        #observed = ConditionalProbabilityTable([
        #        ['0', '0', conf[0][0] / posTest], # if term != root else 1.],
        #        ['0', '1', conf[0][1] / posTest], # if term != root else 0.],
        #        ['1', '0', conf[1][0] / negTest], # if term != root else 0.],
        #        ['1', '1', conf[1][1] / negTest]], #if term != root else 1.]],
        #    [hidden.distribution])
        #print("Observed node %s - %s:" % (term, self.ontology[term]['name']))
        #print(repr(observed))
        #print([p for p in observed.parents if isinstance(p, Stochastic)])

        self.ontology[term]['node'][self.fold][self.clfName] = hidden
        #self.ontology[term]['clf'][self.fold][self.clfName] = clf, X_validation, y_validation, g_validation
        assert self.lenValidation is None or self.lenValidation == len(clf.y_validation)
        self.lenValidation = len(clf.y_validation)
        self.allobserved[term] = observed
        self.allhidden[term] = hidden
        self.extranodes.update((p for p in hidden.parents if isinstance(p, Stochastic)))
Exemplo n.º 6
0
    def predict(self):
        #print(self.network.graph)
        self.predictions = {term : numpy.empty((self.lenValidation, 2), dtype=float) for term in self.ontology.ontology}

        classifiers = {
                term : lambda: loadClf(self.ontology[term]['name'], self.fold, self.clfName) #self.ontology[term]['clf'][self.fold][self.clfName]
                for term
                in self.ontology.ontology
                }
        #for term, (clf,X,y,g) in classifiers.items():
        #    print(term, ":", repr(self.clfName), repr(clf.name), self.fold, clf.fold)

        observations = {
                term : clf.decision_function(clf.X_validation) if term != self.ontology.root else numpy.array([-1.]*len(clf.X_validation))
                for term, clff #(clf, X, y, g)
                in classifiers.items()
                for clf in (clff(),)
                }
        #print("observations:")
        #print(observations)
        gt = {
                term : clf().y_validation
                for term, clf
                in classifiers.items()}
        #print("gt:")
        #print(gt)

        for i in range(self.lenValidation):
            observation = {term : pred[i] for term, pred in observations.items()}
            #print("Observation for gene %d" % i)
            #print(observation)
            #print(self.network.forward_backward(observation))
            hidden, observed, extra = self.getCopy()
            #print(hidden)
            #print(observed)
            #print(observation)
            #for term, node in hidden.items():
            #    print(i, term, node.get_moments()[0])
            for k,v in observation.items():
                observed[k].observe((v,))
            #    print("%s observes %s" % (k, v))
            allv = (*hidden.values(), *observed.values(), *extra)
            #print([(x, [p for p in x.parents if isinstance(p, Stochastic)]) for x in [*hidden.values(), *observed.values()]])
            Q = VB(*allv)
            Q.update(*allv, tol=1e-7, repeat=1000, verbose=True)
            #print("---")

            for term, node in hidden.items():
                #print(i, term, node.get_moments()[0])
                self.predictions[term][i,:] = node.get_moments()[0]

        #print("predictions:")
        #print(self.predictions)
        for term in observations:
                compare = numpy.empty((len(gt[term]),4), dtype=float)
                compare[:,0] = gt[term]
                compare[:,1] = observations[term]
                compare[:,2] = self.predictions[term][:,1]
                compare[:,3] = numpy.round(self.predictions[term][:,1])
                print(term, self.ontology[term]['name'])
                print(compare)
Exemplo n.º 7
0
    def generateCPD(
        self, term
    ):  #, X_train, y_train, X_test, y_test, X_validation, y_validation, g_train, g_test, g_validation):

        clf = loadClf(self.ontology[term]['name'], self.fold, self.clfName)
        posTrain = sum(clf.y_train == POSTIVE_LABEL)
        negTrain = sum(clf.y_train == NEGATIVE_LABEL)
        totalTrain = posTrain + negTrain

        children = sorted(self.ontology[term]['children'])
        parents = sorted(self.ontology[term]['parents'])

        labels = {
            l: PRIOR
            for l in product(*((POSTIVE_LABEL, NEGATIVE_LABEL), ) *
                             (len(children) + 1))
        }
        if children:
            childNodes = [
                self.ontology[child]['node'][self.fold][self.clfName]
                for child in children
            ]
            for gene, y in zip(clf.g_train, clf.y_train):
                event = []
                for child in children:
                    event.append(POSTIVE_LABEL if gene in self.ontology.
                                 associations[child] else NEGATIVE_LABEL)
                event.append(POSTIVE_LABEL if gene in self.ontology.
                             associations[term] else NEGATIVE_LABEL)
                assert (gene in self.ontology.associations[term]) == (
                    y == POSTIVE_LABEL)
                event = tuple(event)

                labels[event] += 1

            def countBoth(event):
                return labels[event[:-1] +
                              (POSTIVE_LABEL, )] + labels[event[:-1] +
                                                          (NEGATIVE_LABEL, )]

            cprior = PRIOR * (2**len(children))

            types = [Mixture] * (len(children) - 1) + [Categorical]
            mixparams = [i for s in zip(childNodes, types) for i in s]
            cpd = numpy.empty((2, ) * (len(children) + 1))
            for event, counted in labels.items():
                v = cpd
                for b in event[:-1]:
                    v = v[b]

                hid = event[-1]
                print("Event: ", event)
                if POSTIVE_LABEL not in event[:
                                              -1]:  # Všichni potomci označeni "ne"
                    v[hid] = counted / countBoth(event)
                    print("Stored %d / %d" % (counted, countBoth(event)))
                else:
                    v[hid] = {POSTIVE_LABEL: 0.99, NEGATIVE_LABEL: 0.01}[hid]
                    print("Stored %d : %d" % (hid, v[hid]))

            #print(term,"<-",",".join(children))
            print(cpd)
            #print(labels)

            hidden = Mixture(*mixparams, cpd)
            hidden.params = cpd

        else:  #No children
            #hidden = DiscreteDistribution({'0': posTrain / totalTrain, '1': negTrain / totalTrain})
            params = (posTrain / totalTrain, negTrain / totalTrain)
            hidden = Categorical(params)
            hidden.params = params

        #print("Hidden node %s:" % term)
        #print(repr(hidden))
        #print([p for p in hidden.parents if isinstance(p, Stochastic)])
        #print(hidden.get_moments())

        conf = clf.conf + PRIOR
        #posTest, negTest = numpy.sum(conf, 1)
        posTest, negTest = numpy.sum(conf, 0)
        #print("Confusion matrix:")
        #print(conf)

        try:
            assert term != self.ontology.root
            pos_decisions = clf.decision_function(
                clf.X_test[clf.y_test == POSTIVE_LABEL])
            neg_decisions = clf.decision_function(
                clf.X_test[clf.y_test == NEGATIVE_LABEL])
            means = [numpy.mean(pos_decisions)], [numpy.mean(neg_decisions)]
            maxprec = 100.0
            precs = [[numpy.min((1 / numpy.var(pos_decisions), maxprec))]
                     ], [[numpy.min((1 / numpy.var(neg_decisions), maxprec))]]
        #else:
        except (ValueError, AssertionError):
            means = [-1.], [1.]
            precs = [[1.]], [[1.]]
        print("Gaussian params:", term, self.ontology[term]['name'], means,
              precs)
        observed = Mixture(hidden, Gaussian, means, precs)
        #observed = ConditionalProbabilityTable([
        #        ['0', '0', conf[0][0] / posTest], # if term != root else 1.],
        #        ['0', '1', conf[0][1] / posTest], # if term != root else 0.],
        #        ['1', '0', conf[1][0] / negTest], # if term != root else 0.],
        #        ['1', '1', conf[1][1] / negTest]], #if term != root else 1.]],
        #    [hidden.distribution])
        #print("Observed node %s - %s:" % (term, self.ontology[term]['name']))
        #print(repr(observed))
        #print([p for p in observed.parents if isinstance(p, Stochastic)])

        self.ontology[term]['node'][self.fold][self.clfName] = hidden
        #self.ontology[term]['clf'][self.fold][self.clfName] = clf, X_validation, y_validation, g_validation
        assert self.lenValidation is None or self.lenValidation == len(
            clf.y_validation)
        self.lenValidation = len(clf.y_validation)
        self.allobserved[term] = observed
        self.allhidden[term] = hidden
        self.extranodes.update(
            (p for p in hidden.parents if isinstance(p, Stochastic)))
Exemplo n.º 8
0
    def predict(self):
        #print(self.network.graph)
        self.predictions = {
            term: numpy.empty((self.lenValidation, 2), dtype=float)
            for term in self.ontology.ontology
        }

        classifiers = {
            term: lambda:
            loadClf(self.ontology[term]['name'], self.fold, self.clfName
                    )  #self.ontology[term]['clf'][self.fold][self.clfName]
            for term in self.ontology.ontology
        }
        #for term, (clf,X,y,g) in classifiers.items():
        #    print(term, ":", repr(self.clfName), repr(clf.name), self.fold, clf.fold)

        observations = {
            term: clf.decision_function(clf.X_validation) if
            term != self.ontology.root else numpy.array([-1.] *
                                                        len(clf.X_validation))
            for term, clff  #(clf, X, y, g)
            in classifiers.items() for clf in (clff(), )
        }
        #print("observations:")
        #print(observations)
        gt = {term: clf().y_validation for term, clf in classifiers.items()}
        #print("gt:")
        #print(gt)

        for i in range(self.lenValidation):
            observation = {
                term: pred[i]
                for term, pred in observations.items()
            }
            #print("Observation for gene %d" % i)
            #print(observation)
            #print(self.network.forward_backward(observation))
            hidden, observed, extra = self.getCopy()
            #print(hidden)
            #print(observed)
            #print(observation)
            #for term, node in hidden.items():
            #    print(i, term, node.get_moments()[0])
            for k, v in observation.items():
                observed[k].observe((v, ))
            #    print("%s observes %s" % (k, v))
            allv = (*hidden.values(), *observed.values(), *extra)
            #print([(x, [p for p in x.parents if isinstance(p, Stochastic)]) for x in [*hidden.values(), *observed.values()]])
            Q = VB(*allv)
            Q.update(*allv, tol=1e-7, repeat=1000, verbose=True)
            #print("---")

            for term, node in hidden.items():
                #print(i, term, node.get_moments()[0])
                self.predictions[term][i, :] = node.get_moments()[0]

        #print("predictions:")
        #print(self.predictions)
        for term in observations:
            compare = numpy.empty((len(gt[term]), 4), dtype=float)
            compare[:, 0] = gt[term]
            compare[:, 1] = observations[term]
            compare[:, 2] = self.predictions[term][:, 1]
            compare[:, 3] = numpy.round(self.predictions[term][:, 1])
            print(term, self.ontology[term]['name'])
            print(compare)