Пример #1
0
def extractfeatures(AIlist, pfds):
    """
    Pre-extract, on every pfd, the features required by a list of AIs.

    Doing the extraction once up front is meant to reduce disk i/o and
    calls to pfd.dedisperse().  The period rating ({'ratings': ['period']})
    is always requested in addition to the classifiers' own features.

    Args:
        AIlist : list of AIs (eg. combinedAI.list_of_AIs); each one exposes
            a `feature` dict whose values can be converted with int()
        pfds : list of pfds (class pfdreader); every entry is replaced in
            place by the object handed back by threadit

    Returns:
        None

    Raises:
        ZeroDivisionError : if the result for one of the pfds is None
    """
    if len(pfds) == 0:
        # nothing to extract from; indexing pfds[0] below would otherwise
        # fail with an IndexError on empty input
        return

    # determine features to extract from pfd
    features = {}
    vargf = [{'ratings': ['period']}]  # auto extract P0
    items = []
    for clf in AIlist:
        items.extend(clf.feature.items())

    # features are tracked as 'name:value' keys; only the first pfd's record
    # is consulted to decide what is still missing
    wanted = set('%s:%s' % (f, v) for f, v in items)
    already = set(pfds[0].extracted_feature.keys())
    for key in wanted - already:
        f, v = key.split(':')
        vargf.append({f: int(v)})

    def getfeature(pfd):
        pfd.getdata(*vargf, **features)
        return pfd

    # vargf always holds at least the period rating, so extraction always runs
    resultdict = threadit(getfeature, [[p] for p in pfds])
    for n, pfd in resultdict.items():
        if pfd is None:
            print('ZeroDivisionError:  %s' % (pfds[n].pfdfile,))
            raise ZeroDivisionError(
                'feature extraction returned nothing for %s'
                % (pfds[n].pfdfile,))
        pfds[n] = pfd
Пример #2
0
    def extractfeatures(self, clf):
        """
        Extract, on every pfd in self.pfds, the features that `clf` needs
        and that are not yet recorded in self.extracted_feature.

        Args:
            clf : a list of classifiers, an object with a `list_of_AIs`
                attribute, or a single classifier with a `feature` attribute

        Raises:
            MyError : if `clf` is none of the above
        """
        if isinstance(clf, list):
            AIlist = clf
        elif 'list_of_AIs' in clf.__dict__:
            AIlist = clf.list_of_AIs
        elif 'feature' in clf.__dict__:
            AIlist = [clf]
        else:
            raise MyError

        features = {}
        items = []
        for ai in AIlist:
            items.extend(ai.feature.items())

        # This method records extracted features as single-entry dicts
        # ({name: value}).  Testing only the raw (name, value) tuple never
        # matches those records, so every call used to re-extract and
        # re-append everything; both forms are checked here.
        vargf = []
        for item in set(items):
            entry = dict([item])
            if (item not in self.extracted_feature
                    and entry not in self.extracted_feature):
                vargf.append(entry)

        if not vargf:
            # everything requested has been extracted already
            return

        def getfeature(pfd):
            pfd.getdata(*vargf, **features)
            return pfd

        # imported only when there is work to hand out
        from ubc_AI.threadit import threadit
        resultdict = threadit(getfeature, [[p] for p in self.pfds])
        for n, pfd in resultdict.items():
            self.pfds[n] = pfd
        self.extracted_feature.extend(vargf)
Пример #3
0
 def extractfeatures(self, clf):
     """
     Extract, on every pfd in self.pfds, the features that `clf` needs
     and that are not yet recorded in self.extracted_feature.

     Args:
         clf : a list of classifiers, an object with a `list_of_AIs`
             attribute, or a single classifier with a `feature` attribute

     Raises:
         MyError : if `clf` is none of the above
     """
     if isinstance(clf, list):
         AIlist = clf
     elif 'list_of_AIs' in clf.__dict__:
         AIlist = clf.list_of_AIs
     elif 'feature' in clf.__dict__:
         AIlist = [clf]
     else:
         raise MyError

     features = {}
     items = []
     for ai in AIlist:
         items.extend(ai.feature.items())

     # This method records extracted features as single-entry dicts
     # ({name: value}).  Testing only the raw (name, value) tuple never
     # matches those records, so every call used to re-extract and
     # re-append everything; both forms are checked here.
     vargf = []
     for item in set(items):
         entry = dict([item])
         if (item not in self.extracted_feature
                 and entry not in self.extracted_feature):
             vargf.append(entry)

     if not vargf:
         # everything requested has been extracted already
         return

     def getfeature(pfd):
         pfd.getdata(*vargf, **features)
         return pfd

     # imported only when there is work to hand out
     from ubc_AI.threadit import threadit
     resultdict = threadit(getfeature, [[p] for p in self.pfds])
     for n, pfd in resultdict.items():
         self.pfds[n] = pfd
     self.extracted_feature.extend(vargf)
Пример #4
0
def extractfeatures(AIlist, pfds):
    """
    Pre-extract, on every pfd, the features required by a list of AIs.

    Doing the extraction once up front is meant to reduce disk i/o and
    calls to pfd.dedisperse().  The period rating ({'ratings': ['period']})
    is always requested in addition to the classifiers' own features.

    Args:
        AIlist : list of AIs (eg. combinedAI.list_of_AIs); each one exposes
            a `feature` dict whose values can be converted with int()
        pfds : list of pfds (class pfdreader); every entry is replaced in
            place by the object handed back by threadit

    Returns:
        None

    Raises:
        ZeroDivisionError : if the result for one of the pfds is None
    """
    if len(pfds) == 0:
        # nothing to extract from; indexing pfds[0] below would otherwise
        # fail with an IndexError on empty input
        return

    # determine features to extract from pfd
    features = {}
    vargf = [{'ratings': ['period']}]  # auto extract P0
    items = []
    for clf in AIlist:
        items.extend(clf.feature.items())

    # features are tracked as 'name:value' keys; only the first pfd's record
    # is consulted to decide what is still missing
    wanted = set('%s:%s' % (f, v) for f, v in items)
    already = set(pfds[0].extracted_feature.keys())
    for key in wanted - already:
        f, v = key.split(':')
        vargf.append({f: int(v)})

    def getfeature(pfd):
        pfd.getdata(*vargf, **features)
        return pfd

    # vargf always holds at least the period rating, so extraction always runs
    resultdict = threadit(getfeature, [[p] for p in pfds])
    for n, pfd in resultdict.items():
        if pfd is None:
            print('ZeroDivisionError:  %s' % (pfds[n].pfdfile,))
            raise ZeroDivisionError(
                'feature extraction returned nothing for %s'
                % (pfds[n].pfdfile,))
        pfds[n] = pfd
Пример #5
0
def cross_validation(classifier, pfds, target, cv=5, verbose=False):
    """
    Score `classifier` on `cv` random 60/40 train/test splits of the data.

    Each split is reshuffled until its training part contains every class
    present in `target`.  The same `classifier` object is refit for every
    split and scored through singleclass_score.

    Args:
        classifier : object with a fit(pfds, target) method
        pfds : sequence of samples (converted to a numpy array)
        target : class labels, one per sample (converted to a numpy array)
        cv : number of random splits to evaluate (forced to 1 if verbose)
        verbose : passed on to singleclass_score

    Returns:
        numpy array holding one score per split, in split order

    Raises:
        NotImplementedError : if `target` does not hold exactly two classes
        ValueError : if the training part is too small to hold every class
    """
    target = np.asarray(target)
    nclasses = len(np.unique(target))
    # fail fast, before any split is built; a bare string cannot be raised
    if nclasses != 2:
        raise NotImplementedError("not yet implemented multiclass_score")
    if verbose:
        cv = 1

    pfds = np.array(pfds)
    L = len(pfds)
    cut = int(0.6 * L)
    if cut < nclasses:
        # the reshuffle loop below could never terminate in this case
        raise ValueError(
            "training split of %d sample(s) cannot contain all %d classes"
            % (cut, nclasses))

    arglists = []
    for _ in range(cv):
        index = list(range(L))  # a list, so that it can be shuffled in place
        # keep shuffling until training set has all types
        while True:
            shuffle(index)
            training_idx = index[:cut]
            test_idx = index[cut:]
            training_target = target[training_idx]
            if len(np.unique(training_target)) == nclasses:
                break
        arglists.append([
            classifier, pfds[training_idx], training_target,
            pfds[test_idx], target[test_idx]
        ])

    def getF1(clf, training_pfds, training_target, test_pfds, test_target):
        clf.fit(training_pfds, training_target)
        F1 = singleclass_score(clf, test_pfds, test_target, verbose=verbose)
        return F1

    if len(arglists) >= 12:
        # only worth going through threadit for a large number of splits
        from ubc_AI.threadit import threadit
        F1dict = threadit(getF1, arglists)
    else:
        F1dict = dict((i, getF1(*al)) for i, al in enumerate(arglists))

    # sorted keys make the order of the scores independent of dict ordering
    return np.array([F1dict[i] for i in sorted(F1dict)])
Пример #6
0
def threadpredict(AIlist, pfds):
    """
    Call predict() of every classifier on the same pfds through threadit.

    Args:
    AIlist : list of trained classifiers
    pfds : list of pfds

    Returns:
    np.transpose of the list of per-classifier predictions, the list
    being ordered like AIlist
    """
    def run_predict(samples, model):
        return model.predict(samples)

    jobs = [[pfds, model] for model in AIlist]
    results = threadit(run_predict, jobs)

    # results are looked up by the position of the classifier in AIlist
    per_model = []
    for idx in range(len(AIlist)):
        per_model.append(results[idx])
    return np.transpose(per_model)
Пример #7
0
def threadpredict(AIlist, pfds):
    """
    Collect the predict() output of each classifier, computed via threadit.

    Args:
    AIlist : list of trained classifiers
    pfds : list of pfds

    Returns:
    the per-classifier predictions (ordered like AIlist) passed
    through np.transpose
    """
    def one_prediction(samples, model):
        return model.predict(samples)

    outcome = threadit(one_prediction, [[pfds, model] for model in AIlist])

    # the n-th result belongs to the n-th classifier of AIlist
    count = len(AIlist)
    ordered = [outcome[pos] for pos in range(count)]
    return np.transpose(ordered)
Пример #8
0
def cross_validation(classifier, pfds, target, cv=5, verbose=False):
    """
    Score `classifier` on `cv` random 60/40 train/test splits of the data.

    Each split is reshuffled until its training part contains every class
    present in `target`.  The same `classifier` object is refit for every
    split and scored through singleclass_score.

    Args:
        classifier : object with a fit(pfds, target) method
        pfds : sequence of samples (converted to a numpy array)
        target : class labels, one per sample (converted to a numpy array)
        cv : number of random splits to evaluate (forced to 1 if verbose)
        verbose : passed on to singleclass_score

    Returns:
        numpy array holding one score per split, in split order

    Raises:
        NotImplementedError : if `target` does not hold exactly two classes
        ValueError : if the training part is too small to hold every class
    """
    target = np.asarray(target)
    nclasses = len(np.unique(target))
    # fail fast, before any split is built; a bare string cannot be raised
    if nclasses != 2:
        raise NotImplementedError("not yet implemented multiclass_score")
    if verbose:
        cv = 1

    pfds = np.array(pfds)
    L = len(pfds)
    cut = int(0.6 * L)
    if cut < nclasses:
        # the reshuffle loop below could never terminate in this case
        raise ValueError(
            "training split of %d sample(s) cannot contain all %d classes"
            % (cut, nclasses))

    arglists = []
    for _ in range(cv):
        index = list(range(L))  # a list, so that it can be shuffled in place
        # keep shuffling until training set has all types
        while True:
            shuffle(index)
            training_idx = index[:cut]
            test_idx = index[cut:]
            training_target = target[training_idx]
            if len(np.unique(training_target)) == nclasses:
                break
        arglists.append([
            classifier, pfds[training_idx], training_target,
            pfds[test_idx], target[test_idx]
        ])

    def getF1(clf, training_pfds, training_target, test_pfds, test_target):
        clf.fit(training_pfds, training_target)
        F1 = singleclass_score(clf, test_pfds, test_target, verbose=verbose)
        return F1

    if len(arglists) >= 12:
        # only worth going through threadit for a large number of splits
        from ubc_AI.threadit import threadit
        F1dict = threadit(getF1, arglists)
    else:
        F1dict = dict((i, getF1(*al)) for i, al in enumerate(arglists))

    # sorted keys make the order of the scores independent of dict ordering
    return np.array([F1dict[i] for i in sorted(F1dict)])
Пример #9
0
def threadpredict_proba(AIlist, pfds):
    """
    Call predict_proba() of every classifier on pfds through threadit.

    Args:
    AIlist : list of trained classifiers
    pfds : list of pfds

    Returns:
    np.hstack of the per-classifier results, ordered like AIlist
    """
    def run_predict_proba(model):
        # pfds comes from the enclosing call; only the classifier varies
        return model.predict_proba(pfds)

    jobs = [[model] for model in AIlist]
    results = threadit(run_predict_proba, jobs)

    # results are looked up by the position of the classifier in AIlist
    per_model = []
    for idx in range(len(AIlist)):
        per_model.append(results[idx])
    return np.hstack(per_model)
Пример #10
0
def threadpredict_proba(AIlist, pfds):
    """
    Collect the predict_proba() output of each classifier via threadit.

    Args:
    AIlist : list of trained classifiers
    pfds : list of pfds

    Returns:
    the per-classifier results (ordered like AIlist) joined by np.hstack
    """
    def one_proba(model):
        # the samples are shared by all jobs through the closure
        return model.predict_proba(pfds)

    outcome = threadit(one_proba, [[model] for model in AIlist])

    # the n-th result belongs to the n-th classifier of AIlist
    count = len(AIlist)
    ordered = [outcome[pos] for pos in range(count)]
    return np.hstack(ordered)
Пример #11
0
    def fit(self, pfds, target, **kwds):
        """
        Train every classifier in self.list_of_AIs and, when the current
        strategy is one of self.AIonAIs, train self.AIonAI on their output.

        args: [list of pfd instances], target
              extra keywords are forwarded to each classifier's fit()

        Notes:
        following advice from http://en.wikipedia.org/wiki/Ensemble_learning
        we train each classifier on a subset of the training data
        (a fresh split_data(..., pct=0.75) call per classifier)
        """
        # for a multi-column target the first column is what AIonAI is fit on
        psrtarget = target if target.ndim == 1 else target[..., 0]

        def threadfit(model, tr_pfds, tr_target, fit_kwds):
            model.fit(tr_pfds, tr_target, **fit_kwds)
            return model

        if InteractivePy:
            # fit the classifiers one after the other, in place
            for model in self.list_of_AIs:
                tr_pfds, tr_target, _te_pfds, _te_target = split_data(
                    pfds, target, pct=0.75)
                model.fit(tr_pfds, tr_target, **kwds)
        else:
            # extract pfd features beforehand
            extractfeatures(self.list_of_AIs, pfds)

            jobs = []
            for model in self.list_of_AIs:
                tr_pfds, tr_target, _te_pfds, _te_target = split_data(
                    pfds, target, pct=0.75)
                jobs.append([model, tr_pfds, tr_target, kwds])

            # the classifiers handed back by threadit replace the originals
            fitted = threadit(threadfit, jobs)
            for idx, model in fitted.iteritems():
                self.list_of_AIs[idx] = model

        self.nclasses = len(np.unique(target))
        if self.nclasses > 2 and self.strategy == 'adaboost':
            print("Warning, adaboost only works in 2-class systems")
            print("Reverting to Logistic Regression on the prediction matrix")
            self.strategy = 'lr'
            self.AIonAI = linear_model.LogisticRegression(penalty='l1')

        # train the AIonAI if used
        if self.strategy not in self.AIonAIs:
            return

        wants_predict = self.strategy in self.req_predict
        # small inputs (and interactive sessions) are predicted serially
        serial = InteractivePy or (len(pfds) < 5 * num_workers)

        if wants_predict:
            # use predict
            if serial:
                per_model = [model.predict(pfds)
                             for model in self.list_of_AIs]
                predictions = np.transpose(per_model)  # nsamples x npred
            else:
                predictions = threadpredict(self.list_of_AIs, pfds)
        else:
            # use predict_prob
            if serial:
                per_model = [model.predict_proba(pfds)
                             for model in self.list_of_AIs]
                # nsamples x (npred x nclasses)
                predictions = np.hstack(per_model)
            else:
                predictions = threadpredict_proba(self.list_of_AIs, pfds)

        self.AIonAI.fit(np.array(predictions), psrtarget)
Пример #12
0
    def fit(self, pfds, target, **kwds):
        """
        Fit all classifiers of self.list_of_AIs, then fit self.AIonAI on
        their predictions if the current strategy is listed in self.AIonAIs.

        args: [list of pfd instances], target
              remaining keywords are passed to each classifier's fit()

        Notes:
        following advice from http://en.wikipedia.org/wiki/Ensemble_learning
        we train each classifier on a subset of the training data
        (split_data(..., pct=0.75) is called once per classifier)
        """
        if target.ndim != 1:
            # AIonAI is trained against the first target column only
            psrtarget = target[..., 0]
        else:
            psrtarget = target

        threaded = not InteractivePy
        if threaded:
            # extract pfd features beforehand
            extractfeatures(self.list_of_AIs, pfds)

        def threadfit(member, train_pfds, train_target, fit_kwds):
            member.fit(train_pfds, train_target, **fit_kwds)
            return member

        pending = []
        for member in self.list_of_AIs:
            subset = split_data(pfds, target, pct=0.75)
            train_pfds, train_target, _unused_pfds, _unused_target = subset
            if threaded:
                pending.append([member, train_pfds, train_target, kwds])
            else:
                member.fit(train_pfds, train_target, **kwds)

        if threaded:
            # swap in the classifiers returned by threadit
            trained = threadit(threadfit, pending)
            for position, member in trained.iteritems():
                self.list_of_AIs[position] = member

        self.nclasses = len(np.unique(target))
        if self.nclasses > 2 and self.strategy == 'adaboost':
            print("Warning, adaboost only works in 2-class systems")
            print("Reverting to Logistic Regression on the prediction matrix")
            self.strategy = 'lr'
            self.AIonAI = linear_model.LogisticRegression(penalty='l1')

        # train the AIonAI if used
        if self.strategy not in self.AIonAIs:
            return

        if self.strategy in self.req_predict:
            # use predict
            if InteractivePy or (len(pfds) < 5 * num_workers):
                labels = [member.predict(pfds)
                          for member in self.list_of_AIs]
                predictions = np.transpose(labels)  # nsamples x npred
            else:
                predictions = threadpredict(self.list_of_AIs, pfds)
        else:
            # use predict_prob
            if InteractivePy or (len(pfds) < 5 * num_workers):
                probabilities = [member.predict_proba(pfds)
                                 for member in self.list_of_AIs]
                # nsamples x (npred x nclasses)
                predictions = np.hstack(probabilities)
            else:
                predictions = threadpredict_proba(self.list_of_AIs, pfds)

        predictions = np.array(predictions)  # nsamples x npred
        self.AIonAI.fit(predictions, psrtarget)