def extractfeatures(AIlist, pfds):
    """
    Pre-extract, in one threaded pass, every feature the classifiers need.

    given a list of AIs (eg. combinedAI.list_of_AIs) and a list of pfds
    (class pfdreader), pre-extract all the useful features.
    This is meant to reduce disk i/o and calls to pfd.dedisperse()

    The period rating is always requested (auto extract P0, 2013/04/29).

    Args:
        AIlist : list of classifiers, each exposing a `feature` dict
        pfds : list of pfds; every entry is replaced in place by the object
               handed back by threadit

    Returns:
        None. An empty `pfds` is a no-op.

    Raises:
        ZeroDivisionError : when the result for any pfd comes back as None
    """
    # Nothing to extract from; also avoids the IndexError on pfds[0] below.
    if len(pfds) == 0:
        return

    # determine features to extract from pfd
    features = {}
    vargf = [{'ratings': ['period']}]  # auto extract P0
    items = []
    for clf in AIlist:
        items.extend(clf.feature.items())

    # Features are compared as "name:value" strings against the keys that
    # the first pfd reports as already extracted.
    # NOTE(review): only pfds[0] is consulted -- presumably all pfds share
    # the same extraction state; confirm against callers.
    wanted = set('%s:%s' % (f, v) for f, v in items)
    newf = wanted - set(pfds[0].extracted_feature.keys())
    for p in newf:
        f, v = p.split(':')
        vargf.append({f: int(v)})

    def getfeature(pfd):
        pfd.getdata(*vargf, **features)
        return pfd

    # vargf always holds at least the period rating, so extraction always runs.
    resultdict = threadit(getfeature, [[p] for p in pfds])
    for n, pfd in resultdict.items():
        if pfd is None:
            print('ZeroDivisionError:  %s' % pfds[n].pfdfile)
            raise ZeroDivisionError
        pfds[n] = pfd
def extractfeatures(self, clf):
    """
    Extract, for every pfd in self.pfds, the features `clf` needs and that
    have not been extracted yet.

    Args:
        clf : one of
              - a list of classifiers,
              - an object whose __dict__ holds `list_of_AIs`,
              - a single classifier whose __dict__ holds `feature`

    Returns:
        None. self.pfds entries are replaced by the objects returned from
        threadit and the new requests are recorded in self.extracted_feature.

    Raises:
        MyError : when `clf` matches none of the accepted forms
    """
    if isinstance(clf, list):
        AIlist = clf
    elif 'list_of_AIs' in clf.__dict__:
        AIlist = clf.list_of_AIs
    elif 'feature' in clf.__dict__:
        AIlist = [clf]
    else:
        raise MyError

    features = {}
    items = []
    for ai in AIlist:
        items.extend(ai.feature.items())

    vargf = []
    for f in set(items):
        request = dict([f])
        # self.extracted_feature is filled with {name: value} dicts below, so
        # the dict form must be checked; the bare (name, value) tuple is still
        # accepted in case other code stored it that way.
        if f in self.extracted_feature or request in self.extracted_feature:
            continue
        vargf.append(request)

    if not vargf:
        # Everything requested is already available.
        return

    def getfeature(pfd):
        pfd.getdata(*vargf, **features)
        return pfd

    # Imported only when there is work to do.
    from ubc_AI.threadit import threadit
    resultdict = threadit(getfeature, [[p] for p in self.pfds])
    for n, pfd in resultdict.items():
        self.pfds[n] = pfd
    self.extracted_feature.extend(vargf)
def extractfeatures(AIlist, pfds):
    """
    Pre-extract, in one threaded pass, every feature the classifiers need.

    given a list of AIs (eg. combinedAI.list_of_AIs) and a list of pfds
    (class pfdreader), pre-extract all the useful features.
    This is meant to reduce disk i/o and calls to pfd.dedisperse()

    The period rating is always requested (auto extract P0, 2013/04/29).

    Args:
        AIlist : list of classifiers, each exposing a `feature` dict
        pfds : list of pfds; every entry is replaced in place by the object
               handed back by threadit

    Returns:
        None. An empty `pfds` is a no-op.

    Raises:
        ZeroDivisionError : when the result for any pfd comes back as None
    """
    # Guard against an empty list: pfds[0] below would raise IndexError.
    if len(pfds) == 0:
        return

    # determine features to extract from pfd
    features = {}
    vargf = [{'ratings': ['period']}]  # auto extract P0
    items = []
    for clf in AIlist:
        items.extend(clf.feature.items())

    # "name:value" strings are matched against the keys already present on
    # the first pfd.
    # NOTE(review): assumes every pfd has the same extracted_feature keys as
    # pfds[0] -- confirm.
    requested = set('%s:%s' % (f, v) for f, v in items)
    newf = requested - set(pfds[0].extracted_feature.keys())
    for p in newf:
        f, v = p.split(':')
        vargf.append({f: int(v)})

    def getfeature(pfd):
        pfd.getdata(*vargf, **features)
        return pfd

    # vargf is never empty (period rating), so no emptiness check is needed.
    resultdict = threadit(getfeature, [[p] for p in pfds])
    for n, pfd in resultdict.items():
        if pfd is None:
            print('ZeroDivisionError:  %s' % pfds[n].pfdfile)
            raise ZeroDivisionError
        pfds[n] = pfd
def cross_validation(classifier, pfds, target, cv=5, verbose=False):
    """
    Score `classifier` on `cv` random 60/40 train/test splits.

    Args:
        classifier : object with fit(pfds, target); scored with
                     singleclass_score
        pfds : sequence of samples
        target : sequence of labels, one per sample (exactly two classes)
        cv : number of random splits
        verbose : passed on to singleclass_score; when true only one split
                  is evaluated

    Returns:
        numpy array with one score per split

    Raises:
        NotImplementedError : when target does not hold exactly two classes
        ValueError : when the training slice is too small to ever contain
                     both classes

    Notes:
        the same classifier object is refit for every split.
    """
    target = np.asarray(target)
    nclasses = len(np.unique(target))
    # Fail before doing any work; string exceptions are not valid Python.
    if nclasses != 2:
        raise NotImplementedError("not yet implemented multiclass_score")

    if verbose:
        cv = 1

    pfds = np.array(pfds)
    L = len(pfds)
    cut = int(0.6 * L)
    if cut < nclasses:
        # The reshuffle loop below could never terminate in this case.
        raise ValueError(
            "need a training split of at least %d samples to hold every "
            "class, got %d (from %d samples)" % (nclasses, cut, L))

    arglists = []
    for i in range(cv):
        index = list(range(L))  # shuffle() needs a mutable sequence
        # keep shuffling until training set has all types
        while True:
            shuffle(index)
            training_idx = index[:cut]
            test_idx = index[cut:]
            training_target = target[training_idx]
            if len(np.unique(training_target)) == nclasses:
                break
        arglists.append([classifier,
                         pfds[training_idx], training_target,
                         pfds[test_idx], target[test_idx]])

    def getF1(clf, training_pfds, training_target, test_pfds, test_target):
        clf.fit(training_pfds, training_target)
        return singleclass_score(clf, test_pfds, test_target, verbose=verbose)

    if len(arglists) >= 12:
        # Only many splits are handed to threadit.
        from ubc_AI.threadit import threadit
        F1dict = threadit(getF1, arglists)
    else:
        F1dict = dict([(i, getF1(*al)) for i, al in enumerate(arglists)])

    # Sorted keys give the scores in split order.
    return np.array([F1dict[i] for i in sorted(F1dict)])
def threadpredict(AIlist, pfds):
    """
    Call predict(pfds) on every classifier through threadit.

    Args:
        AIlist : list of trained classifiers
        pfds : list of pfds

    Returns:
        np.transpose of the per-classifier predictions, taken in AIlist order
    """
    def run_predict(samples, model):
        return model.predict(samples)

    # one job per classifier, each receiving the full set of pfds
    jobs = []
    for model in AIlist:
        jobs.append([pfds, model])
    outputs = threadit(run_predict, jobs)

    # results are looked up by job index to keep the AIlist order
    ordered = []
    for k in range(len(AIlist)):
        ordered.append(outputs[k])
    return np.transpose(ordered)
def cross_validation(classifier, pfds, target, cv=5, verbose=False):
    """
    Score `classifier` on `cv` random 60/40 train/test splits.

    Args:
        classifier : object with fit(pfds, target); scored with
                     singleclass_score
        pfds : sequence of samples
        target : sequence of labels, one per sample (exactly two classes)
        cv : number of random splits
        verbose : passed on to singleclass_score; when true only one split
                  is evaluated

    Returns:
        numpy array with one score per split

    Raises:
        NotImplementedError : when target does not hold exactly two classes
        ValueError : when the training slice is too small to ever contain
                     both classes

    Notes:
        the same classifier object is refit for every split.
    """
    target = np.asarray(target)
    nclasses = len(np.unique(target))
    # Checked up front; a string cannot be raised as an exception.
    if nclasses != 2:
        raise NotImplementedError("not yet implemented multiclass_score")

    if verbose:
        cv = 1

    pfds = np.array(pfds)
    L = len(pfds)
    cut = int(0.6 * L)
    if cut < nclasses:
        # Without this the reshuffle loop below would spin forever.
        raise ValueError(
            "need a training split of at least %d samples to hold every "
            "class, got %d (from %d samples)" % (nclasses, cut, L))

    arglists = []
    for i in range(cv):
        index = list(range(L))  # shuffle() needs a mutable sequence
        # keep shuffling until training set has all types
        while True:
            shuffle(index)
            training_idx = index[:cut]
            test_idx = index[cut:]
            training_target = target[training_idx]
            if len(np.unique(training_target)) == nclasses:
                break
        arglists.append([classifier,
                         pfds[training_idx], training_target,
                         pfds[test_idx], target[test_idx]])

    def getF1(clf, training_pfds, training_target, test_pfds, test_target):
        clf.fit(training_pfds, training_target)
        return singleclass_score(clf, test_pfds, test_target, verbose=verbose)

    if len(arglists) >= 12:
        # Only many splits are handed to threadit.
        from ubc_AI.threadit import threadit
        F1dict = threadit(getF1, arglists)
    else:
        F1dict = dict([(i, getF1(*al)) for i, al in enumerate(arglists)])

    # Sorted keys give the scores in split order.
    return np.array([F1dict[i] for i in sorted(F1dict)])
def threadpredict_proba(AIlist, pfds):
    """
    Call predict_proba(pfds) on every classifier through threadit.

    Args:
        AIlist : list of trained classifiers
        pfds : list of pfds

    Returns:
        np.hstack of the per-classifier results, taken in AIlist order
    """
    def run_proba(model):
        # pfds comes from the enclosing call; only the classifier varies
        return model.predict_proba(pfds)

    jobs = []
    for model in AIlist:
        jobs.append([model])
    outputs = threadit(run_proba, jobs)

    # results are looked up by job index to keep the AIlist order
    ordered = []
    for k in range(len(AIlist)):
        ordered.append(outputs[k])
    return np.hstack(ordered)
def fit(self, pfds, target, **kwds):
    """
    Train every classifier in self.list_of_AIs and, when the strategy calls
    for it, the combining classifier self.AIonAI.

    args: [list of pfd instances], target
          extra keywords are forwarded to each classifier's fit()

    Notes:
    following advice from http://en.wikipedia.org/wiki/Ensemble_learning
    we train each classifier on a subset of the training data
    (a fresh split_data(..., pct=0.75) per classifier)
    """
    # target for the AIonAI: the target itself when 1-D, else target[..., 0]
    psrtarget = target if target.ndim == 1 else target[..., 0]

    def threadfit(clf, tr_pfds, tr_target, kwds):
        clf.fit(tr_pfds, tr_target, **kwds)
        return clf

    if InteractivePy:
        # interactive session: fit the classifiers one after another
        for model in self.list_of_AIs:
            sub_pfds, sub_target, _, _ = split_data(pfds, target, pct=0.75)
            model.fit(sub_pfds, sub_target, **kwds)
    else:
        # extract pfd features beforehand
        extractfeatures(self.list_of_AIs, pfds)
        jobs = []
        for model in self.list_of_AIs:
            sub_pfds, sub_target, _, _ = split_data(pfds, target, pct=0.75)
            jobs.append([model, sub_pfds, sub_target, kwds])
        fitted = threadit(threadfit, jobs)
        # put the fitted classifiers returned by threadit back in their slots
        for slot in fitted:
            self.list_of_AIs[slot] = fitted[slot]

    self.nclasses = len(np.unique(target))
    if self.nclasses > 2 and self.strategy == 'adaboost':
        print("Warning, adaboost only works in 2-class systems")
        print("Reverting to Logistic Regression on the prediction matrix")
        self.strategy = 'lr'
        self.AIonAI = linear_model.LogisticRegression(penalty='l1')

    # train the AIonAI if used
    if self.strategy in self.AIonAIs:
        needs_labels = self.strategy in self.req_predict
        # small inputs (or interactive sessions) are predicted without threadit
        serial = InteractivePy or (len(pfds) < 5 * num_workers)
        if needs_labels:
            # use predict
            if serial:
                labels = [clf.predict(pfds) for clf in self.list_of_AIs]
                predictions = np.transpose(labels)  # nsamples x npred
            else:
                predictions = threadpredict(self.list_of_AIs, pfds)
        else:
            # use predict_prob
            if serial:
                probas = [clf.predict_proba(pfds) for clf in self.list_of_AIs]
                predictions = np.hstack(probas)  # nsamples x (npred x nclasses)
            else:
                predictions = threadpredict_proba(self.list_of_AIs, pfds)

        predictions = np.array(predictions)  # nsamples x npred
        self.AIonAI.fit(predictions, psrtarget)
def fit(self, pfds, target, **kwds):
    """
    Fit all classifiers of self.list_of_AIs, then fit self.AIonAI on their
    predictions when self.strategy is one of self.AIonAIs.

    args: [list of pfd instances], target
          **kwds is handed through to every classifier's fit()

    Notes:
    following advice from http://en.wikipedia.org/wiki/Ensemble_learning
    we train each classifier on a subset of the training data
    """
    if target.ndim == 1:
        psrtarget = target
    else:
        psrtarget = target[..., 0]

    threaded = not InteractivePy
    if threaded:
        # extract pfd features beforehand
        extractfeatures(self.list_of_AIs, pfds)

    pending = []
    for member in self.list_of_AIs:
        # every classifier gets its own 75% split of the data
        split = split_data(pfds, target, pct=0.75)
        train_pfds, train_target = split[0], split[1]
        if threaded:
            pending.append([member, train_pfds, train_target, kwds])
        else:
            member.fit(train_pfds, train_target, **kwds)

    def threadfit(clf, tr_pfds, tr_target, kwds):
        clf.fit(tr_pfds, tr_target, **kwds)
        return clf

    if threaded:
        trained = threadit(threadfit, pending)
        for position in trained:
            self.list_of_AIs[position] = trained[position]

    self.nclasses = len(np.unique(target))
    if self.strategy == 'adaboost' and self.nclasses > 2:
        print("Warning, adaboost only works in 2-class systems")
        print("Reverting to Logistic Regression on the prediction matrix")
        self.strategy = 'lr'
        self.AIonAI = linear_model.LogisticRegression(penalty='l1')

    # nothing more to do unless an AIonAI strategy is in use
    if self.strategy not in self.AIonAIs:
        return

    if self.strategy not in self.req_predict:
        # use predict_prob
        if InteractivePy or (len(pfds) < 5 * num_workers):
            blocks = []
            for member in self.list_of_AIs:
                blocks.append(member.predict_proba(pfds))
            predictions = np.hstack(blocks)  # nsamples x (npred x nclasses)
        else:
            predictions = threadpredict_proba(self.list_of_AIs, pfds)
    else:
        # use predict
        if InteractivePy or (len(pfds) < 5 * num_workers):
            rows = []
            for member in self.list_of_AIs:
                rows.append(member.predict(pfds))
            predictions = np.transpose(rows)  # nsamples x npred
        else:
            predictions = threadpredict(self.list_of_AIs, pfds)

    predictions = np.array(predictions)  # nsamples x npred
    self.AIonAI.fit(predictions, psrtarget)