def main(output_file=time.strftime('%h%d-%Hh%Mm')+'.csv', in_pkl=None):
    """Fit a Bernoulli naive Bayes model and write a ranked submission file.

    Args:
        output_file: Path of the file to write. NOTE: the default name is
            built from the clock when this function is *defined* (import
            time), not each time it is called.
        in_pkl: Path to a joblib dump holding the 5-tuple
            (trainFeatures, trainTargets, trainItemIds, testFeatures,
            testItemIds).

    Returns:
        The string "input .plk required" when ``in_pkl`` is missing/empty;
        otherwise None after the file has been written.
    """
    logging.info("Loading features...")
    if not in_pkl:
        # Message text kept byte-for-byte; callers may compare against it.
        return "input .plk required"

    # NOTE: joblib.load unpickles arbitrary objects -- only load trusted files.
    (trainFeatures, trainTargets, trainItemIds,
     testFeatures, testItemIds) = joblib.load(in_pkl)
    logging.info("Loaded features, fitting model...")

    # Bernoulli Naive Bayes; binarize=None means the features are used as-is.
    clf = BernoulliNB(alpha=1.0, binarize=None, fit_prior=True)
    clf.fit(trainFeatures, trainTargets)

    logging.info("Predicting...")
    # Use the (log) probability of the second class rather than the hard
    # class prediction so that the items can be ranked.
    predicted_scores = clf.predict_log_proba(testFeatures).T[1]

    logging.info("Write results...")
    logging.info("Writing submission to %s", output_file)
    # Build the ranking before touching the file so a failure here does not
    # leave a truncated output behind.
    ranked = sorted(zip(predicted_scores, testItemIds), reverse=True)
    # The context manager closes the handle even if a write raises.
    with open(output_file, "w") as f:
        f.write("id\n")
        for pred_score, item_id in ranked:
            # Only item_id is written per the output spec; pred_score is kept
            # in the loop for anyone who wants to inspect the scores.
            f.write("%d\n" % (item_id))
    logging.info("Done.")
class NYTClassifier(object):
    """Bernoulli naive Bayes classifier that maps article text to a section.

    The constructor reads a stop-word list (one entry per line) and a
    tab-separated articles file whose rows are ``url, title, body, section``.
    Bodies become binary word-presence features; sections become integer
    labels via a LabelEncoder.
    """

    def __init__(self, stopwords_file, articles):
        """Load stop words and articles, then build features and labels."""
        with open(stopwords_file) as handle:
            self.stopwords = [line.strip() for line in handle]

        with open(articles) as handle:
            self.urls = []
            texts = []
            sections = []
            # Every row must have exactly four fields, otherwise unpacking
            # raises ValueError.
            for url, title, body, section in csv.reader(handle, delimiter='\t'):
                self.urls.append(url)
                texts.append(body)
                sections.append(section)

        encoder = LabelEncoder()
        encoder.fit(sections)
        self.le = encoder
        self.labels = encoder.transform(sections)

        vectorizer_options = dict(
            input='content',
            decode_error='replace',
            strip_accents='unicode',
            ngram_range=(1, 1),
            analyzer='word',
            stop_words=self.stopwords,
            binary=True,
        )
        self.cv = CountVectorizer(**vectorizer_options)
        self.features = self.cv.fit_transform(texts)

    def train(self):
        """Fit a fresh BernoulliNB on the stored features and labels."""
        model = BernoulliNB()
        model.fit(self.features, self.labels)
        self.clf = model

    def predict(self, text):
        """Return section names for ``text`` (passed to the vectorizer as-is)."""
        vectors = self.cv.transform(text)
        encoded = self.clf.predict(vectors)
        return self.le.inverse_transform(encoded)

    def evaluate(self):
        """Return 2-fold cross-validated F1 scores for the current model."""
        scores = cross_validation.cross_val_score(
            self.clf, self.features, self.labels, cv=2, scoring='f1')
        return scores

    def hard_to_predict(self):
        """Return up to 10 URLs whose two closest class log-probs are nearest.

        For each training row the smallest absolute difference between any
        pair of class log-probabilities is computed; URLs are returned in
        ascending order of that gap.
        """
        log_probs = self.clf.predict_log_proba(self.features)
        gaps = [
            min(abs(p - q) for p, q in itertools.combinations(row, 2))
            for row in log_probs
        ]
        # A dict keyed by URL: a repeated URL keeps its last gap.
        gap_by_url = {url: gap for url, gap in zip(self.urls, gaps)}
        ranked = sorted(gap_by_url, key=gap_by_url.get)
        return ranked[:10]
# Day of week and police district are the classifier's input features.
features = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday',
            'Sunday', 'BAYVIEW', 'CENTRAL', 'INGLESIDE', 'MISSION', 'NORTHERN',
            'PARK', 'RICHMOND', 'SOUTHERN', 'TARAVAL', 'TENDERLOIN']
# The integers 0..23 are appended as further column labels
# (presumably one indicator column per hour of day -- confirm against trainData).
hourFea = list(range(24))
features = features + hourFea

# Split into a training set (3/5) and a validation set (2/5).
training, validation = train_test_split(trainData, train_size=.60)

# Naive Bayes model: fit, then compute the log loss on the validation set.
model = BernoulliNB()
nbStart = time.time()
model.fit(training[features], training['crime'])  # fit on the training split only; the test data is not used here
nbTrainTime = time.time() - nbStart
# log_loss expects class *probabilities*; the original passed the output of
# predict_log_proba, which yields a meaningless loss value.
predict = np.array(model.predict_proba(validation[features]))  # validation
print("naive bayes 建模时间 %f s." % nbTrainTime)
print("naive bayes log 损失为 %f." % (log_loss(validation['crime'], predict)))

# Logistic regression model: fit, then compute the log loss.
model = LogisticRegression(C=.01)
lrStart = time.time()
model.fit(training[features], training['crime'])
lrCostTime = time.time() - lrStart
predicted = np.array(model.predict_proba(validation[features]))
print("逻辑回归建模耗时 %f 秒" % lrCostTime)
print("逻辑回归log损失为 %f" % (log_loss(validation['crime'], predicted)))
# Fragment (Python 2): rank all persons for one predicate/trial/feature-set
# combination and record, in rank order, whether each non-training person is
# a true member of set_I. The enclosing definition is not visible here.
train_idx = IDX_DATA[predicate_name][trials]['train']
# Prefix identifying this run in the printed output.
preamble = 'predicate_name=%s trials=%d featset_name=%s ' % (
    predicate_name, trials, featset_name)
cls = BernoulliNB(alpha=1, binarize=0.0, fit_prior=False)
if featset_name != 'random':
    # Binary targets: 1 where the label entry for a training index is > 0.
    train_y = (np.array(labels[train_idx].todense()).squeeze() > 0).astype('int')
    train_x, feat_name = get_train_x(featset_name, predicate_name, predicate_idx, train_idx, train_y)
    # Fit on the training rows only, then score every row of train_x.
    cls.fit(train_x[train_idx], train_y)
    # (row index, log-probability of the second class) pairs.
    logprob = [ (i, e[1])
                for i, e
                in enumerate(cls.predict_log_proba(train_x)) ]
    # Shuffling before the stable sort below randomises the order of ties.
    random.shuffle(logprob)
else:
    # 'random' baseline: a uniform random score for each person.
    logprob = list(enumerate(np.random.rand(total_persons)))
# Walk the indices from highest to lowest score, skipping training indices,
# and emit [is_member_of_set_I, index] for each.
_testing_output = [[ (1 if i in set_I else 0), i ]
                   for i, _ in sorted(logprob, key=lambda x: x[1], reverse=True)
                   if i not in train_idx]
# Keep only the 0/1 membership flags, still in rank order.
testing_output = [e[0] for e in _testing_output]
# In order to debug the performance of naive bayes we need to
# find out the features that were presented at train time.
# And then the features that the NB classifier rated highly at
# the test time. The question is that why were the false
# positives rated so highly?
# Python 2 print statement; the trailing comma suppresses the newline.
print preamble,
mol_weights.append(weights) # normalization mol_weights = getNormalizedWeights(mol_weights) # draw similarity maps generateSimilarityMaps(mols, mol_weights, 'rf') ### NAIVE BAYES WITH MORGAN2 print "generate naive bayes similarity maps" # train random forest nb = BernoulliNB() nb.fit(training_fps, training_labels) # calculate weights mol_weights = [] for i,m in enumerate(mols): weights = [] orig_pp = nb.predict_log_proba(fps_morgan2[i])[0][1] # get bits for each atom bitmap = [~DataStructs.ExplicitBitVect(1024) for x in range(m.GetNumAtoms())] for bit, es in info_morgan2[i].iteritems(): for at1, rad in es: if rad == 0: # for radius 0 bitmap[at1][bit] = 0 else: # for radii > 0 env = Chem.FindAtomEnvironmentOfRadiusN(m, rad, at1) amap = {} submol = Chem.PathToSubmol(m, env, atomMap=amap) for at2 in amap.keys(): bitmap[at2][bit] = 0 # loop over atoms for at1 in range(m.GetNumAtoms()): new_fp = fps_morgan2[i] & bitmap[at1]
# Toy training set: four samples with four features each, three classes.
x = np.array([
    [1, 2, 3, 4],
    [1, 3, 4, 4],
    [2, 4, 5, 5],
    [5, 6, 9, 8],
])
y = np.array([1, 1, 2, 3])

# Core code: features are thresholded at 3.0 before fitting
# (fit_prior is left at its default, True).
clf = BernoulliNB(alpha=2.0, binarize=3.0)
clf.fit(x, y)

# Disabled inspection of the fitted parameters (kept for reference):
#   print(np.log(2/4))
#   print(np.log(1/4))
#   print(np.log(1/4))
#   print(clf.class_log_prior_)   # compare with the three values above: log of
#                                 # each class prior (class count / total count)
#   print(clf.feature_log_prob_)  # log conditional probability of each feature
#                                 # given the class
#   print(clf.class_count_)       # number of samples per class, in class order
#   print(clf.feature_count_)     # per-class feature counts, shape
#                                 # [n_classes, n_features]

# Test data (original author's note: the values must not be fractions).
x_test = [[1, 2, 2, 5], [7, 6, 10, 9]]
y_test_predict = clf.predict(x_test)
y_predict_proba = clf.predict_proba(x_test)
y_test_predict_log_proba = clf.predict_log_proba(x_test)

# Printed in order: predicted labels for x_test, per-class probabilities,
# and the logarithms of those probabilities.
for result in (y_test_predict, y_predict_proba, y_test_predict_log_proba):
    print(result)

# Mean accuracy on two labelled samples; score(X, y, sample_weight=None)
# also accepts optional per-sample weights.
score_samples = [[3, 4, 5, 4], [1, 3, 5, 6]]
score_labels = [1, 3]
print(clf.score(score_samples, score_labels))
class tokenClassifier:
    """Bernoulli naive Bayes classifier for tokens described by feature vectors.

    The model is trained in the constructor from the file at ``trainingFile``.
    Classes are represented internally by the integer codes 1-5.
    """

    # Integer class code -> name returned by classify().
    _CLASS_NAMES = {
        1: "SURNAME",
        2: "FORENAME",
        3: "TITLE",
        4: "OCCUPATION",
        5: "ADDRESS",
    }

    # Training-file label -> integer class code. All three address labels
    # share one class.
    _LABEL_CODES = {
        "SURNAME": 1,
        "FORENAME": 2,
        "TITLE": 3,
        "OCCUPATION": 4,
        "WORK_ADDRESS": 5,
        "HOME_ADDRESS": 5,
        "ADDRESS": 5,
    }

    def __init__(self, trainingFile):
        """Store the training file path and train the classifier from it."""
        self.trainingFile = trainingFile
        self.nb = BernoulliNB()
        self._train()

    def classify(self, vector):
        """Classify a single token.

        :param vector: Feature vector of the token to be classified
        :return: "SURNAME", "FORENAME", "TITLE", "OCCUPATION" or "ADDRESS";
                 None if the model predicts a code outside 1-5
        """
        # numpy.int was removed in NumPy 1.24; it was an alias of builtin int.
        vectorArray = numpy.array(vector).astype(int)
        predictedClass = self.nb.predict(vectorArray.reshape(1, -1))
        # predict() returns one label per row; exactly one row was passed in.
        return self._CLASS_NAMES.get(int(predictedClass[0]))

    def logProbabilities(self, vector):
        """Return the log probabilities of a single token being in each class.

        :param vector: Feature vector of the token
        :return: A list with one log probability per class, in the order of
                 the fitted model's classes
        """
        vectorArray = numpy.array(vector).astype(int)
        probabilities = self.nb.predict_log_proba(vectorArray.reshape(1, -1)).tolist()[0]
        return probabilities

    def _train(self):
        """Train the naive Bayes classifier from ``self.trainingFile``.

        The parsed dataset is read as 13 feature columns followed by the
        class label in column 13.

        :raises ValueError: if a label (after removing single quotes) is not
                            one of the known class names
        """
        dataset = arffParser.parseFile(self.trainingFile)
        trainingValues = numpy.array(dataset[:, :13]).astype(int)

        # Build the integer targets in a new list instead of writing the
        # codes back into the parsed dataset.
        trainingClasses = []
        for rawLabel in dataset[:, 13]:
            label = rawLabel.replace("'", "")
            code = self._LABEL_CODES.get(label)
            if code is None:
                raise ValueError(label + " is not a valid class")
            trainingClasses.append(code)

        self.nb.fit(trainingValues, numpy.array(trainingClasses).astype(int))
yt = encodeAsInt(yt) # print list(yt) #le = preprocessing.LabelEncoder() #le.fit(Yt) #yt = le.transform(Yt) # mnb = BernoulliNB(alpha=0.5,binarize=0.0) # mnb = BernoulliNB(alpha=0.1, binarize=1.0, class_prior=None, fit_prior=True) bnb = BernoulliNB(alpha=0.5, fit_prior=True) # mnb = BernoulliNB(alpha=0.1) bnb.fit(Xtrain, yt) print Xtrain.shape[1] prob1 = bnb.predict_log_proba(Xtrain)[1] deb = np.array([np.array([prob1[i],i]) for i in xrange(len(prob1))]) ytp = bnb.predict(Xtrain) # print [(yt[i],ytp[i]) for i in xrange(len(yt))] # print sum(yt!=ytp) print 'prediction accuracy: %.4f' % (1 - (1. / len(ytp) * sum( yt != ytp ))) #gt = lambda fd, bins: SimpleGoodTuringProbDist(fd, bins=1e5) #classifier = nltk.NaiveBayesClassifier.train(labeled_f,estimator=MLEProbDist) #estimator choices:ELEProbDist, LaplaceProbDist,LidstoneProbDist,MLEProbDist,ConditionalProbDist, #testDat = d[0:1000]
import numpy as np
from sklearn.naive_bayes import BernoulliNB

# Toy training set: three samples with four features each, two classes.
X = np.array([
    [1, 2, 3, 4],
    [1, 3, 4, 4],
    [2, 4, 5, 5],
])
y = np.array([1, 1, 2])

# class_prior is left at its default (None); features are thresholded at 2.0.
clf = BernoulliNB(alpha=1, binarize=2.0, fit_prior=False)
# fit(X, y, sample_weight=None): X holds the feature vectors, y the class
# labels; sample_weight (unused here) would hold per-sample weights.
clf.fit(X, y)

# class_log_prior_: log prior probability of each class. Its value depends
# on fit_prior and class_prior:
#   * class_prior given: the log of class_prior, whatever fit_prior is;
#   * fit_prior=False, class_prior=None: every class gets the same prior,
#     1 / (number of classes);
#   * fit_prior=True, class_prior=None: each class prior is that class's
#     sample count divided by the total sample count.
print(clf.class_log_prior_)
print(X)

# class_count_: number of training samples seen for each class.
print(clf.class_count_)
# feature_count_: per-class count for each feature,
# shape (n_classes, n_features).
print(clf.feature_count_)
# get_params(deep=True): dict of the estimator's parameters and their values.
print(clf.get_params())

samples = [[3, 4, 5, 4], [1, 3, 5, 6]]
# predict_log_proba(X): log of the predicted probability for each class.
print(clf.predict_log_proba(samples))
# predict_proba(X): predicted probability for each class.
print(clf.predict_proba(samples))
# score(X, y, sample_weight=None): mean accuracy on the given samples.
print(clf.score(samples, [1, 1]))

# set_params(**params): update estimator parameters.
clf.set_params(alpha=2.0)
print(clf.get_params())