import numpy as np
from sklearn.naive_bayes import MultinomialNB


def deserialize_multinomial_nb(model_dict):
    # Rebuild the estimator from its stored hyperparameters (note the
    # ** unpacking; passing the dict positionally would set alpha to a
    # dict), then restore the fitted attributes from the saved arrays.
    model = MultinomialNB(**model_dict['params'])
    model.classes_ = np.array(model_dict['classes_'])
    model.class_count_ = np.array(model_dict['class_count_'])
    model.class_log_prior_ = np.array(model_dict['class_log_prior_'])
    model.feature_count_ = np.array(model_dict['feature_count_'])
    model.feature_log_prob_ = np.array(model_dict['feature_log_prob_'])
    return model
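
# For context, a matching serializer is sketched below. The name
# serialize_multinomial_nb and its dict layout are assumptions, not part
# of the original; it simply inverts deserialize_multinomial_nb above.
# Newer scikit-learn versions also track attributes such as
# n_features_in_, which this sketch omits.
def serialize_multinomial_nb(model):
    # Capture hyperparameters plus fitted arrays as JSON-friendly lists.
    return {
        'params': model.get_params(),
        'classes_': model.classes_.tolist(),
        'class_count_': model.class_count_.tolist(),
        'class_log_prior_': model.class_log_prior_.tolist(),
        'feature_count_': model.feature_count_.tolist(),
        'feature_log_prob_': model.feature_log_prob_.tolist(),
    }


# Round-trip check (illustrative only):
#   clf = MultinomialNB().fit(X, y)
#   restored = deserialize_multinomial_nb(serialize_multinomial_nb(clf))
#   np.allclose(clf.predict_proba(X), restored.predict_proba(X))  # True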

import pandas as pd
from sklearn.naive_bayes import BernoulliNB


def _naive_bayes_predict(table, model, suffix, display_log_prob=False,
                         prediction_col='prediction',
                         prob_prefix='probability',
                         log_prob_prefix='log_probability'):
    # Older serialized models store the feature list under 'features';
    # newer ones use 'feature_cols'.
    if 'features' in model:
        feature_cols = model['features']
    else:
        feature_cols = model['feature_cols']
    # check_col_type is an external helper that resolves the column
    # names and extracts the feature matrix from the table.
    feature_names, features = check_col_type(table, feature_cols)

    if 'nb_model' in model:
        nb_model = model['nb_model']
    else:
        # Rebuild the estimator from the stored parameter table: a dummy
        # fit initializes internal state, then the fitted attributes are
        # overwritten with the serialized values.
        model_table = model['table_1']
        if model_table.model_type[0] == 'multinomial':
            nb_model = MultinomialNB()
        else:
            nb_model = BernoulliNB()
        nb_model.fit([[1]], [1])
        nb_model.classes_ = np.array([0, 1])
        nb_model.class_log_prior_ = model_table.pi.values
        nb_model.feature_log_prob_ = np.array(list(model_table.theta))

    prediction = nb_model.predict(features)

    if 'label_encoder' in model:
        label_encoder = model['label_encoder']
        prediction = label_encoder.inverse_transform(prediction)
        if suffix == 'label':
            suffixes = label_encoder.classes_
        else:
            suffixes = range(len(label_encoder.classes_))
    else:
        suffixes = [0, 1]

    prob = nb_model.predict_proba(features)
    prob_cols = ['{prefix}_{suffix}'.format(prefix=prob_prefix, suffix=s)
                 for s in suffixes]
    prob_df = pd.DataFrame(data=prob, columns=prob_cols)

    # Copy so the caller's table is not mutated in place.
    result = table.copy()
    result[prediction_col] = prediction
    if display_log_prob:
        log_prob = nb_model.predict_log_proba(features)
        logprob_cols = ['{prefix}_{suffix}'.format(prefix=log_prob_prefix,
                                                   suffix=s)
                        for s in suffixes]
        logprob_df = pd.DataFrame(data=log_prob, columns=logprob_cols)
        result = pd.concat([result, prob_df, logprob_df], axis=1)
    else:
        result = pd.concat([result, prob_df], axis=1)
    return {'out_table': result}
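
# A minimal usage sketch, assuming a model dict in the 'nb_model' form
# and a stand-in for the external check_col_type helper. The stand-in
# below is a guess at that helper's contract, not its real code, and
# the column names and data are made up for illustration.
def check_col_type(table, feature_cols):
    # Return the resolved column names and the feature matrix.
    return feature_cols, table[feature_cols].values


train = pd.DataFrame({'f0': [2, 0, 1, 0], 'f1': [0, 3, 0, 2],
                      'label': [0, 1, 0, 1]})
nb = MultinomialNB().fit(train[['f0', 'f1']], train['label'])
model = {'nb_model': nb, 'feature_cols': ['f0', 'f1']}
out = _naive_bayes_predict(train, model, suffix='index',
                           display_log_prob=True)
print(out['out_table'].columns.tolist())
# ['f0', 'f1', 'label', 'prediction', 'probability_0', 'probability_1',
#  'log_probability_0', 'log_probability_1']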

# runEM is a method of an EM document-clustering class; initializeEM,
# e_step, m_step, loglikelihood, and LIKELIHOOD_EPSILON are defined
# elsewhere in the original module.
def runEM(self):
    '''
    Initializes, then iteratively runs, the EM algorithm to cluster
    self.documents into self.n_categories different classes.
    '''
    self.initializeEM(self.randomize)
    initial_ll = loglikelihood(self.models[-1], self.documents)
    self.likelihoods.append(initial_ll)
    print("EM initial likelihood: %s" % initial_ll)
    for iter_n in range(self.max_iterations):
        done = False
        try:
            prev_likelihood = self.likelihoods[-1]
        except IndexError:
            prev_likelihood = -np.inf
        nb = MultinomialNB(**self.kwargs)
        # Add a faked "classes_" attribute to force sklearn to treat the
        # model as trained, then install the current random parameters
        # to actually "train" it.
        nb.classes_ = np.ndarray((self.n_categories,))
        nb.class_log_prior_ = self.class_log_priors[-1]
        nb.feature_log_prob_ = self.feature_log_probs[-1]
        soft_predictions = self.e_step(nb)
        nb = self.m_step(soft_predictions)
        # TODO: can speed up by a factor of two by combining the
        # log-likelihood calculation with the soft prediction.
        ll = loglikelihood(nb, self.documents)
        self.models.append(nb)
        # Check for convergence via the relative change in likelihood.
        self.likelihoods.append(ll)
        if abs((ll - prev_likelihood) / prev_likelihood) < LIKELIHOOD_EPSILON:
            done = True
        print("EM iteration %s of %s" % (iter_n, self.max_iterations), ll)
        if done:
            break
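
# runEM calls a module-level loglikelihood(nb, documents) that is not
# shown above. A minimal sketch of what it plausibly computes, namely
# the total log-likelihood of the document-term counts under the naive
# Bayes mixture (up to the multinomial coefficient, which is constant
# in the parameters); the function body is an assumption, not the
# original helper:
from scipy.special import logsumexp


def loglikelihood(nb, documents):
    # documents is assumed to be an (n_docs, n_features) count matrix.
    # joint[d, c] = log P(c) + sum_w count(d, w) * log P(w | c)
    joint = documents @ nb.feature_log_prob_.T + nb.class_log_prior_
    # Marginalize over classes per document, then sum over documents.
    return logsumexp(joint, axis=1).sum()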