class Recommender(object):
    '''
    Wine recommender built from a text vectorizer feeding a stacked
    Naive Bayes -> Random Forest classifier pair.
    '''

    def __init__(self):
        self.nb = ComplementNB()
        self.rf = RandomForestClassifier()
        self.vecto = TfidfVectorizer()

    def _fit(self, data):
        '''
        Train the vectorizer and both classifiers.

        Parameters
        ----------
        data - The filepath to the data being fit.

        Returns
        ----------
        None
        '''
        frame = Data_Handler(data).get_top_num(15)
        descriptions = frame['description']
        varieties = frame['variety']

        tfidf = self.vecto.fit_transform(descriptions)
        self.nb.fit(tfidf, varieties)

        # The forest is trained on the Naive Bayes class probabilities,
        # not on the TF-IDF features themselves.
        nb_probs = self.nb.predict_proba(tfidf)
        self.rf.fit(nb_probs, varieties)

    def predict(self, text):
        '''
        Run one set of tasting notes through the vectorizer and the stacked
        classifiers and return the five most probable varieties.

        Parameters
        ----------
        text - str - The input tasting notes.

        Returns
        ----------
        top_five - array - The five varieties with the highest forest
            probability, most probable first.
        '''
        tfidf = self.vecto.transform([text])
        nb_probs = self.nb.predict_proba(tfidf)
        rf_probs = self.rf.predict_proba(nb_probs)[0]

        # argsort is ascending, so reverse it and keep the first five.
        ranked = np.argsort(rf_probs)[::-1]
        return self.rf.classes_[ranked[:5]]
def findBestFitCluster(orphanCorpus, corpusCluster={}):
    """
    Score questions that have no cluster against the existing clusters.

    A Complement Naive Bayes model is fit on the vectors of the already
    clustered questions (labelled with their cluster ids) and is then asked
    for the per-cluster probabilities of every orphaned question.

    Parameters:
        orphanCorpus: iterable of mappings; each one is read through its
            "question_vector" key.
        corpusCluster: mapping read through its "question_vectors" and
            "clusterIds" keys.

    Returns:
        None. NOTE(review): the probabilities are computed but the visible
        code does not return them - confirm whether the function is complete.
    """
    # Example of the expected inputs:
    #
    # corpusCluster = {
    #     "questions": [ 'and the moon too guys',
    #                    'lets show some or a lot of love for the moon!!' ],
    #     "question_vectors": [[], []],
    #     "clusterIds": [ '4', '4' ]
    # }
    # orphanCorpus = [
    #     { "id": 11, "question": 'Another one about the sun?',
    #       "question_vector": [] },
    #     { "id": 33, "question": 'What is the distance from the sun though?',
    #       "question_vector": [] },
    #     { "id": 37,
    #       "question": 'what\'s the changing factors of the sun and moon together?',
    #       "question_vector": [] }
    # ]

    # Fit the Naive Bayes model on the existing clusters.
    model = ComplementNB()
    model.fit(corpusCluster["question_vectors"], corpusCluster["clusterIds"])

    orphan_vectors = [doc["question_vector"] for doc in orphanCorpus]
    predictions = model.predict_proba(orphan_vectors)
class ComplementNBImpl():
    """Adapter that stores the hyperparameters and builds the wrapped model on fit()."""

    def __init__(self, alpha=1.0, fit_prior=True, class_prior=None, norm=False):
        # Kept as a dict so it can be splatted into the wrapped constructor.
        self._hyperparams = dict(
            alpha=alpha,
            fit_prior=fit_prior,
            class_prior=class_prior,
            norm=norm,
        )

    def fit(self, X, y=None):
        """Create a fresh wrapped model, fit it, and return ``self``."""
        self._sklearn_model = SKLModel(**self._hyperparams)
        # Only forward the labels when the caller supplied them.
        fit_args = (X,) if y is None else (X, y)
        self._sklearn_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Delegate to the wrapped model's ``predict``."""
        model = self._sklearn_model
        return model.predict(X)

    def predict_proba(self, X):
        """Delegate to the wrapped model's ``predict_proba``."""
        model = self._sklearn_model
        return model.predict_proba(X)
def complement_bayes(x_train,x_test,y_train,y_test,X,fl,amostra_paci3,fl_a3,nome):
    """
    Fit a ComplementNB model, score a patient sample, and print evaluation
    reports.

    Steps, in order:
      1. fit on (x_train, y_train);
      2. score ``amostra_paci3`` and write it, with the added ``probls``,
         ``result`` and ``fl_severidade`` columns, to
         'modelo_complement_bayes.csv';
      3. print the ROC-AUC on the training and validation sets;
      4. print a crosstab, a classification report and the AUC for (X, fl)
         and draw the ROC curve titled ``nome``.

    ``amostra_paci3`` is modified in place. Column 1 of ``predict_proba`` is
    used as the score throughout - presumably the positive class of a binary
    problem; confirm against the label encoding.
    """
    model = ComplementNB()
    model.fit(x_train, y_train)
    train_proba = model.predict_proba(x_train)

    # The sample is scored before any output column is added to it, so the
    # model sees only the original feature columns.
    sample_proba = model.predict_proba(amostra_paci3)
    sample_label = model.predict(amostra_paci3)
    for column in ('result', 'probls'):
        amostra_paci3[column] = 0
    amostra_paci3['probls'] = sample_proba
    amostra_paci3['result'] = sample_label
    amostra_paci3['fl_severidade'] = fl_a3
    amostra_paci3.to_csv('modelo_complement_bayes.csv')

    train_auc = roc_auc_score(y_train, train_proba[:, 1])
    print('Treinamento AUC-ROC:{}'.format(train_auc))

    test_proba = model.predict_proba(x_test)
    test_auc = roc_auc_score(y_test, test_proba[:, 1])
    print('Validacao AUC-ROC:{}'.format(test_auc))

    # Full-dataset report.
    positive_scores = model.predict_proba(X)[:, 1]
    confusion = pd.crosstab(fl, model.predict(X))
    print(confusion)
    report = classification_report(fl, model.predict(X))
    print(report)
    print('AUC: %0.2f' % roc_auc_score(fl, positive_scores))
    plot_roc_curve(fl, positive_scores, nome)
class _ComplementNBImpl:
    """Wrapper that builds the underlying operator at construction time and delegates to it."""

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model (with labels when given) and return ``self``."""
        if y is None:
            self._wrapped_model.fit(X)
            return self
        self._wrapped_model.fit(X, y)
        return self

    def predict(self, X):
        """Delegate to the wrapped model's ``predict``."""
        wrapped = self._wrapped_model
        return wrapped.predict(X)

    def predict_proba(self, X):
        """Delegate to the wrapped model's ``predict_proba``."""
        wrapped = self._wrapped_model
        return wrapped.predict_proba(X)
def cluster_by_keyword(keyword=None,
                       questions=None,
                       questionIds=None,
                       analyzer="word"):
    """
    Find the questions most likely to belong to a keyword cluster.

    The questions are TF-IDF vectorized, a Complement Naive Bayes model is
    fit with one class per question id, and the keyword is then scored
    against every class. Classes whose probability exceeds a dynamic
    threshold are returned.

    Parameters:
        keyword (string): Title of the cluster we are looking for.
        questions (list(string)): The question texts (the corpus).
        questionIds (list(number)): Ids of the questions passed in
            ``questions``; these ids are what is returned per cluster.
        analyzer (string): Either "word", "char", or "char_wb".
            - "word" identifies whole words: it supports stop words and
              exact matches but not spelling errors.
            - "char" builds n-grams from character sequences: it tolerates
              spelling mistakes and incomplete keywords, but does not
              support stop words and can give less ideal clusters.
            - "char_wb" builds character n-grams that do not cross word
              boundaries; padding is added at word edges.

    Returns:
        tuple:
            list[(qnId, probability)]: question ids with the probability of
                the question belonging to this cluster.
            threshold (number): the dynamic cut-off that was used.

    Raises:
        TypeError: if ``keyword`` is None.
    """
    if keyword is None:
        raise TypeError("keyword must be a string, not None")
    # None replaces the former mutable list defaults; behaviour is the same.
    questions = [] if questions is None else questions
    questionIds = [] if questionIds is None else questionIds

    vectorizer = TfidfVectorizer(sublinear_tf=True,
                                 max_df=0.4,
                                 smooth_idf=True,
                                 stop_words="english",
                                 ngram_range=(1, 3),
                                 analyzer=analyzer,
                                 use_idf=True,
                                 lowercase=True)
    vectorized_questions = vectorizer.fit_transform(questions)

    cnb_clf = ComplementNB(alpha=1, fit_prior=False)
    cnb_clf.fit(vectorized_questions, questionIds)

    keyword_vector = vectorizer.transform([keyword])
    probabilities = cnb_clf.predict_proba(keyword_vector)

    # threshold = np.var(probabilities) + np.median(probabilities)
    threshold = (np.std(probabilities) + np.median(probabilities) +
                 np.var(probabilities))

    # predict_proba columns follow cnb_clf.classes_ (sorted unique labels),
    # which is not necessarily the order of questionIds, so map each column
    # back through classes_.
    class_ids = cnb_clf.classes_.tolist()
    matches = [(class_ids[ix], p)
               for ix, p in enumerate(probabilities[0])
               if p > threshold]
    return matches, threshold
def code_from_feedbacks(inputs, clsf, lng, level):
    """
    Code ``inputs`` with a model trained only on stored feedbacks.

    Follows the same steps as ``code`` but the training set is the Feedback
    rows matching the classification, level and language.

    Returns one list per input: ``[predicted_code]``, or ``[]`` when fewer
    than two feedbacks exist or when the model gives every class the same
    probability for that input.
    """
    inputs = prepare_input(inputs, lng)

    feedback_rows = Feedback.objects.filter(classification=clsf,
                                            level=level,
                                            language=lng)
    # Not enough feedback to train on: one empty result per input.
    if len(feedback_rows) < 2:
        return [[] for _ in inputs]

    texts = [unidecode(row.text) for row in feedback_rows]
    labels = [row.code for row in feedback_rows]

    vectorizer = TfidfVectorizer(analyzer="word",
                                 ngram_range=(1, 2),
                                 min_df=0,
                                 sublinear_tf=True)
    features = vectorizer.fit_transform(texts)

    # Complement Naive Bayes.
    model = ComplementNB()
    model.fit(features, labels)

    inputs = [unidecode(entry) for entry in inputs]
    input_features = vectorizer.transform(inputs)
    predicted = model.predict(input_features)
    probabilities = model.predict_proba(input_features)

    coded = []
    for label, row in zip(predicted, probabilities):
        # A flat probability row means the model had nothing to go on.
        spread = max(row) - min(row)
        coded.append([] if spread == 0 else [label])
    return coded
#text = 'The movie has a pleasant start, and that is all to it. It has such an ordinary story line that you can predict the next scene. It just getting more silly and stupid with time. By the end of the watch, you will realise what a complete waste of time it was.It is such a dumb watch. Well, it was a yawn-inducing, bland and senseless movie. Not recommended. One-star from me.'#negative #text = 'The movie had a very original start and it was comparative slow. But in my opinion, it had a thought-provoking idea. It gets uninteresting and tiresome in the middle, and a little predictable. But on the whole, it was unpretentious and tender. 3-star from me.'#neutral text = text.lower() text = word_tokenize(text) stop_words = set(stopwords.words("english")) wordnet_lemmatizer = WordNetLemmatizer() text = [ wordnet_lemmatizer.lemmatize(word, pos="v") for word in text if not word in stop_words and not word in string.punctuation ] print(text) x_test = vectorizer.transform(text) y_pred = model.predict(x_test) label = model.classes_ prob = model.predict_proba(x_test) prob = [True if np.amax(ele) > 0.60 else False for ele in prob] y_pred1 = [y_pred[i] if ele else '' for i, ele in enumerate(prob)] print(y_pred1) y_pred = [y_pred[i] for i, ele in enumerate(prob) if ele] y_pred = [y_pred.count(ele) for ele in label] print(label) print(y_pred) y_pred = y_pred - np.amax(y_pred) y_pred = [True if ele >= 0 else False for ele in y_pred] print(y_pred) if sum(y_pred) == 1: result = [label[i] for i, ele in enumerate(y_pred) if ele] else: result = 'Neutral' print(result)
class Classifier:
    """TF-IDF + Complement Naive Bayes text classifier with a held-out test split."""

    def __init__(self, max_df=0.80, max_features=6500):
        self.count_vect = TfidfVectorizer(
            max_df=max_df,
            stop_words='english',
            max_features=max_features,
            use_idf=True,
        )
        self.cnb = ComplementNB()
        # Seeds NumPy's global random state.
        np.random.seed(2222)

    def __fit(self):
        """Fit the classifier on the vectorized training texts."""
        targets = self.train_set['category']
        self.cnb.fit(self.x_train, targets)

    def set_data(self, dataset: pd.DataFrame, labels: list, test_size=0.25):
        """
        Split ``dataset`` into train/test parts, vectorize the training texts
        and fit the model.

        Must be called right after construction, before any other method.
        ``test_size`` is the size of the test set passed to the splitter.
        """
        self.train_set, self.test_set = train_test_split(dataset,
                                                         test_size=test_size)
        train_texts = self.train_set['text']
        self.x_train = self.count_vect.fit_transform(train_texts)
        self.labels = labels
        self.__fit()

    def predict(self, text: str):
        """Preprocess ``text`` and return the predicted label for it."""
        cleaned = TextTools().preprocess(text)
        feats = self.count_vect.transform([cleaned])
        return self.cnb.predict(feats)

    def get_predict_proba(self, text: str):
        """
        Return a dict with the predicted ``'label'`` and, under
        ``'features'``, the probability matrix computed by Complement Naive
        Bayes for ``text``.
        """
        feats = self.count_vect.transform([text])
        label = self.cnb.predict(feats)[0]
        probabilities = self.cnb.predict_proba(feats)
        return {'label': label, 'features': probabilities}

    def get_score(self):
        """Return the mean of the per-label F1 scores on the test set."""
        x_test = self.count_vect.transform(self.test_set['text'])
        y_test_pred = self.cnb.predict(x_test)
        per_label = f1_score(self.test_set['category'],
                             y_test_pred,
                             average=None,
                             labels=self.labels)
        return per_label.mean()

    def get_cmatrix(self):
        """Plot the normalized confusion matrix for the test set."""
        x_test = self.count_vect.transform(self.test_set['text'])
        y_test_pred = self.cnb.predict(x_test)
        disp = plot_confusion_matrix(self.cnb,
                                     x_test,
                                     self.test_set['category'],
                                     display_labels=self.labels,
                                     cmap=plt.cm.Blues,
                                     normalize='true')
        plt.show()

    def similarity(self, item1, item2):
        """Return ``cosine(item1, item2)``; both items must be array-like."""
        return cosine(item1, item2)
class ProbabilisticValidator():
    """
    # The probabilistic validator is a quick to train model used for validating the predictions of our main model
    # It is fit to the results our model gets on the validation set

    Observations are buffered by register_observation() and handed to the
    underlying Naive Bayes model by fit() / partial_fit().
    """
    # TODO: Autodetermine smoothing factor depending on the info we know about the dataset
    _smoothing_factor = 0.5
    _probabilistic_model = None
    _X_buff = None
    _Y_buff = None

    def __init__(self, col_stats, data_type=None):
        """
        Choose the algorithm to use for the rest of the model.

        With 'percentage_buckets' in ``col_stats`` the model is MultinomialNB,
        unless there are fewer than 3 buckets, in which case ComplementNB is
        used. Without buckets the model is ComplementNB.

        :param col_stats: mapping read through 'data_type' and, optionally,
            'percentage_buckets'
        :param data_type: not used by this constructor
        """
        self._X_buff = []
        self._Y_buff = []
        self._predicted_buckets_buff = []
        self._real_buckets_buff = []
        self._original_real_buckets_buff = []
        self._original_predicted_buckets_buff = []

        self.col_stats = col_stats

        if 'percentage_buckets' in col_stats:
            self._probabilistic_model = MultinomialNB(
                alpha=self._smoothing_factor)
            self.buckets = col_stats['percentage_buckets']
            self.bucket_keys = [i for i in range(len(self.buckets))]
            if len(self.buckets) < 3:
                self._probabilistic_model = ComplementNB(
                    alpha=self._smoothing_factor)
        else:
            self._probabilistic_model = ComplementNB(
                alpha=self._smoothing_factor)
            self.buckets = None

        self.data_type = col_stats['data_type']
        self.bucket_accuracy = {}

    def register_observation(self,
                             features_existence,
                             real_value,
                             predicted_value,
                             is_original_data=False,
                             hmd=None):
        """
        # Register an observation in the validator's internal buffers

        :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param real_value: The real value/label for this prediction
        :param predicted_value: The predicted value/label
        :param is_original_data: When true, the observation is also recorded in the buffers used by get_confusion_matrix()
        :param hmd: Forwarded unchanged to get_value_bucket()
        """
        # Numeric values that cannot be parsed are treated as missing (None).
        try:
            predicted_value = predicted_value if self.data_type != DATA_TYPES.NUMERIC else float(
                predicted_value)
        except (TypeError, ValueError, OverflowError):
            predicted_value = None

        try:
            # Accept a decimal comma as well as a decimal point.
            real_value = real_value if self.data_type != DATA_TYPES.NUMERIC else float(
                str(real_value).replace(',', '.'))
        except (TypeError, ValueError, OverflowError):
            real_value = None

        if self.buckets is not None:
            predicted_value_b = get_value_bucket(predicted_value, self.buckets,
                                                 self.col_stats, hmd)
            real_value_b = get_value_bucket(real_value, self.buckets,
                                            self.col_stats, hmd)

            # One-hot encode the predicted bucket (one spare slot), then
            # append the feature-existence flags.
            X = [False] * (len(self.buckets) + 1)
            X[predicted_value_b] = True
            X = X + features_existence

            self._X_buff.append(X)
            self._Y_buff.append(real_value_b)
            # NOTE(review): this aliases the real-bucket buffer to _Y_buff
            # rather than appending to it; kept as in the original.
            self._real_buckets_buff = self._Y_buff
            self._predicted_buckets_buff.append(predicted_value_b)
            if is_original_data:
                self._original_real_buckets_buff.append(real_value_b)
                self._original_predicted_buckets_buff.append(predicted_value_b)

            # If no column is ignored, compute the accuracy for this bucket
            nr_missing_features = len(
                [x for x in features_existence if x in (False, 0)])
            if nr_missing_features == 0:
                if real_value_b not in self.bucket_accuracy:
                    self.bucket_accuracy[real_value_b] = []
                self.bucket_accuracy[real_value_b].append(
                    int(real_value_b == predicted_value_b))
        else:
            predicted_value_b = predicted_value
            real_value_b = real_value
            self._X_buff.append(features_existence)
            # Without buckets the target is simply "was the prediction right".
            self._Y_buff.append(real_value_b == predicted_value_b)
            self._real_buckets_buff.append(real_value_b)
            self._predicted_buckets_buff.append(predicted_value_b)
            if is_original_data:
                self._original_real_buckets_buff.append(real_value_b)
                self._original_predicted_buckets_buff.append(predicted_value_b)

    def get_accuracy_histogram(self):
        """
        Return per-bucket accuracies and the overall accuracy.

        :return: ({'buckets': [...], 'accuracies': [...]}, overall_accuracy).
            Buckets without observations get the overall accuracy.
        :raises ZeroDivisionError: if no bucket has any observation
        """
        x = []
        y = []

        total_correct = 0
        total_vals = 0

        buckets_with_no_observations = []

        for bucket in range(len(self.buckets)):
            observations = self.bucket_accuracy.get(bucket)
            if observations:
                total_correct += sum(observations)
                total_vals += len(observations)
                y.append(sum(observations) / len(observations))
            else:
                # If no observations were made for this bucket
                buckets_with_no_observations.append(bucket)
                y.append(None)
            x.append(bucket)

        validation_set_accuracy = total_correct / total_vals
        for bucket in buckets_with_no_observations:
            y[x.index(bucket)] = validation_set_accuracy

        return {'buckets': x, 'accuracies': y}, validation_set_accuracy

    def partial_fit(self):
        """
        # Fit the probabilistic validator on all observations recorded that haven't been taken into account yet
        """
        # errstate restores the previous floating-point error settings even
        # if fitting raises.
        with np.errstate(divide='ignore'):
            if self.buckets is not None:
                self._probabilistic_model.partial_fit(self._X_buff,
                                                      self._Y_buff,
                                                      classes=self.bucket_keys)
            else:
                self._probabilistic_model.partial_fit(self._X_buff,
                                                      self._Y_buff,
                                                      classes=[True, False])

        self._X_buff = []
        self._Y_buff = []

    def fit(self):
        """
        # Fit the probabilistic validator on all observations recorded that haven't been taken into account yet
        """
        with np.errstate(divide='ignore'):
            self._probabilistic_model.fit(self._X_buff, self._Y_buff)

        self._X_buff = []
        self._Y_buff = []

    def get_confusion_matrix(self):
        """
        Build a confusion matrix from the observations registered with
        ``is_original_data=True``.

        :return: dict with the integer 'matrix' and the bucket labels under
            both 'predicted' and 'real'
        """
        # The real values are passed first and the predicted values second.
        # NOTE(review): with scikit-learn's confusion_matrix that makes rows
        # the real values and columns the predicted ones - confirm which
        # function is imported here.
        labels = list(set(self._original_real_buckets_buff))

        matrix = confusion_matrix(self._original_real_buckets_buff,
                                  self._original_predicted_buckets_buff,
                                  labels=labels)

        value_labels = []
        for label in labels:
            try:
                value_labels.append(str(self.buckets[label]))
            except (IndexError, KeyError, TypeError):
                # No bucket list, or the label is not a valid bucket index.
                value_labels.append('UNKNOWN')

        confusion_matrix_obj = {
            'matrix': [[int(y) for y in x] for x in matrix],
            'predicted': value_labels,
            'real': value_labels
        }
        return confusion_matrix_obj

    @staticmethod
    def _sharpen_distribution(distribution):
        """
        @HACK: drop the low-probability tail and re-normalize.

        Values not above (mean - std) are zeroed, the rest are normalized,
        the smallest remaining value is subtracted, and the result is
        normalized again. If nothing survives the first cut (e.g. a uniform
        distribution) the input is returned unchanged instead of failing on
        an empty ``min()``.
        """

        def normalized(values):
            total = sum(values)
            # Avoid division by zero in certain edge cases
            total = 0.00001 if total == 0 else total
            return [v / total for v in values]

        mean = np.mean(distribution)
        std = np.std(distribution)
        sharpened = normalized(
            [x if x > (mean - std) else 0 for x in distribution])

        significant = [x for x in sharpened if x > 0.001]
        if not significant:
            return distribution

        min_val = min(significant)
        return normalized(
            [x - min_val if x > min_val else 0 for x in sharpened])

    def evaluate_prediction_accuracy(self, features_existence,
                                     predicted_value):
        """
        # Evaluate the probabilistic validator on an observation

        :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param predicted_value: The predicted value/label
        :return: A ProbabilityEvaluation built from the buckets, the class-probability distribution and the predicted value
        """
        if self.buckets is not None:
            predicted_value_b = get_value_bucket(predicted_value, self.buckets,
                                                 self.col_stats)
            # Same encoding as in register_observation().
            X = [False] * (len(self.buckets) + 1)
            X[predicted_value_b] = True
            X = [X + features_existence]
        else:
            X = [features_existence]

        distribution = self._probabilistic_model.predict_proba(np.array(X))[0]
        distribution = distribution.tolist()

        # Only sharpen when more than 4 classes carry noticeable probability.
        if len([x for x in distribution if x > 0.01]) > 4:
            distribution = self._sharpen_distribution(distribution)

        return ProbabilityEvaluation(self.buckets, distribution,
                                     predicted_value)
# NOTE(review): this is a fragment of a larger routine; desc_train,
# encoded_r, vectored_desc, the index arrays and the two accuracy lists are
# defined outside the visible code.
r_train = encoded_r[train_index]
module_train = df.MODULE.to_numpy()[train_index].reshape(-1, 1)
desc_test = vectored_desc[test_index]
r_test = encoded_r[test_index]
module_test = df.MODULE.to_numpy()[test_index].reshape(-1, 1)

# Train the Naive Bayes and Decision Tree models:
cnb = ComplementNB()
cnb.fit(desc_train, module_train)
dt = DecisionTreeClassifier()
dt.fit(r_train, module_train)

# Compute the probabilities:
desc_proba = cnb.predict_proba(desc_test)
r_proba = dt.predict_proba(r_test)

# To keep the frequency of each module's issue tickets from influencing the
# recommendation, convert the tree probabilities into a 0-1 vector so that
# they act only as a hard filter:
r_proba[r_proba > 0.000001] = 1
joint_proba = desc_proba * r_proba

# Recommend the top 3:
item_tree = predict_top_k(joint_proba, cnb.classes_)
item_none_tree = predict_top_k(desc_proba, cnb.classes_)

# Compute the accuracy, with and without the tree filter:
acc1 = accuracy(module_test, item_tree)
acc2 = accuracy(module_test, item_none_tree)
accs_with_tree.append(acc1)
accs_without_tree.append(acc2)
def evaluate_partitions(keep_bin_edges, df_processed):
    """
    Evaluate a lightweight classifier at every threshold.

    For each bin edge the data is partitioned at that threshold and a
    ComplementNB model is trained and scored ``num_trials`` times on the
    test and control sets.

    Inputs are a list of bin edges for the continuous target and the
    processed df. Returns a DataFrame with one row per (threshold, trial)
    holding the data percentile, the threshold, and the accuracy, MCC,
    ROC-AUC and average precision for both sets.
    """
    # binning parameters fixed - DO NOT CHANGE
    num_bins = 10
    num_trials = 10

    # Output columns, in the order they appear in the result.
    columns = (
        'data percentile',
        'threshold',
        'test accuracy',
        'test mcc',
        'test auc',
        'test ap',
        'control accuracy',
        'control mcc',
        'control auc',
        'control ap',
    )
    results = {name: [] for name in columns}

    # starting data percentile
    pct = 0.0

    for threshold in keep_bin_edges:
        # obtain the X,y matrices
        X, X_control, y = partition_data(df_processed, threshold)
        pct += 1 / num_bins

        for _ in range(num_trials):
            # training, testing, and control data-sets
            (x_train_idf, y_train, x_test_idf, y_test,
             x_control_idf) = split_transform_data(X, X_control, y)

            clf = ComplementNB(alpha=0.1,
                               class_prior=None,
                               fit_prior=True,
                               norm=False)
            clf.fit(x_train_idf, y_train)

            # accuracy
            results['test accuracy'].append(clf.score(x_test_idf, y_test))
            results['control accuracy'].append(clf.score(x_control_idf, y))

            # matthews correlation coefficient
            test_pred = clf.predict(x_test_idf)
            control_pred = clf.predict(x_control_idf)
            results['test mcc'].append(mcc(y_test, test_pred))
            results['control mcc'].append(mcc(y, control_pred))

            # roc-auc and average precision, both on column 1 of the
            # predicted probabilities
            test_proba = clf.predict_proba(x_test_idf)
            control_proba = clf.predict_proba(x_control_idf)
            results['test auc'].append(
                roc_auc_score(y_test, test_proba[:, 1]))
            results['control auc'].append(
                roc_auc_score(y, control_proba[:, 1]))
            results['test ap'].append(apscore(y_test, test_proba[:, 1]))
            results['control ap'].append(apscore(y, control_proba[:, 1]))

            results['threshold'].append(threshold)
            results['data percentile'].append(pct)

    # populate into a df for downstream analysis
    df_eval = pd.DataFrame()
    for name in columns:
        df_eval[name] = results[name]
    return df_eval
return (np.nan) Confusion['F1'] = Confusion.apply(f1, axis=1) Scores = Confusion.loc[:, 'Recall':'F1'].mean(axis=0) RecallList.append(Scores[0]) PrecisionList.append(Scores[1]) F1List.append(Scores[2]) """Initiate NB classifier to generate cheap points""" model = ComplementNB() featureTrain = permuData.loc[InitialIndices, 'X1':'X12'].values labelTrain = permuData.loc[InitialIndices, 'Label'].values model.fit(featureTrain, labelTrain) """Generate Cheap points""" ExcludedIndices = np.setdiff1d(np.array(permuData.index), InitialIndices) CheapPoint = model.predict(permuData.loc[ExcludedIndices, 'X1':'X12']) Prob = model.predict_proba(permuData.loc[ExcludedIndices, 'X1':'X12']) permuData.loc[ExcludedIndices, 'PredictLabel'] = CheapPoint permuData.loc[ExcludedIndices, 'Plow':'Phigh'] = Prob Indicator.append(np.nan) """Looping repeat until meet the termination criteria""" for i in range(79): # Binning cheap points into three categories: HHpoint, Uncertain point, LHpoint Uncertain = permuData[(permuData.Plow >= 0.4) & (permuData.Plow <= 0.6)] UncertainList.append(len(Uncertain)) HclassH = permuData[(permuData.PredictLabel == 1) & (permuData.Phigh > 0.6)] HclassList.append(len(HclassH)) LclassH = permuData[(permuData.PredictLabel == 0)
class ProbabilisticValidator():
    """
    # The probabilistic validator is a quick to train model used for validating the predictions of our main model
    # It is fit to the results our model gets on the validation set

    Observations are buffered by register_observation() and handed to the
    underlying Naive Bayes model by fit() / partial_fit().
    """
    # TODO: Autodetermine smoothing factor depending on the info we know about the dataset
    _smoothing_factor = 0.5
    _value_bucket_probabilities = {}
    _probabilistic_model = None
    X_buff = None
    Y_buff = None

    def __init__(self, col_stats, data_type=None):
        """
        Choose the algorithm to use for the rest of the model.
        As of right now we go with ComplementNB.

        :param col_stats: mapping read through 'data_type' and, optionally,
            'percentage_buckets'
        :param data_type: not used by this constructor
        """
        # <--- Pick one of the 3
        self._probabilistic_model = ComplementNB(alpha=self._smoothing_factor)
        #, class_prior=[0.5,0.5]
        #self._probabilistic_model = GaussianNB(var_smoothing=1)
        #self._probabilistic_model = MultinomialNB(alpha=self._smoothing_factor)

        self.X_buff = []
        self.Y_buff = []

        self.col_stats = col_stats

        if 'percentage_buckets' in col_stats:
            self.buckets = col_stats['percentage_buckets']
            self.bucket_keys = [i for i in range(len(self.buckets))]
        else:
            self.buckets = None

        self.data_type = col_stats['data_type']
        self.bucket_accuracy = {}

    def register_observation(self, features_existence, real_value,
                             predicted_value):
        """
        # Register an observation in the validator's internal buffers

        :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param real_value: The real value/label for this prediction
        :param predicted_value: The predicted value/label
        """
        # Compare by value: `x is 0` tests object identity and only happens
        # to work for small interned ints.
        nr_missing_features = len(
            [x for x in features_existence if x in (False, 0)])

        predicted_value = predicted_value if self.data_type != DATA_TYPES.NUMERIC else float(
            predicted_value)

        # A real value that cannot be parsed as a number is treated as
        # missing (None); a decimal comma is accepted.
        try:
            real_value = real_value if self.data_type != DATA_TYPES.NUMERIC else float(
                str(real_value).replace(',', '.'))
        except (TypeError, ValueError, OverflowError):
            real_value = None

        if self.buckets is not None:
            predicted_value_b = get_value_bucket(predicted_value, self.buckets,
                                                 self.col_stats)
            real_value_b = get_value_bucket(real_value, self.buckets,
                                            self.col_stats)

            # One-hot encode the predicted bucket (one spare slot), then
            # append the feature-existence flags.
            X = [False] * (len(self.buckets) + 1)
            X[predicted_value_b] = True
            X = X + features_existence

            self.X_buff.append(X)
            self.Y_buff.append(real_value_b)

            # If no column is ignored, compute the accuracy for this bucket
            if nr_missing_features == 0:
                if predicted_value_b not in self.bucket_accuracy:
                    self.bucket_accuracy[predicted_value_b] = []
                self.bucket_accuracy[predicted_value_b].append(
                    int(real_value_b == predicted_value_b))
        else:
            predicted_value_b = predicted_value
            real_value_b = real_value
            self.X_buff.append(features_existence)
            # Without buckets the target is simply "was the prediction right".
            self.Y_buff.append(real_value_b == predicted_value_b)

    def get_accuracy_histogram(self):
        """
        Return per-bucket accuracies and the overall accuracy.

        :return: ({'buckets': [...], 'accuracies': [...]}, overall_accuracy),
            covering only buckets that have observations
        :raises ZeroDivisionError: if no observation has been recorded
        """
        x = []
        y = []

        total_correct = 0
        total_vals = 0

        for bucket, observations in self.bucket_accuracy.items():
            total_correct += sum(observations)
            total_vals += len(observations)
            x.append(bucket)
            y.append(sum(observations) / len(observations))

        validation_set_accuracy = total_correct / total_vals
        return {'buckets': x, 'accuracies': y}, validation_set_accuracy

    def partial_fit(self):
        """
        # Fit the probabilistic validator on all observations recorded that haven't been taken into account yet
        """
        # errstate restores the previous floating-point error settings even
        # if fitting raises.
        with np.errstate(divide='ignore'):
            if self.buckets is not None:
                self._probabilistic_model.partial_fit(self.X_buff,
                                                      self.Y_buff,
                                                      classes=self.bucket_keys)
            else:
                self._probabilistic_model.partial_fit(self.X_buff,
                                                      self.Y_buff,
                                                      classes=[True, False])

        self.X_buff = []
        self.Y_buff = []

    def fit(self):
        """
        # Fit the probabilistic validator on all observations recorded that haven't been taken into account yet
        """
        with np.errstate(divide='ignore'):
            self._probabilistic_model.fit(self.X_buff, self.Y_buff)

        self.X_buff = []
        self.Y_buff = []

    def evaluate_prediction_accuracy(self, features_existence,
                                     predicted_value):
        """
        # Evaluate the probabilistic validator on an observation

        :param features_existence: A vector of 0 and 1 representing the existence of all the features (0 == not exists, 1 == exists)
        :param predicted_value: The predicted value/label
        :return: The probability (from 0 to 1) of our prediction being accurate (within the same histogram bucket as the real value)
        """
        if self.buckets is not None:
            predicted_value_b = get_value_bucket(predicted_value, self.buckets,
                                                 self.col_stats)
            # Same encoding as in register_observation().
            X = [False] * (len(self.buckets) + 1)
            X[predicted_value_b] = True
            X = [X + features_existence]
        else:
            X = [features_existence]

        #X = [[predicted_value_b, *features_existence]]
        with np.errstate(divide='ignore'):
            distribution = self._probabilistic_model.predict_proba(np.array(X))

        if self.buckets is not None:
            return ProbabilityEvaluation(
                self.buckets, distribution[0].tolist(),
                predicted_value).most_likely_probability
        else:
            # Column 1 of the [correct / incorrect] model.
            # NOTE(review): presumably the probability of True - confirm the
            # class order of the fitted model.
            return distribution[0][1]
def code(inputs, clsf, lng, level):
    """
    Assign classification codes to free-text ``inputs``.

    The inputs are translated to the training-data language when needed,
    vectorized with TF-IDF and classified with Complement Naive Bayes
    trained on the stored training data plus feedbacks. Inputs for which the
    model gives every class the same probability are retried through
    ``extend_inputs_dict``. When the coding rules say so, the predicted
    codes are transcoded to the requested classification through a
    crosswalk. Codes obtained from feedbacks alone are placed first.

    :param inputs: list of texts to code (not modified)
    :param clsf: reference of the requested classification
    :param lng: language of the inputs
    :param level: requested level; capped at the rules' ``max_level``
    :return: one list of codes per input, or ``[]`` when no coding rule,
        training data or crosswalk is available
    """
    # UPDATE: 08 September 2020 --------------------------------------------
    # Feedbacks were requested to have priority in coding to allow "faster
    # learning"; this patch prepends their results to the final output.
    feedback_outputs = code_from_feedbacks(inputs, clsf, lng, level)
    # ---------------------------------------------------------------------

    # Keep the original entries without modifications (needed for the
    # dictionary lookup later). This must be a copy: an alias would be
    # overwritten by the translation step below.
    inputs_original_lng = list(inputs)

    # Detect in which language the training data should be.
    try:
        rules = CodingRules.objects.get(classification=clsf)
        max_level = rules.max_level

        # language of training data
        languages = json.loads(rules.languages)
        if 'any' in languages:
            td_file_lng = languages['any']
        else:
            td_file_lng = languages[lng]

        # transcode from another classification after coding,
        # or use the training data available for the original
        if rules.recode_from != "this":
            later_trans_to = clsf
            clsf = rules.recode_from

        # cannot go deeper than max_level
        if level > max_level:
            level = max_level

        classification = Classification.objects.get(reference=clsf)
    except Exception:
        # No usable rule defined for this classification/language.
        return []

    # list of codes corresponding to the classification
    codes = Code.objects.filter(parent=classification)

    # load only data that is in the requested language
    tdf = TrainingDataFile.objects.filter(classification=clsf,
                                          language=td_file_lng)

    # it is possible that there is no data for the selected scheme
    if len(tdf) == 0:
        return []

    # If the language of the inputs differs from the training-data language
    # defined in the coding rules, translate the inputs so that coding still
    # works for languages that lack training data.
    if lng != td_file_lng:
        from_lng = lng
        if lng == 'ge':
            from_lng = 'de'
        translator = translate.Translator(from_lang=from_lng,
                                          to_lang=td_file_lng)
        # Build a new list so neither the caller's list nor the saved
        # originals are changed.
        inputs = [translator.translate(text) for text in inputs]

    # 1. tokenization, 2. stop-word removal, 3. lemmatization
    # (all defined in prepare_input)
    inputs = prepare_input(inputs, td_file_lng)

    # vectorizer to transform the data into tf-idf
    tf = TfidfVectorizer(analyzer="word",
                         ngram_range=(1, 2),
                         min_df=0,
                         sublinear_tf=True)

    # training data from files
    train = TrainingData.objects.filter(parent__in=tdf, level=level)
    train_text = [unidecode(t.text) for t in train]
    train_codes = [t.code for t in train]

    # training data collected through feedbacks
    feedbacks = Feedback.objects.filter(classification=clsf,
                                        language=td_file_lng,
                                        level=level)
    train_text = train_text + [fb.text for fb in feedbacks]
    train_codes = train_codes + [fb.code for fb in feedbacks]

    X = tf.fit_transform(train_text)

    # complement naive bayes
    model = ComplementNB()
    model.fit(X, train_codes)

    inputs = [unidecode(i) for i in inputs]
    inputs_tf = tf.transform(inputs)
    output = model.predict(inputs_tf)

    # Probabilities of each input belonging to every class. A row where
    # max(prob) - min(prob) == 0 means the model could not discriminate, and
    # that input is retried through the dictionaries.
    probs = model.predict_proba(inputs_tf)
    inputs_retrial = extend_inputs_dict(inputs_original_lng, probs, lng,
                                        td_file_lng)

    if len(inputs_retrial) > 0:
        inputs_retrial = [unidecode(i) for i in inputs_retrial]
        inputs_retrial_tf = tf.transform(inputs_retrial)

        outputs_retrial = model.predict(inputs_retrial_tf).tolist()

        # If even the dictionary variant has a flat probability row there is
        # no point in keeping a false code.
        probs_retrial = model.predict_proba(inputs_retrial_tf)
        for i, row in enumerate(probs_retrial):
            if max(row) - min(row) == 0:
                outputs_retrial[i] = None

        # Replace the undecided predictions, in order, with the retrials.
        # NOTE(review): assumes extend_inputs_dict returns one entry per
        # undecided input - confirm.
        for i, row in enumerate(probs):
            if max(row) - min(row) == 0:
                output[i] = outputs_retrial.pop(0)

    # If the training data was not available for the classification against
    # which the data should be coded, transcode to that classification.
    if rules.recode_from != 'this':
        try:
            crosswalk_file = CrosswalkFile.objects.get(
                classification_1=clsf, classification_2=later_trans_to)
            crosswalk = Crosswalk.objects.filter(parent=crosswalk_file)
        except Exception:
            return []

        re_outputs = []
        for predicted in output:
            recodes = crosswalk.filter(code_1=predicted)
            re_outputs.append([recode.code_2 for recode in recodes])

        # add results obtained through coding from feedbacks (PATCH)
        for i, recodes in enumerate(re_outputs):
            re_outputs[i] = feedback_outputs[i] + recodes
        return re_outputs

    output = [[out] for out in output]
    for i, predicted in enumerate(output):
        output[i] = feedback_outputs[i] + predicted
    return output
def bayesClassify(trainX, trainY, testX):
    """
    Fit a ComplementNB model and classify ``testX``.

    The model is trained on the absolute values of the features, so the
    same transform is applied to the test features; otherwise negative test
    values would be scored on a different scale than the model was fit on.

    :param trainX: training feature matrix
    :param trainY: training labels
    :param testX: feature matrix to classify
    :return: (predicted labels, class-probability matrix) for ``testX``
    """
    nbCLF = ComplementNB(alpha=44.5, norm=False)
    nbCLF.fit(np.absolute(trainX), trainY)

    # Apply the training-time transform once and reuse it for both outputs.
    testX_abs = np.absolute(testX)
    return nbCLF.predict(testX_abs), nbCLF.predict_proba(testX_abs)
def _report_complement_split(labels, predicted, scores, plot_name):
    """Print the confusion table, classification report and AUC of one split and plot its ROC curve."""
    print(pd.crosstab(labels, predicted))
    print(classification_report(labels, predicted))
    print('AUC: %0.2f' % roc_auc_score(labels, scores))
    plot_roc_curve(labels, scores, plot_name)


def _run_complement_stage(x_fit, y_fit, x_test, y_test, sample, fl_a3,
                          csv_path, train_plot, test_plot):
    """Fit one ComplementNB model, write the scored ``sample`` to ``csv_path`` and print train/test diagnostics."""
    model = ComplementNB()
    # Fit once; the fitted attributes are printed from this same model
    # instead of re-fitting just to read them.
    model.fit(x_fit, y_fit)
    print(model.feature_log_prob_)

    # Score the sample BEFORE adding the output columns, so the new
    # columns are never fed to the model as features.
    sample_proba = model.predict_proba(sample)
    sample_labels = model.predict(sample)
    sample['result'] = sample_labels
    # A single column cannot hold the full (n_samples, n_classes) matrix;
    # store column 1, the same column every AUC below is computed from.
    sample['probls'] = sample_proba[:, 1]
    sample['fl_severidade'] = fl_a3
    sample.to_csv(csv_path)

    # Each prediction is computed once and reused by every report below.
    train_scores = model.predict_proba(x_fit)[:, 1]
    test_scores = model.predict_proba(x_test)[:, 1]
    print('Treinamento AUC-ROC:{}'.format(roc_auc_score(y_fit, train_scores)))
    print('Validacao AUC-ROC:{}'.format(roc_auc_score(y_test, test_scores)))

    _report_complement_split(y_fit, model.predict(x_fit), train_scores, train_plot)
    _report_complement_split(y_test, model.predict(x_test), test_scores, test_plot)


def complement_bayes(x_train, x_test, y_train, y_test, x_train_1, y_train_1,
                     X, fl, amostra_paci3, fl_a3, nome):
    """Train and report two Complement Naive Bayes models on the same test set.

    The first model is fitted on ``x_train``/``y_train`` and the second on
    ``x_train_1``/``y_train_1`` (presumably a down-sampled training set,
    judging by the message printed between the two runs - confirm with the
    caller). For each model the function prints ``feature_log_prob_``, the
    train/test AUC, a confusion table and a classification report, plots
    ROC curves through ``plot_roc_curve``, and writes the scored sample to
    a CSV file.

    Parameters
    ----------
    x_train, y_train - Features and labels used to fit the first model.
    x_test, y_test - Held-out features and labels used to validate both
        models.
    x_train_1, y_train_1 - Features and labels used to fit the second model.
    X, fl, nome - Unused; kept so existing callers keep working.
    amostra_paci3 - DataFrame of samples to score. It is modified in place:
        the columns 'result', 'probls' and 'fl_severidade' are added.
    fl_a3 - Values stored in the 'fl_severidade' column of both outputs.

    Returns
    ----------
    None - Results are printed, plotted and written to
        'modelo_complement_bayes.csv' and 'modelo_complement_bayes_1.csv'.
    """
    # Snapshot the sample before the first run adds its output columns, so
    # the second model scores the original features only.
    amostra_paci3_n = amostra_paci3.copy()

    x_train = preprocessing.normalize(x_train)
    x_train_1 = preprocessing.normalize(x_train_1)
    x_test = preprocessing.normalize(x_test)
    # NOTE(review): the scored sample is not normalised like the training
    # and test matrices are - confirm whether that is intentional.

    _run_complement_stage(
        x_train, y_train, x_test, y_test, amostra_paci3, fl_a3,
        'modelo_complement_bayes.csv', 'naive_train', 'naive_test')

    print('verificação com down em treino')
    print()

    _run_complement_stage(
        x_train_1, y_train_1, x_test, y_test, amostra_paci3_n, fl_a3,
        'modelo_complement_bayes_1.csv', 'naive_train_1', 'naive_test_1')
# Report the dimensions of every train/test split.
_split_captions = (
    ("Number of rows/columns in X_test dataset: ", X_test),
    ("Number of rows/columns in y_test dataset: ", y_test),
    ("Number of rows/columns in X_train dataset: ", X_train),
    ("Number of rows/columns in y_train dataset: ", y_train),
)
for _caption, _split in _split_captions:
    print(_caption, _split.shape)


# ## Fit the model

# In[22]:


# class sklearn.naive_bayes.ComplementNB(alpha=1.0, fit_prior=True, class_prior=None, norm=False)
# fit() hands back the fitted estimator, so construction and training chain.
NB = ComplementNB().fit(X_train, y_train)
y_pred = NB.predict(X_test)
# Keep only the second probability column (index 1).
probs = NB.predict_proba(X_test)[:, 1]


# ## Print the accuracy reports and confusion matrix

# In[23]:


from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Same three reports as before, printed in the same order.
for _metric in (confusion_matrix, classification_report, accuracy_score):
    print(_metric(y_test, y_pred))


# # Dealing with unbalanced data:
#
# ## The data is unbalanced, indicated by two things:
# Multinomial Naive Bayes models (continued).
# Each accuracy_MNB_* score must come from the matching MultinomialNB
# model; the previous code scored the Bernoulli (BNB_*) models here by
# copy-paste mistake.
# NOTE(review): MNB_model_ti_14 is expected to be fitted just above this
# point, following the pattern of the groups below - confirm.
accuracy_MNB_14 = metrics.accuracy_score(
    Y_test, MNB_model_ti_14.predict(x_test_vec_tfidf_14))

MNB_model_ti_16 = MultinomialNB().fit(x_train_vec_tfidf_16, Y_train)
preds_MNB_tfidf_16 = MNB_model_ti_16.predict_proba(x_test_vec_tfidf_16)
accuracy_MNB_16 = metrics.accuracy_score(
    Y_test, MNB_model_ti_16.predict(x_test_vec_tfidf_16))

MNB_model_ti_610 = MultinomialNB().fit(x_train_vec_tfidf_610, Y_train)
preds_MNB_tfidf_610 = MNB_model_ti_610.predict_proba(x_test_vec_tfidf_610)
accuracy_MNB_610 = metrics.accuracy_score(
    Y_test, MNB_model_ti_610.predict(x_test_vec_tfidf_610))

# Complement Naive Bayes models: one per TF-IDF feature set, each with its
# class-probability matrix and test accuracy.
CNB_model_ti_12 = ComplementNB().fit(x_train_vec_tfidf_12, Y_train)
preds_CNB_tfidf_12 = CNB_model_ti_12.predict_proba(x_test_vec_tfidf_12)
accuracy_CNB_12 = metrics.accuracy_score(
    Y_test, CNB_model_ti_12.predict(x_test_vec_tfidf_12))

CNB_model_ti_14 = ComplementNB().fit(x_train_vec_tfidf_14, Y_train)
preds_CNB_tfidf_14 = CNB_model_ti_14.predict_proba(x_test_vec_tfidf_14)
accuracy_CNB_14 = metrics.accuracy_score(
    Y_test, CNB_model_ti_14.predict(x_test_vec_tfidf_14))

CNB_model_ti_16 = ComplementNB().fit(x_train_vec_tfidf_16, Y_train)
preds_CNB_tfidf_16 = CNB_model_ti_16.predict_proba(x_test_vec_tfidf_16)
accuracy_CNB_16 = metrics.accuracy_score(
    Y_test, CNB_model_ti_16.predict(x_test_vec_tfidf_16))

CNB_model_ti_610 = ComplementNB().fit(x_train_vec_tfidf_610, Y_train)
preds_CNB_tfidf_610 = CNB_model_ti_610.predict_proba(x_test_vec_tfidf_610)