def fit(
    self,
    text: Union[str, Iterable[str], Iterable[Data], pd.DataFrame],
    category: Union[str, Iterable[str]] = None,
) -> TextClassifier:
    '''Fit the wrapped model on a token-presence table built from the text.

    The input is normalised through ``DataSet.FromAny``. Every item yields
    one row holding a 1/0 flag per known token (1 when the token is a member
    of the item's ``table``) followed by the item's ``score``, which is
    stored under the ``Data.CATEGORY_NAME`` column.

    Args:
        text: Input forwarded unchanged to ``DataSet.FromAny``.
        category: Optional category information, also forwarded to
            ``DataSet.FromAny``.

    Returns:
        This instance, so calls can be chained.
    '''
    samples = DataSet.FromAny(text, category)

    labels = []
    vocabulary = {}  # dict used as an insertion-ordered set of tokens
    observations = []
    for sample in samples:
        labels.append((sample.category, sample.score))
        vocabulary.update(dict.fromkeys(sample.tokens, 1))
        observations.append((sample.table, sample.score))
        self.total_documents += 1

    columns = list(vocabulary)
    self.__add_category(labels)
    self.__add_token(columns)

    # One row per item: presence flags in column order, then the score.
    rows = []
    for table, score in observations:
        row = [1 if token in table else 0 for token in columns]
        row.append(score)
        rows.append(row)

    # Appended in place: this is the same list object that was handed to
    # __add_token above, so the column name is added to that list as well.
    columns.append(Data.CATEGORY_NAME)
    frame = pd.DataFrame(rows, columns=columns)
    self.model.fit(frame, Data.CATEGORY_NAME)
    return self
def fit(
    self,
    text: Union[str, Iterable[str], Iterable[Data], pd.DataFrame],
    category: Union[str, Iterable[str]] = None,
) -> NaiveBayes:
    '''Accumulate per-category document and token counts from the text.

    The input is normalised through ``DataSet.FromAny``. For every item the
    document counters are incremented and each ``(token, frequency)`` pair
    of the item's ``table`` is added to the frequency and word-count totals
    of the item's category.

    Args:
        text: Input forwarded unchanged to ``DataSet.FromAny``.
        category: Optional category information, also forwarded to
            ``DataSet.FromAny``.

    Returns:
        This instance, so calls can be chained.
    '''
    for sample in DataSet.FromAny(text, category):
        label = sample.category

        # Register the category before touching any of its counters.
        self.__add_category(label)

        # One more document for this category, and one more overall.
        self.documents[label] += 1
        self.total_documents += 1

        # Fold the item's token frequencies into the category statistics.
        for word, count in sample.table.items():
            # Register the token in the vocabulary.
            self.__add_token(word)

            # Accumulate how often this token occurred in this category.
            per_category = self.word_frequency[label]
            if word in per_category:
                per_category[word] += count
            else:
                per_category[word] = count

            # Running total of all token occurrences in this category.
            self.word_count[label] += count

    return self