def classify(self, text, language):
    '''Estimate the probability that text is relevant to the topic.

    The text is wrapped in an Entry and tokens are requested from it via
    get_token(i) for every i from 1 to self.MAX_TOKEN_SIZE. Each token
    contributes a probability p: weight / count taken from the word
    dictionary of the given language, or a neutral 0.5 when the token is
    not in that dictionary. The per-token values are combined as

                    p1 * p2 * ... * pn                          a
        P = ------------------------------------------------ = -----
            p1 * p2 * ... * pn + (1-p1) * (1-p2) * ... * (1-pn)  a + b

    Side effect: an empty word table is created for language in
    self.word_dict.words when none exists yet.

    @param text      text to classify
    @param language  key selecting the per-language word table
    @result probability that text is relevant, with two sentinel values:
            0  when both products collapse to zero (a + b == 0), and
            -1 when the combined result is exactly 0.5, which is what
               happens when no token carries any information (for
               example no tokens at all, or only unknown tokens)
    '''
    input_entry = Entry(id=None, guid=None, entry=text, language=language)

    # setdefault both guarantees the per-language table exists and hands it
    # back, so it is looked up once instead of twice per token.
    language_words = self.word_dict.words.setdefault(language, {})

    a = 1.0  # running product of p_i
    b = 1.0  # running product of (1 - p_i)
    for i in xrange(1, self.MAX_TOKEN_SIZE + 1):
        for token in input_entry.get_token(i):
            if token not in language_words:
                # Unknown token: neutral evidence.
                probability = 0.5
            else:
                token_stats = language_words[token]
                # float() forces true division; with integer statistics the
                # plain "/" truncates to 0 or 1 under Python 2 semantics.
                probability = float(token_stats['weight']) / token_stats['count']
            a *= probability
            b *= 1 - probability

    total = a + b
    if total == 0:
        # Both products vanished (underflow or contradictory 0/1 evidence);
        # avoid dividing by zero.
        return 0

    result = a / total
    if result == 0.5:
        # Exactly neutral outcome is reported as "undecided".
        return -1
    return result