Пример #1
0
    def classify(self, text, language):
        '''Estimate the probability that text is relevant to the topic.

        The text is wrapped in an Entry and tokenised for every token size
        from 1 to self.MAX_TOKEN_SIZE.  Each token contributes a probability
        p_i = weight / count read from self.word_dict.words[language]; a
        token without usable statistics contributes a neutral 0.5.  The
        per-token values are combined according to Bayes' theorem:

                          p1*p2*...*pn                         a
            P = ---------------------------------------- = -------
                p1*p2*...*pn + (1-p1)*(1-p2)*...*(1-pn)     a + b

        Side effect: an empty dictionary is stored under language in
        self.word_dict.words when that language has no entry yet.

        @param text      text to classify
        @param language  key selecting the per-language token statistics
        @result  P as a float; 0 when a + b == 0; -1 when P is exactly 0.5
                 (for example when the text yields no known token)
        '''
        input_entry = Entry(id=None, guid=None, entry=text, language=language)
        # setdefault() both registers a missing language and hands back the
        # per-language table, so it is looked up only once.
        known_tokens = self.word_dict.words.setdefault(language, {})

        a = 1.0  # running product of p_i
        b = 1.0  # running product of (1 - p_i)
        # range() instead of xrange() so the method runs on Python 2 and 3.
        for size in range(1, self.MAX_TOKEN_SIZE + 1):
            for token in input_entry.get_token(size):
                token_stats = known_tokens.get(token)
                if token_stats is None or not token_stats['count']:
                    # Unknown token, or a zero count that would otherwise
                    # raise ZeroDivisionError: treat it as neutral evidence.
                    probability = 0.5
                else:
                    # float() forces true division; under Python 2 two
                    # integer operands would truncate the ratio to 0 or 1.
                    probability = (float(token_stats['weight'])
                                   / token_stats['count'])
                a *= probability
                b *= 1 - probability

        # NOTE(review): both products are zero when they underflow on very
        # long inputs, or when the tokens mix p == 0 and p == 1; the original
        # contract of returning 0 in that situation is kept.
        if a + b == 0:
            return 0

        result = a / (a + b)
        if result == 0.5:
            # Exactly balanced evidence is reported with the sentinel -1.
            return -1
        return result
Пример #2
0
    def classify(self, text, language):
        '''Estimate the probability that text is relevant to the topic.

        The text is wrapped in an Entry and tokenised for every token size
        from 1 to self.MAX_TOKEN_SIZE.  Each token contributes a probability
        p_i = weight / count read from self.word_dict.words[language]; a
        token without usable statistics contributes a neutral 0.5.  The
        per-token values are combined according to Bayes' theorem:

                          p1*p2*...*pn                         a
            P = ---------------------------------------- = -------
                p1*p2*...*pn + (1-p1)*(1-p2)*...*(1-pn)     a + b

        Side effect: an empty dictionary is stored under language in
        self.word_dict.words when that language has no entry yet.

        @param text      text to classify
        @param language  key selecting the per-language token statistics
        @result  P as a float; 0 when a + b == 0; -1 when P is exactly 0.5
                 (for example when the text yields no known token)
        '''
        input_entry = Entry(id=None, guid=None, entry=text, language=language)
        # setdefault() both registers a missing language and hands back the
        # per-language table, so it is looked up only once.
        known_tokens = self.word_dict.words.setdefault(language, {})

        a = 1.0  # running product of p_i
        b = 1.0  # running product of (1 - p_i)
        # range() instead of xrange() so the method runs on Python 2 and 3.
        for size in range(1, self.MAX_TOKEN_SIZE + 1):
            for token in input_entry.get_token(size):
                token_stats = known_tokens.get(token)
                if token_stats is None or not token_stats['count']:
                    # Unknown token, or a zero count that would otherwise
                    # raise ZeroDivisionError: treat it as neutral evidence.
                    probability = 0.5
                else:
                    # float() forces true division; under Python 2 two
                    # integer operands would truncate the ratio to 0 or 1.
                    probability = (float(token_stats['weight'])
                                   / token_stats['count'])
                a *= probability
                b *= 1 - probability

        # NOTE(review): both products are zero when they underflow on very
        # long inputs, or when the tokens mix p == 0 and p == 1; the original
        # contract of returning 0 in that situation is kept.
        if a + b == 0:
            return 0

        result = a / (a + b)
        if result == 0.5:
            # Exactly balanced evidence is reported with the sentinel -1.
            return -1
        return result