Exemplo n.º 1
0
    def set_doctypes(self, doctype1, doctype2):
        """Validate the two doctype names and store them on the instance.

        Raises:
            ValueError: if both names are equal, or if either name is not a
                key of the mapping returned by ``Db().get_doctype_counts()``.
        """
        if doctype1 == doctype2:
            raise ValueError("Please enter two different doctypes")

        known_doctypes = Db().get_doctype_counts().keys()
        # Checked in argument order so the first unknown name is the one reported.
        for candidate in (doctype1, doctype2):
            if candidate not in known_doctypes:
                raise ValueError("Unknown doctype: " + candidate)

        self.doctype1, self.doctype2 = doctype1, doctype2
Exemplo n.º 2
0
    def execute(self):
        """Score every entry of the ``self.path_name`` directory and write a report.

        For each entry the per-word values returned by ``p_for_word`` are
        combined with ``p_from_list``. The entry name and the combined value
        are written to ``<last path component>_score.txt`` (relative to the
        current working directory), followed by a summary line. An entry is
        counted as positive when its value exceeds ``self.THRESHOLD``.

        The doctype document counts and word counts are stored on the
        instance before scoring (presumably read by ``p_for_word`` -- confirm).

        Returns:
            dict with the keys ``"count"`` (entries scored) and ``"positive"``
            (entries above the threshold).
        """
        import os

        db = Db()

        doctype_counts = db.get_doctype_counts()
        self.doctype1_count = doctype_counts.get(self.doctype1)
        self.doctype2_count = doctype_counts.get(self.doctype2)

        self.doctype1_word_count = db.get_words_count(self.doctype1)
        self.doctype2_word_count = db.get_words_count(self.doctype2)

        names = os.listdir(self.path_name)
        report_name = self.path_name.strip("/").split("/")[-1] + "_score.txt"

        count = 0
        positive = 0

        # The context manager closes the report even when scoring a file raises.
        with open(report_name, "w") as fout:
            for name in names:
                self.set_file_name(os.path.join(self.path_name, name))

                p = self.p_from_list([self.p_for_word(db, word) for word in self.words])
                fout.write("%s %1.4f\n" % (name, p))

                count += 1
                if p > self.THRESHOLD:
                    positive += 1

            # Float arithmetic avoids truncation under Python 2 integer division,
            # and an empty directory must not raise ZeroDivisionError.
            percent = 100.0 * positive / count if count else 0.0
            fout.write("\nRESULT: [ %d / %d ] %1.2f%%\n" % (positive, count, percent))

        return {"count": count, "positive": positive}
Exemplo n.º 3
0
	def execute(self):
		"""Learn from ``self.path_name`` and store the updated Db.

		When the path is a directory, its entries are learned one by one
		(each counted as one document) until ``self.count`` documents have
		been processed. When the path is a regular file, it is learned once
		and counted as ``self.count`` documents.

		Learning is best-effort: an error stops the run and is reported on
		stdout, and whatever was learned so far is still stored.

		Returns:
			dict with the keys 'ndoc' (documents counted) and 'nword'
			(sum of the values returned by ``learn_file``).
		"""
		result = {}
		self.db = Db()

		nword = 0
		ndoc = 0

		try:
			if os.path.isdir(self.path_name):
				for name in os.listdir(self.path_name):
					nword += self.learn_file(os.path.join(self.path_name, name), 1)
					ndoc += 1
					if ndoc >= self.count:
						break
			elif os.path.isfile(self.path_name):
				nword += self.learn_file(self.path_name, self.count)
				ndoc += self.count
		except Exception as e:
			# Exception (not a bare except) lets KeyboardInterrupt/SystemExit
			# propagate; the cause is printed instead of being discarded.
			print('learning unexception: %s' % e)

		result['ndoc'] = ndoc
		result['nword'] = nword

		self.db.store()

		return result
Exemplo n.º 4
0
	def __init__(self):
		"""Create a new ``Db`` instance and keep it on ``self.db``."""
		database = Db()
		self.db = database
Exemplo n.º 5
0
class Tfidf(Mode):
	"""Mode that writes the top TF-IDF-scored words of each file in a directory.

	NOTE(review): the scoring helpers always query the 'spam' and 'ham'
	doctypes. The doctypes given to set_doctypes() are validated and stored
	but do not take part in the score computed here -- confirm this is
	intended.
	"""
	MIN_WORD_COUNT = 5
	RARE_WORD_PROB = 0.5
	EXCLUSIVE_WORD_PROB = 0.99

	def __init__(self):
		"""Create the ``Db`` instance used for all lookups of this mode."""
		self.db = Db()

	def set_text(self, text):
		"""Split ``text`` with ``text_to_list`` and store the words on ``self.words``.

		Returns ``self``. Raises ValueError when no words were produced.
		"""
		words = text_to_list(text)

		if not words:
			raise ValueError('Text did not contain any valid words')

		self.words = words
		return self

	def set_path(self, path):
		"""Store ``path`` as ``self.path_name`` and return it."""
		self.path_name = path
		return self.path_name

	def set_file_name(self, file_name):
		"""Read ``file_name`` and load its contents through ``set_text``.

		Returns ``self`` (the value of ``set_text``).

		Raises:
			ValueError: if the file cannot be read or ``set_text`` rejects its
				contents; the original error message is embedded.
		"""
		try:
			self.file_name = file_name
			# The context manager closes the handle even when read() fails.
			with open(file_name, 'r') as f:
				file_contents = f.read()
			return self.set_text(file_contents)

		except Exception as e:
			raise ValueError('Unable to read specified file "%s", the error message was: %s' % (file_name, e))

	def set_doctypes(self, doctype1, doctype2):
		"""Validate the two doctype names and store them on the instance.

		Raises ValueError when the names are equal or when either one is not
		a key of ``self.db.get_doctype_counts()``.
		"""
		if doctype1 == doctype2:
			raise ValueError('Please enter two different doctypes')

		d = self.db.get_doctype_counts()
		if doctype1 not in d:
			raise ValueError('Unknown doctype: ' + doctype1)

		if doctype2 not in d:
			raise ValueError('Unknown doctype: ' + doctype2)

		self.doctype1 = doctype1
		self.doctype2 = doctype2

	def validate(self, args):
		"""Check the argument list and configure the path and doctypes.

		``args`` must have exactly five items; items 2, 3 and 4 are the path
		and the two doctypes. Raises ValueError otherwise.
		"""
		if len(args) != 5:
			raise ValueError('Usage: %s classify <file> <doctype> <doctype>' % args[0])

		self.set_path(args[2])
		self.set_doctypes(args[3], args[4])

	def tf_for_word(self, words, word, num_in_spam, num_in_ham):
		"""Return ``|ln(c_spam / num_in_spam + 1) - ln(c_ham / num_in_ham + 1)|``.

		``c_spam`` and ``c_ham`` are the Db word counts of ``word`` for the
		'spam' and 'ham' doctypes. ``words`` is not used; it is kept so
		existing callers keep working.
		"""
		db = self.db

		word_in_spam = db.get_word_count('spam', word)
		word_in_ham = db.get_word_count('ham', word)

		spam_term = math.log(float(word_in_spam) / num_in_spam + 1)
		ham_term = math.log(float(word_in_ham) / num_in_ham + 1)
		return abs(spam_term - ham_term)

	def idf_for_word(self, word):
		"""Return ``|ln((1 + N_spam) / (1 + df_spam)) - ln((1 + N_ham) / (1 + df_ham))|``.

		``N_*`` are the doctype counts and ``df_*`` the values of
		``get_word_doc_count`` for ``word``, both read from the Db.
		"""
		db = self.db
		dc = db.get_doctype_counts()
		num_docs_spam = dc.get('spam')
		num_docs_ham = dc.get('ham')
		term_num_docs_spam = db.get_word_doc_count('spam', word)
		term_num_docs_ham = db.get_word_doc_count('ham', word)

		spam_idf = math.log(float(1 + num_docs_spam) / (1 + term_num_docs_spam))
		ham_idf = math.log(float(1 + num_docs_ham) / (1 + term_num_docs_ham))
		return abs(spam_idf - ham_idf)

	def execute(self):
		"""Write the top-scored words of every entry in ``self.path_name``.

		For each entry, every distinct word gets the score
		``tf_for_word * idf_for_word``. The words are sorted by descending
		score and the first ``int(ln(len + 1)) * 10`` of them are written,
		space-separated, to ``<dir>_<entry>.tfidf`` in the current working
		directory. ``<dir>_tfidf.txt`` is created empty.

		Returns:
			The sorted ``(word, score)`` list of the last entry processed, or
			an empty list when the directory has no entries.
		"""
		import os

		db = self.db
		result = []

		d = db.get_doctype_counts()
		self.doctype1_count = d.get(self.doctype1)
		self.doctype2_count = d.get(self.doctype2)

		self.doctype1_word_count = db.get_words_count(self.doctype1)
		self.doctype2_word_count = db.get_words_count(self.doctype2)

		names = os.listdir(self.path_name)
		prefix = self.path_name.strip('/').split('/')[-1]

		# The summary file is created but nothing is written to it; the
		# context manager guarantees it is closed if an entry fails.
		with open(prefix + '_tfidf.txt', 'w'):
			num_in_spam = db.get_words_count('spam')
			num_in_ham = db.get_words_count('ham')

			for name in names:
				self.set_file_name(os.path.join(self.path_name, name))

				with open(prefix + '_' + name + '.tfidf', 'w') as f:
					tfidf = {}
					for word in self.words:
						# A repeated word would get the same score again;
						# skip it to avoid redundant Db lookups.
						if word in tfidf:
							continue
						tf = self.tf_for_word(self.words, word, num_in_spam, num_in_ham)
						idf = self.idf_for_word(word)
						tfidf[word] = tf * idf

					result = sorted(tfidf.items(), key=itemgetter(1), reverse=True)

					n = int(math.log(len(result) + 1)) * 10

					f.write(''.join("%s " % kw for kw, _ in result[:n]))

		return result

	def output(self, result):
		"""Print nothing; the results are already written to files by execute()."""
		pass
Exemplo n.º 6
0
class Chi(Mode):
    """Mode that writes the top chi-scored words of each file in a directory.

    NOTE(review): the score always uses the 'spam' and 'ham' doctypes. The
    doctypes given to set_doctypes() are validated and stored but are not
    used by the score computed here -- confirm this is intended.
    """

    MIN_WORD_COUNT = 5
    RARE_WORD_PROB = 0.5
    EXCLUSIVE_WORD_PROB = 0.99

    def __init__(self):
        """Create the ``Db`` instance used for all lookups of this mode."""
        self.db = Db()

    def set_text(self, text):
        """Split ``text`` with ``text_to_list`` and store the words on ``self.words``.

        Returns ``self``. Raises ValueError when no words were produced.
        """
        words = text_to_list(text)

        if not words:
            raise ValueError("Text did not contain any valid words")

        self.words = words
        return self

    def set_path(self, path):
        """Store ``path`` as ``self.path_name`` and return it."""
        self.path_name = path
        return self.path_name

    def set_file_name(self, file_name):
        """Read ``file_name`` and load its contents through ``set_text``.

        Returns ``self`` (the value of ``set_text``).

        Raises:
            ValueError: if the file cannot be read or ``set_text`` rejects its
                contents; the original error message is embedded.
        """
        try:
            self.file_name = file_name
            # The context manager closes the handle even when read() fails.
            with open(file_name, "r") as f:
                file_contents = f.read()
            return self.set_text(file_contents)

        except Exception as e:
            raise ValueError('Unable to read specified file "%s", the error message was: %s' % (file_name, e))

    def set_doctypes(self, doctype1, doctype2):
        """Validate the two doctype names and store them on the instance.

        Raises ValueError when the names are equal or when either one is not
        a key of ``self.db.get_doctype_counts()``.
        """
        if doctype1 == doctype2:
            raise ValueError("Please enter two different doctypes")

        d = self.db.get_doctype_counts()
        if doctype1 not in d:
            raise ValueError("Unknown doctype: " + doctype1)

        if doctype2 not in d:
            raise ValueError("Unknown doctype: " + doctype2)

        self.doctype1 = doctype1
        self.doctype2 = doctype2

    def validate(self, args):
        """Check the argument list and configure the path and doctypes.

        ``args`` must have exactly five items; items 2, 3 and 4 are the path
        and the two doctypes. Raises ValueError otherwise.
        """
        if len(args) != 5:
            raise ValueError("Usage: %s classify <file> <doctype> <doctype>" % args[0])

        self.set_path(args[2])
        self.set_doctypes(args[3], args[4])

    def chi(self, word, num_spam, num_ham):
        """Return ``(A*D - B*C)**2 / ((A + B) * (C + D))`` for ``word``.

        ``A`` and ``B`` are the values of ``get_word_doc_count`` for the
        'spam' and 'ham' doctypes; ``C = num_spam - A`` and
        ``D = num_ham - B``. The result is 0 when ``A * B == 0`` or when the
        denominator is 0.
        """
        A = self.db.get_word_doc_count("spam", word)
        B = self.db.get_word_doc_count("ham", word)
        C = num_spam - A
        D = num_ham - B

        denominator = (A + B) * (C + D)
        # C + D == 0 (A == num_spam and B == num_ham) would otherwise divide
        # by zero.
        if A * B == 0 or denominator == 0:
            return 0
        return float(pow(A * D - B * C, 2)) / denominator

    def execute(self):
        """Write the top-scored words of every entry in ``self.path_name``.

        For each entry, every distinct word is scored with ``chi``. The words
        are sorted by descending score and the first
        ``int(ln(len + 1)) * 10`` of them are written, space-separated, to
        ``<dir>_<entry>.chi`` in the current working directory.
        ``<dir>_chi.txt`` is created empty.

        Returns:
            The sorted ``(word, score)`` list of the last entry processed, or
            an empty list when the directory has no entries.
        """
        import os

        db = self.db
        result = []

        d = db.get_doctype_counts()
        num_spam = d["spam"]
        num_ham = d["ham"]

        names = os.listdir(self.path_name)
        prefix = self.path_name.strip("/").split("/")[-1]

        # The summary file is created but nothing is written to it; the
        # context manager guarantees it is closed if an entry fails.
        with open(prefix + "_chi.txt", "w"):
            for name in names:
                self.set_file_name(os.path.join(self.path_name, name))

                with open(prefix + "_" + name + ".chi", "w") as f:
                    chi = {}
                    for word in self.words:
                        # A repeated word would get the same score again;
                        # skip it to avoid redundant Db lookups.
                        if word in chi:
                            continue
                        chi[word] = self.chi(word, num_spam, num_ham)

                    result = sorted(chi.items(), key=itemgetter(1), reverse=True)

                    n = int(math.log(len(result) + 1)) * 10

                    f.write("".join("%s " % kw for kw, _ in result[:n]))

        return result

    def output(self, result):
        """Print nothing; the results are already written to files by execute()."""
        pass
Exemplo n.º 7
0
class Learn(Mode):
	"""Mode that learns word counts for one doctype from a file or a directory."""

	def validate(self, args):
		"""Check the argument list and store doc type, path and count.

		``args`` must have exactly five items; items 2, 3 and 4 are the doc
		type, the path and the integer count.

		Raises:
			ValueError: on a wrong number of arguments or a non-integer count.
		"""
		usage = 'Usage: %s learn <doc type> <dir> <count>' % args[0]

		if len(args) != 5:
			raise ValueError(usage)

		self.doc_type = args[2]
		self.path_name = args[3]
		try:
			self.count = int(args[4])
		except (TypeError, ValueError):
			raise ValueError(usage + '\nEnter an integer value for the "count" parameter')

	def learn_file(self, name, count):
		"""Learn the words of the file ``name`` for ``self.doc_type``.

		The file is split with ``text_to_list``, folded with
		``list_to_dict`` and passed to ``db.update_word_counts``; the doctype
		count is then updated with ``count``. Requires ``self.db`` to be set
		(``execute`` does this).

		Returns:
			The value returned by ``db.update_word_counts``.

		Raises:
			ValueError: if reading or learning the file fails; the original
				error message is embedded.
		"""
		db = self.db
		try:
			# The context manager closes the handle even when read() fails.
			with open(name, 'r') as f:
				file_contents = f.read()

			d = list_to_dict(text_to_list(file_contents))
			words_count = db.update_word_counts(d, self.doc_type)
			db.update_doctype_count(count, self.doc_type)

		except Exception as e:
			# Only names that exist in this scope are used here, so the real
			# error is reported instead of a NameError.
			raise ValueError('Unable to read specified file "%s", the error message was: %s' % (name, e))

		print('>> TRAINING [ %s ]: %5d words learned from "%s"' % (self.doc_type, words_count, name))

		return words_count

	def execute(self):
		"""Learn from ``self.path_name`` and store the updated Db.

		When the path is a directory, its entries are learned one by one
		(each counted as one document) until ``self.count`` documents have
		been processed. When the path is a regular file, it is learned once
		and counted as ``self.count`` documents.

		Learning is best-effort: an error stops the run and is reported on
		stdout, and whatever was learned so far is still stored.

		Returns:
			dict with the keys 'ndoc' (documents counted) and 'nword'
			(sum of the values returned by ``learn_file``).
		"""
		result = {}
		self.db = Db()

		nword = 0
		ndoc = 0

		try:
			if os.path.isdir(self.path_name):
				for name in os.listdir(self.path_name):
					nword += self.learn_file(os.path.join(self.path_name, name), 1)
					ndoc += 1
					if ndoc >= self.count:
						break
			elif os.path.isfile(self.path_name):
				nword += self.learn_file(self.path_name, self.count)
				ndoc += self.count
		except Exception as e:
			# Exception (not a bare except) lets KeyboardInterrupt/SystemExit
			# propagate; the cause is printed instead of being discarded.
			print('learning unexception: %s' % e)

		result['ndoc'] = ndoc
		result['nword'] = nword

		self.db.store()

		return result

	def output(self, result):
		"""Print the document and word totals from the dict returned by execute()."""
		print('>> TRAINING [ %s ]: %5d document(s), %5d words learned from "%s"' % (self.doc_type, result['ndoc'], result['nword'], self.path_name))