Пример #1
0
def main(config_file='classify.conf'):
	"""Run the configured classifier over every fold and report accuracy.

	Loads the configuration, builds one shared Preprocessor, then for each
	fold returned by ``config.get_folds()``: creates an Evaluation and a
	classifier, calls ``training`` and ``testing``, and prints the value
	returned by ``ev.calculate``. Finally prints the mean of the per-fold
	values.

	config_file: path handed to Configuration; defaults to 'classify.conf'.

	Any exception raised while loading the configuration is reported on
	stdout and re-raised.
	"""
	# Load configuration from file
	config = Configuration(config_file=config_file)
	try:
		config.load_configuration()
		config_data = config.get_configuration()
	except Exception:
		# Single-argument print(...) behaves the same on Python 2 and 3.
		print("Error loading configuration file.")
		print("Classifier aborting.")
		raise

	print(config)

	myfolds = config.get_folds()
	fold_count = len(myfolds)
	if fold_count == 0:
		# Without this guard the final average divides by zero.
		print("No folds configured; nothing to evaluate.")
		return

	# Preprocessor: tokenizer, stemmer, etc.
	# Raw string keeps the regex escape valid on every Python version.
	prep = Preprocessor(
		pattern=r'\W+',
		lower=config_data['lower'],
		stem=config_data['stem'],
		pos=config_data['pos'],
		ngram=config_data['ngram'],
	)

	# The classifier choice and k do not depend on the fold, so resolve them once.
	classifier_name = config_data['classifier']
	if classifier_name == 'rocchio':
		classifier_cls = Rocchio
	elif classifier_name == 'knn':
		classifier_cls = KNN
	else:
		# Any other value falls back to Naive Bayes.
		classifier_cls = NaiveBayes
	k = config_data['k']

	correctness = 0
	for myfold in myfolds:
		ev = Evaluation(config=config, fold=myfold)
		ml = classifier_cls(verbose=VERBOSE, fold=myfold, config=config, ev=ev)
		training(config, myfold, ml, prep)
		ml.do_padding()
		ml.calculate_training_data()
		ml.diagnose()
		testing(config, myfold, ml, ev, prep)

		results = ev.calculate(review_spam=True, k=k)
		print('Accuracy for fold %d: %s' % (myfold, results))

		correctness += results

	print("\nAverage accuracy for all folds: %s" % (correctness / fold_count,))
Пример #2
0
def main(config_file='/home/huma/Downloads/irlib-0.1.1/irlib/classify.conf'):
    """Run the configured classifier over every fold and report accuracy.

    Loads the configuration, builds one shared Preprocessor, then for each
    fold returned by ``config.get_folds()``: creates an Evaluation and a
    classifier, calls ``training`` and ``testing``, and prints the value
    returned by ``ev.calculate``. Finally prints the mean of the per-fold
    values.

    config_file: path handed to Configuration. The default is the original
        machine-specific location, kept so existing callers behave the
        same; pass your own path to run elsewhere.

    Any exception raised while loading the configuration is reported on
    stdout and re-raised.
    """
    # Load configuration from file
    config = Configuration(config_file=config_file)
    try:
        config.load_configuration()
        config_data = config.get_configuration()
    except Exception:
        print("Error loading configuration file.")
        print("Classifier aborting.")
        raise

    print(config)

    myfolds = config.get_folds()
    fold_count = len(myfolds)
    if fold_count == 0:
        # Without this guard the final average divides by zero.
        print("No folds configured; nothing to evaluate.")
        return

    # Preprocessor: tokenizer, stemmer, etc.
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    prep = Preprocessor(
        pattern=r'\W+',
        lower=config_data['lower'],
        stem=config_data['stem'],
        pos=config_data['pos'],
        ngram=config_data['ngram'],
    )

    # The classifier choice and k do not depend on the fold, so resolve them once.
    classifier_name = config_data['classifier']
    if classifier_name == 'rocchio':
        classifier_cls = Rocchio
    elif classifier_name == 'knn':
        classifier_cls = KNN
    else:
        # Any other value falls back to Naive Bayes.
        classifier_cls = NaiveBayes
    k = config_data['k']

    correctness = 0
    for myfold in myfolds:
        ev = Evaluation(config=config, fold=myfold)
        ml = classifier_cls(verbose=VERBOSE, fold=myfold, config=config, ev=ev)
        training(config, myfold, ml, prep)
        ml.do_padding()
        ml.calculate_training_data()
        ml.diagnose()
        testing(config, myfold, ml, ev, prep)

        results = ev.calculate(review_spam=True, k=k)
        print('Accuracy for fold %d: %s' % (myfold, results))

        correctness += results

    print("Average accuracy for all folds:", correctness / fold_count)
Пример #3
0
	def training(self, first_n_files=10000):
		"""Build a Rocchio model from the training collection.

		Creates a fresh Evaluation and Rocchio classifier (stored on
		``self.ev`` and ``self.ml``), passes the classifier to
		``parse_files`` together with "sgms/TRAIN.jdb" in 'training' mode,
		then calls the classifier's padding, training-data calculation and
		diagnostics methods in that order.

		first_n_files: forwarded unchanged to ``parse_files``.
		"""
		evaluator = Evaluation()
		self.ev = evaluator

		classifier = Rocchio(
			verbose=VERBOSE,
			fold='n/a',
			config=self.config,
			ev=evaluator,
		)
		self.ml = classifier

		self.parse_files(
			fold="sgms/TRAIN.jdb",
			mode='training',
			first_n_files=first_n_files,
			ml=classifier,
			config=self.config,
			prep=self.prep,
		)

		# Post-processing steps run on whatever classifier is stored on self.
		model = self.ml
		model.do_padding()
		model.calculate_training_data()
		model.diagnose()
Пример #4
0
class Classify():
	"""Train and evaluate a Rocchio classifier on JSON document files.

	The methods depend on state set by earlier ones: ``setConfig`` sets
	``self.config``; ``training`` needs it and sets ``self.ev`` and
	``self.ml``; ``testing`` and ``result`` need those.
	"""

	def __init__(self, printMessage=None):
		"""Create the tokenizer and remember the output callback.

		printMessage: one-argument callable used for all output. When
			omitted, messages are printed to stdout instead of failing
			on the first attempt to call ``None``.
		"""
		self.prep = SimpleAnalyzer(expression=r"[A-Za-z]*", gaps=False)
		if printMessage is None:
			printMessage = self._print
		self.printMessage = printMessage

	@staticmethod
	def _print(message):
		"""Default output callback: print the message to stdout."""
		print(message)

	def setConfig(self):
		"""Load 'classify.conf' into ``self.config`` and report it.

		Any exception raised while loading is reported through
		``printMessage`` and re-raised.
		"""
		self.config = Configuration(config_file='classify.conf')
		try:
			self.config.load_configuration()
			# Called only to surface loading problems here; the value is unused.
			self.config.get_configuration()
		except Exception:
			self.printMessage("Error loading configuration file.")
			self.printMessage("Classifier aborting.")
			raise

		self.printMessage(self.config)

	def parse_files(self, fold=1, mode = "training", first_n_files = 500, ml=object, config=object, prep=object):
		"""Feed at most ``first_n_files`` documents from a JSON file to ``ml``.

		The file must hold a JSON object whose values each provide 'id',
		'content' and 'topics' entries. Each document is tokenized with
		``prep`` (empty tokens dropped) and added once per topic.

		fold: path of the JSON file to read.
		mode: 'training' adds documents via ``ml.add_doc``; any other
			value adds them via ``ml.add_query``.
		first_n_files: maximum number of documents to process.
		ml: Object for our classifier class (Rocchio, kNN, etc).
		config: Our configuration class (class as in OOP not ML); accepted
			for interface compatibility, not used here.
		prep: Preprocessor callable; returns tokens exposing ``.text``.
		"""
		# The context manager closes the file even if parsing fails.
		with open(fold, 'r') as fd:
			objs = json.load(fd)

		length = min(len(objs), first_n_files)
		self.printMessage("%s data length: %d" % (mode, length))

		for counter, entry in enumerate(objs.values(), 1):
			# counter is 1-based, so this stops after exactly first_n_files
			# documents (the old pre-increment check let one extra through).
			if counter > first_n_files:
				break
			if counter % 100 == 0:
				# Progress report every 100 documents.
				self.printMessage(counter)

			doc_id = entry['id']
			terms = [token.text for token in prep(entry['content']) if token.text != ""]

			for class_name in entry['topics']:
				if mode == 'training':
					ml.add_doc(doc_id = doc_id, doc_class=class_name, doc_terms=terms)
				else:
					# The expected class comes from the document's topics list.
					ml.add_query(query_id = doc_id, query_class=class_name, query_terms=terms)

	def training(self, first_n_files=10000):
		"""Create a Rocchio classifier and train it on "sgms/TRAIN.jdb".

		Stores a fresh Evaluation on ``self.ev`` and the classifier on
		``self.ml``. Requires ``self.config`` (see ``setConfig``).
		"""
		self.ev = Evaluation()
		self.ml = Rocchio(verbose=VERBOSE, fold='n/a', config=self.config, ev=self.ev)
		self.parse_files(fold="sgms/TRAIN.jdb", mode='training', first_n_files=first_n_files, ml=self.ml, config=self.config, prep=self.prep)
		self.ml.do_padding()
		self.ml.calculate_training_data()
		self.ml.diagnose()

	def testing(self, first_n_files=10000):
		"""Load "sgms/TEST.jdb" as queries and compare them with the model.

		Requires ``self.ml`` (see ``training``).
		"""
		self.parse_files(fold="sgms/TEST.jdb", mode='testing', first_n_files=first_n_files, ml=self.ml, config=self.config, prep=self.prep)
		self.ml.compare_queries()

	def result(self):
		"""Report the evaluation result through ``printMessage``.

		Uses the 'k' value from the configuration.
		"""
		results = self.ev.calculate(review_spam=True, k=self.config.get_configuration()['k'])
		self.printMessage(results)