def main(config_file='classify.conf'):
    """Train and evaluate the configured classifier on every fold.

    Loads the configuration, builds one preprocessor, then for each fold
    trains a fresh classifier, tests it, and prints the value returned by
    the evaluation. Finally prints the mean of those values over all folds.

    config_file: path of the configuration file to load. Defaults to the
        file name that used to be hard-coded.

    Raises: re-raises whatever the configuration loader raised, after
        printing an abort notice.
    """
    config = Configuration(config_file=config_file)
    try:
        config.load_configuration()
        config_data = config.get_configuration()
    except Exception:
        # Report and propagate; KeyboardInterrupt/SystemExit pass through
        # untouched instead of being reported as a configuration error.
        print("Error loading configuration file.")
        print("Classifier aborting.")
        raise
    print(config)

    myfolds = config.get_folds()
    correctness = 0

    # Preprocessor: tokenizer, stemmer, etc.
    # Raw string so the backslash reaches the regex engine unchanged.
    prep = Preprocessor(pattern=r'\W+',
                        lower=config_data['lower'],
                        stem=config_data['stem'],
                        pos=config_data['pos'],
                        ngram=config_data['ngram'])

    for myfold in myfolds:
        ev = Evaluation(config=config, fold=myfold)
        classifier_name = config_data['classifier']
        # Any name other than 'rocchio' or 'knn' selects NaiveBayes.
        if classifier_name == 'rocchio':
            ml = Rocchio(verbose=VERBOSE, fold=myfold, config=config, ev=ev)
        elif classifier_name == 'knn':
            ml = KNN(verbose=VERBOSE, fold=myfold, config=config, ev=ev)
        else:
            ml = NaiveBayes(verbose=VERBOSE, fold=myfold, config=config, ev=ev)

        training(config, myfold, ml, prep)
        ml.do_padding()
        ml.calculate_training_data()
        ml.diagnose()

        testing(config, myfold, ml, ev, prep)
        results = ev.calculate(review_spam=True, k=config_data['k'])
        print('Accuracy for fold %d: %s' % (myfold, results))
        correctness += results

    fold_count = len(myfolds)
    if fold_count == 0:
        # Nothing was evaluated; averaging would divide by zero.
        print("No folds configured; nothing to evaluate.")
        return
    # Single-argument print() produces the same output on Python 2 and 3.
    print("\nAverage accuracy for all folds: %s" % (correctness / fold_count,))
def main(config_file='/home/huma/Downloads/irlib-0.1.1/irlib/classify.conf'):
    """Train and evaluate the configured classifier on every fold.

    Loads the configuration, builds one preprocessor, then for each fold
    trains a fresh classifier, tests it, and prints the value returned by
    the evaluation. Finally prints the mean of those values over all folds.

    config_file: path of the configuration file to load. The default is the
        machine-specific path that used to be hard-coded, kept so existing
        callers behave as before; pass a path to run elsewhere.

    Raises: re-raises whatever the configuration loader raised, after
        printing an abort notice.
    """
    config = Configuration(config_file=config_file)
    try:
        config.load_configuration()
        config_data = config.get_configuration()
    except Exception:
        # Report and propagate; KeyboardInterrupt/SystemExit pass through
        # untouched instead of being reported as a configuration error.
        print("Error loading configuration file.")
        print("Classifier aborting.")
        raise
    print(config)

    myfolds = config.get_folds()
    correctness = 0

    # Preprocessor: tokenizer, stemmer, etc.
    # Raw string so the backslash is not treated as a string escape.
    prep = Preprocessor(pattern=r'\W+',
                        lower=config_data['lower'],
                        stem=config_data['stem'],
                        pos=config_data['pos'],
                        ngram=config_data['ngram'])

    for myfold in myfolds:
        ev = Evaluation(config=config, fold=myfold)
        classifier_name = config_data['classifier']
        # Any name other than 'rocchio' or 'knn' selects NaiveBayes.
        if classifier_name == 'rocchio':
            ml = Rocchio(verbose=VERBOSE, fold=myfold, config=config, ev=ev)
        elif classifier_name == 'knn':
            ml = KNN(verbose=VERBOSE, fold=myfold, config=config, ev=ev)
        else:
            ml = NaiveBayes(verbose=VERBOSE, fold=myfold, config=config, ev=ev)

        training(config, myfold, ml, prep)
        ml.do_padding()
        ml.calculate_training_data()
        ml.diagnose()

        testing(config, myfold, ml, ev, prep)
        results = ev.calculate(review_spam=True, k=config_data['k'])
        print('Accuracy for fold %d: %s' % (myfold, results))
        correctness += results

    fold_count = len(myfolds)
    if fold_count == 0:
        # Nothing was evaluated; averaging would divide by zero.
        print("No folds configured; nothing to evaluate.")
        return
    print("Average accuracy for all folds:", correctness / fold_count)
def training(self, first_n_files=10000):
    """Build a Rocchio classifier from the training dump.

    Creates a new evaluation object and classifier, stores them on the
    instance as ``ev`` and ``ml``, feeds the classifier from
    "sgms/TRAIN.jdb" via ``parse_files``, then runs its padding,
    training-data calculation and diagnosis steps in that order.

    first_n_files: forwarded unchanged to ``parse_files``.
    """
    self.ev = Evaluation()
    classifier = Rocchio(
        verbose=VERBOSE,
        fold='n/a',
        config=self.config,
        ev=self.ev,
    )
    self.ml = classifier

    parse_arguments = dict(
        fold="sgms/TRAIN.jdb",
        mode='training',
        first_n_files=first_n_files,
        ml=classifier,
        config=self.config,
        prep=self.prep,
    )
    self.parse_files(**parse_arguments)

    # Post-processing once every training document has been added.
    classifier.do_padding()
    classifier.calculate_training_data()
    classifier.diagnose()
class Classify():
    """Train and evaluate a Rocchio classifier on JSON document dumps.

    All progress and result output goes through the ``printMessage``
    callable instead of being printed directly. Expected call order:
    ``setConfig``, ``training``, ``testing``, ``result``; the later steps
    read attributes (``config``, ``ml``, ``ev``) set by the earlier ones.
    """

    def __init__(self, printMessage=None):
        """Create the tokenizer and remember the message sink.

        printMessage: callable taking one argument. When omitted, messages
            are written to standard output (previously the ``None`` default
            was called directly and failed on the first message).
        """
        self.prep = SimpleAnalyzer(expression=r"[A-Za-z]*", gaps=False)
        if printMessage is None:
            printMessage = self._write_to_stdout
        self.printMessage = printMessage

    def _write_to_stdout(self, message):
        """Fallback message sink: write *message* and a newline to stdout."""
        import sys
        sys.stdout.write("%s\n" % (message,))

    def setConfig(self):
        """Load 'classify.conf' into ``self.config`` and report it.

        Raises: re-raises the loader's exception after reporting the abort
            through ``printMessage``.
        """
        self.config = Configuration(config_file='classify.conf')
        try:
            self.config.load_configuration()
            # Result unused; called inside the try so that a failure here is
            # reported the same way as a load failure.
            self.config.get_configuration()
        except Exception:
            self.printMessage("Error loading configuration file.")
            self.printMessage("Classifier aborting.")
            raise
        self.printMessage(self.config)

    def parse_files(self, fold=1, mode="training", first_n_files=500,
                    ml=object, config=object, prep=object):
        """Feed at most ``first_n_files`` documents from a JSON file to ``ml``.

        The file must contain a JSON object whose values are documents with
        'id', 'content' and 'topics' entries. Each document is tokenized
        with ``prep`` and added to ``ml`` once per topic.

        fold: path of the JSON file to read (the defaults of this and the
            object parameters are kept only for interface compatibility).
        mode: 'training' adds documents via ``ml.add_doc``; any other value
            adds them as queries via ``ml.add_query``.
        first_n_files: maximum number of documents to process.
        ml: classifier object (Rocchio, kNN, etc.).
        config: configuration object; accepted for compatibility, not read.
        prep: callable returning tokens that expose a ``text`` attribute.
        """
        # The context manager closes the file even when parsing fails.
        with open(fold, 'r') as fd:
            objs = json.load(fd)

        length = min(len(objs), first_n_files)
        self.printMessage("%s data length: %d" % (mode, length))

        for count, key in enumerate(objs, start=1):
            # Stop once exactly first_n_files documents have been handled;
            # the old check ran before the increment and let one extra through.
            if count > first_n_files:
                break
            if count % 100 == 0:
                self.printMessage(count)

            doc = objs[key]
            doc_id = doc['id']
            # Drop empty tokens produced by the analyzer.
            terms = [token.text for token in prep(doc['content'])
                     if token.text != ""]

            for class_name in doc['topics']:
                if mode == 'training':
                    ml.add_doc(doc_id=doc_id, doc_class=class_name,
                               doc_terms=terms)
                else:
                    ml.add_query(query_id=doc_id, query_class=class_name,
                                 query_terms=terms)

    def training(self, first_n_files=10000):
        """Build ``self.ev`` and ``self.ml`` and train on "sgms/TRAIN.jdb".

        first_n_files: forwarded to ``parse_files``.
        """
        self.ev = Evaluation()
        self.ml = Rocchio(verbose=VERBOSE, fold='n/a', config=self.config,
                          ev=self.ev)
        self.parse_files(fold="sgms/TRAIN.jdb", mode='training',
                         first_n_files=first_n_files, ml=self.ml,
                         config=self.config, prep=self.prep)
        self.ml.do_padding()
        self.ml.calculate_training_data()
        self.ml.diagnose()

    def testing(self, first_n_files=10000):
        """Add "sgms/TEST.jdb" documents as queries and compare them.

        Uses the classifier created by ``training``.

        first_n_files: forwarded to ``parse_files``.
        """
        self.parse_files(fold="sgms/TEST.jdb", mode='testing',
                         first_n_files=first_n_files, ml=self.ml,
                         config=self.config, prep=self.prep)
        self.ml.compare_queries()

    def result(self):
        """Report the evaluation outcome through ``printMessage``."""
        results = self.ev.calculate(
            review_spam=True, k=self.config.get_configuration()['k'])
        self.printMessage(results)