def main():
    """Train a NaiveBayes classifier on a token dataset and dump, for each
    emotion id, the classifier's probability output for every sequence.

    Command-line options:
        -x/--dname_x     input directory holding one <eid>.pkl per emotion
        -s/--dname_xsup  output directory for per-emotion probability pickles
        -k/--value_k     smoothing constant for NaiveBayesClassifier.train
        -y/--ydim        number of emotion classes (defaults to N_EMO)
    """
    optparser = OptionParser()
    optparser.add_option('-x', '--dname_x', action='store', type='str', dest='dname_x')
    optparser.add_option('-s', '--dname_xsup', action='store', type='str', dest='dname_xsup')
    optparser.add_option('-k', '--value_k', dest='value_k', type='float', action='store', default=1.)
    optparser.add_option('-y', '--ydim', action='store', type='int', dest='ydim', default=N_EMO)
    opts, args = optparser.parse_args()

    print >> sys.stderr, 'nbdatica: [info] loading data for training NaiveBayes ... ',
    train, valid, test = datica.load_data(opts.dname_x, opts.ydim, valid_rate=0.)
    print >> sys.stderr, 'OK'

    print >> sys.stderr, 'nbdatica: [info] training NaiveBayes ... ',
    classifier = NaiveBayesClassifier()
    classifier.train(train[0], train[1], opts.value_k)
    print >> sys.stderr, 'OK'

    if not os.path.exists(opts.dname_xsup):
        os.mkdir(opts.dname_xsup)

    pbar = progbar.start(opts.ydim)
    for eid in range(opts.ydim):
        ifname = opts.dname_x + '%d.pkl' % (eid)
        # 'with' + binary mode: the original leaked the file objects
        # returned by open(), and pickles should be read/written as binary
        with open(ifname, 'rb') as fin:
            seqs = cPickle.load(fin)
        proba = [classifier.classify(seq) for seq in seqs]
        ofname = opts.dname_xsup + '%d.pkl' % (eid)
        with open(ofname, 'wb') as fout:
            cPickle.dump(proba, fout)
        pbar.update(eid + 1)
    pbar.finish()
def test_single_training_data(self):
    """A single (label, datum) training pair must classify deterministically.

    With one example, 'a' maps to 'A' with probability 1, so the log-space
    distribution holds exactly one entry whose value is 0.0 (= log 1).
    """
    classifier = NaiveBayesClassifier()
    classifier.train((('A', 'a'),))
    # assertEqual/assertIn replace the deprecated failUnless* aliases
    self.assertEqual(classifier.label('a'), 'A')
    distribution = classifier.label_distribution('a')
    self.assertEqual(len(distribution), 1)
    self.assertIn('A', distribution)
    self.assertEqual(distribution['A'], 0.0, distribution)
def prepare_above_naivebayes(dname_dataset, idname, odname, n_emo, k=1, ratio=0.9):
    """Train a NaiveBayes classifier on context tokens and export, per
    emotion id, the classifier's probability output for every sample.

    Params:
        dname_dataset: dataset folder name under data/blogs/
        idname: subfolder holding the input <eid>.pkl context files
        odname: subfolder to receive the output probability pickles
        n_emo:  number of emotion classes
        k:      smoothing constant for NaiveBayesClassifier.train
        ratio:  fraction of each class used for training (rest only scored)
    """
    train_x = []
    train_y = []
    dlist = []

    dir_dataset = 'data/blogs/%s/' % (dname_dataset)
    idir = dir_dataset + '%s/' % (idname)
    odir = dir_dataset + '%s/' % (odname)
    init_folders([odir, ])

    print >> sys.stderr, 'contextprocessor: [info] loading data'
    for eid in range(n_emo):
        xlist = []
        ifname = idir + '%d.pkl' % (eid)
        # 'with' + binary mode: the original leaked the open() handle
        with open(ifname, 'rb') as fin:
            contextu = cPickle.load(fin)
        n_train = int(len(contextu) * ratio)
        for i, comms in enumerate(contextu):
            # flatten the token sequences of all comments into one sample
            tokens = []
            for ts, emos in comms:
                tokens.extend(ts)
            xlist.append(tokens)
            if i < n_train:
                train_x.append(tokens)
                train_y.append(eid)
        dlist.append(xlist)
        print >> sys.stderr, '\t%s OK' % (ifname)

    print >> sys.stderr, 'contextprocessor: [info] training naive bayes classifier'
    classifier = NaiveBayesClassifier()
    classifier.train(train_x, train_y, k)

    print >> sys.stderr, 'contextprocessor: [info] exporting naive bayes result'
    for eid, xlist in enumerate(dlist):
        probs = [classifier.classify(tokens) for tokens in xlist]
        ofname = odir + '%d.pkl' % (eid)
        with open(ofname, 'wb') as fout:
            cPickle.dump(probs, fout)
        # report success only after the dump has actually completed
        # (the original printed OK before writing the file)
        print >> sys.stderr, '\t%s OK' % (ofname)
def test_single_class_mixed_training_data(self):
    """With labels A, A, B for the same datum, the majority label A wins and
    the distribution matches the hand-computed normalized log values."""
    classifier = NaiveBayesClassifier()
    classifier.train((('A', 'a'), ('A', 'a'), ('B', 'a')))
    # assertEqual/assertIn replace the deprecated failUnless* aliases
    self.assertEqual(classifier.label('a'), 'A')
    distribution = classifier.label_distribution('a')
    self.assertEqual(len(distribution), 2)
    self.assertIn('A', distribution)
    # expected values: prior * likelihood**2, i.e. (count/3)**3 per label,
    # then normalized and moved to log space like the classifier does
    correct_distribution = Counter()
    correct_distribution['A'] = (2.0 / 3.0)**3
    correct_distribution['B'] = (1.0 / 3.0)**3
    correct_distribution.normalize()
    correct_distribution.log()
    self.assertAlmostEqual(distribution['A'], correct_distribution['A'])
    self.assertAlmostEqual(distribution['B'], correct_distribution['B'])
def main():
    """Train a NaiveBayes baseline on train+valid, predict on test, and
    dump/report the predictions under data/dataset/test/.

    Command-line options:
        -p/--prefix     file-name prefix for the dumped results
        -x/--dir_x      directory holding the input dataset
        -y/--ydim       number of classes
        -k/--value_k    smoothing constant for NaiveBayesClassifier.train
        -n/--n_samples  optional cap on the number of samples (debug)
    """
    optparser = OptionParser()
    # necessary
    optparser.add_option('-p', '--prefix', action='store', type='str', dest='prefix')
    optparser.add_option('-x', '--dir_x', action='store', type='str', dest='dir_x')
    optparser.add_option('-y', '--ydim', action='store', type='int', dest='ydim')
    optparser.add_option('-k', '--value_k', dest='value_k', type='float', action='store', default=1.)
    # debug
    # type='int' added: the value is a sample count; without it optparse
    # would hand datica.load_data a *string* whenever -n is given
    optparser.add_option('-n', '--n_samples', action='store', type='int', dest='n_samples', default=None)
    opts, args = optparser.parse_args()

    #################### Preparation of Input ##############
    print >> sys.stderr, 'lstmscript.run: [info] loading dataset ... ',
    n_emo = opts.ydim
    datalen = opts.n_samples
    dataset = datica.load_data(opts.dir_x, opts.ydim, datalen)
    print >> sys.stderr, 'Done'

    def merge_train_valid(dataset):
        # fold the validation split into the training split
        train, valid, test = dataset
        tx, ty = train
        vx, vy = valid
        tx.extend(vx)
        ty.extend(vy)
        return (tx, ty), test

    dataset = merge_train_valid(dataset)
    train, test = dataset

    classifier = NaiveBayesClassifier()
    classifier.train(train[0], train[1], opts.value_k)
    preds = [classifier.classify(x) for x in test[0]]

    fname_test = 'data/dataset/test/%s_test.pkl' % (opts.prefix)
    fname_valid = 'data/dataset/test/%s' % (opts.prefix)
    # 'with' + binary mode: the original leaked the open() handle
    with open(fname_test, 'wb') as fout:
        cPickle.dump((test[1], preds), fout)
    validatica.report(test[1], preds, fname_valid)
def main(): optparser = OptionParser() # necessary optparser.add_option('-p', '--prefix', action='store', type = 'str', dest='prefix') optparser.add_option('-k', '--value_k', dest='value_k', type='float', action = 'store', default = 1.) optparser.add_option('-u', '--unigram', action='store_true', dest='unigram', default = False) optparser.add_option('-d', '--deduplicate', dest='flag_deduplicate', action = 'store_true', default = False) # debug optparser.add_option('-y', '--ydim', action='store', type='int', dest='ydim', default = N_EMO) optparser.add_option('-n', '--n_samples', action='store', dest='n_samples', default = None) opts, args = optparser.parse_args() if opts.unigram: dataset = datica.load_unigram(opts.ydim, opts.n_samples) else: dataset = datica.load_token(opts.ydim, opts.n_samples) def merge_train_valid(dataset): train, valid, test = dataset tx, ty = train vx, vy = valid tx.extend(vx) ty.extend(vy) return (tx, ty), test dataset = merge_train_valid(dataset) train, test = dataset classifier = NaiveBayesClassifier() classifier.train(train[0], train[1], opts.value_k, opts.flag_deduplicate) preds = [classifier.classify(x) for x in train[0]] prec = validatica.precision_at_n(train[1], preds) print prec
def test_single_training_data(self):
    """A single (label, datum) training pair must classify deterministically.

    With one example, 'a' maps to 'A' with probability 1, so the log-space
    distribution holds exactly one entry whose value is 0.0 (= log 1).
    """
    classifier = NaiveBayesClassifier()
    classifier.train((('A', 'a'), ))
    # assertEqual/assertIn replace the deprecated failUnless* aliases
    self.assertEqual(classifier.label('a'), 'A')
    distribution = classifier.label_distribution('a')
    self.assertEqual(len(distribution), 1)
    self.assertIn('A', distribution)
    self.assertEqual(distribution['A'], 0.0, distribution)
def find_best_rules(self):
    '''
    Will use the train_list of this class instance to create a temporary
    classifier that is used to find the most informative features of the
    training set. This feature list is stored in self.best_feature_list,
    and "iterated" with self.current_best_feature.
    '''
    # Build an NLTK-style training set: (feature_dict, label) pairs,
    # where word_features (project helper) extracts features per word.
    train_set = [(word_features(word), outcome) for (word, outcome) in self.train_list]
    classifier = NaiveBayesClassifier.train(train_set)
    # NOTE(review): in most NLTK releases show_most_informative_features()
    # PRINTS the table and returns None, which would make this list
    # comprehension raise; most_informative_features(10000) may be what was
    # intended -- verify against the NLTK version actually in use.
    sorted_feature_list = [i for i in classifier.show_most_informative_features(10000)]
    #Basically, sort features according to probability.
    # (feature[1] is the second element of each feature entry; presumably a
    # score/value -- TODO confirm the tuple layout returned by the classifier)
    sorted_feature_list.sort(key=lambda feature: feature[1], reverse=True)
    self.best_feature_list = [i[0] for i in sorted_feature_list]
    #create letter intersection set to find common letters in the train list
    # NOTE(review): if self.letter_set starts out empty, every intersection
    # below stays empty -- confirm it is seeded elsewhere before this runs.
    for i in self.train_words:
        self.letter_set = set(i).intersection(self.letter_set)
def test_single_class_mixed_training_data(self):
    """With labels A, A, B for the same datum, the majority label A wins and
    the distribution matches the hand-computed normalized log values."""
    classifier = NaiveBayesClassifier()
    classifier.train((('A', 'a'), ('A', 'a'), ('B', 'a')))
    # assertEqual/assertIn replace the deprecated failUnless* aliases
    self.assertEqual(classifier.label('a'), 'A')
    distribution = classifier.label_distribution('a')
    self.assertEqual(len(distribution), 2)
    self.assertIn('A', distribution)
    # expected values: prior * likelihood**2, i.e. (count/3)**3 per label,
    # then normalized and moved to log space like the classifier does
    correct_distribution = Counter()
    correct_distribution['A'] = (2.0 / 3.0)**3
    correct_distribution['B'] = (1.0 / 3.0)**3
    correct_distribution.normalize()
    correct_distribution.log()
    self.assertAlmostEqual(distribution['A'], correct_distribution['A'])
    self.assertAlmostEqual(distribution['B'], correct_distribution['B'])
from naivebayes import NaiveBayesClassifier import os import re import codecs from segmentor import Segmentor def corpus_generator(segmentor): for corpus in map(lambda x: "sentiment_corpus/" + x, ["Ctrip_htl_ba_4000", "Dangdang_Book_4000", "Jingdong_NB_4000"]): classes = filter(lambda x: x[0] != ".", os.listdir(corpus)) for cls in classes: print "Enumerating for '%s/%s' reviews." % (corpus, cls) cls_dir = os.path.join(corpus, cls) files = filter(lambda x: x.endswith(".txt"), os.listdir(cls_dir)) for filename in files: with codecs.open(os.path.join(cls_dir, filename), "r", encoding="utf8") as file: for line in file: if not line.strip(): continue words = segmentor(line.strip()) yield (cls, words) segmentor = Segmentor() generator = corpus_generator(segmentor) classifier = NaiveBayesClassifier() classifier.train(generator) print classifier.classify(segmentor(u"这一地区生鲜奶收购价持续在低位徘徊,导致很多奶户入不敷出,被迫“砍牛”(杀牛或卖牛)。 近期,双鸭山市多地奶农联名向记者反映")) # print classifier.classify("This is awesome but still I don't like it thisisaweirdwordneveroccurs. ".split(" ")) # print classifier.classify("iqbvajkkjbarjta".split(" ")) # print classifier.classify("I don't recommend.".split(" "))
from naivebayes import NaiveBayesClassifier import os import re def review_generator(dir): classes = os.listdir(dir) for cls in classes: print "Enumerating for '%s' reviews." % cls cls_dir = os.path.join(dir, cls) files = filter(lambda x: x.endswith(".txt"), os.listdir(cls_dir)) for filename in files: with open(os.path.join(cls_dir, filename), "r") as file: for line in file: words = line.split() words = filter(lambda x: re.match(r'^\w{3,}$', x), words) yield (cls, words) generator = review_generator("txt_sentoken") classifier = NaiveBayesClassifier() classifier.train(generator) print classifier.classify( "This is awesome but still I don't like it thisisaweirdwordneveroccurs. ". split(" ")) print classifier.classify("iqbvajkkjbarjta".split(" ")) print classifier.classify("".split(" "))
from naivebayes import NaiveBayesClassifier import os import re def review_generator(dir): classes = os.listdir(dir) for cls in classes: print "Enumerating for '%s' reviews." % cls cls_dir = os.path.join(dir, cls) files = filter(lambda x: x.endswith(".txt"), os.listdir(cls_dir)) for filename in files: with open(os.path.join(cls_dir, filename), "r") as file: for line in file: words = line.split() words = filter(lambda x: re.match(r'^\w{3,}$', x), words) yield (cls, words) generator = review_generator("txt_sentoken") classifier = NaiveBayesClassifier() classifier.train(generator) print classifier.classify("This is awesome but still I don't like it thisisaweirdwordneveroccurs. ".split(" ")) print classifier.classify("iqbvajkkjbarjta".split(" ")) print classifier.classify("".split(" "))