Exemplo n.º 1
0
def main():
	"""Train a NaiveBayes classifier on the pickled token sequences under
	--dname_x and write one probability pickle per emotion class into
	--dname_xsup (created if missing).
	"""
	optparser = OptionParser()

	# input directory holding one <emotion-id>.pkl file per class
	optparser.add_option('-x', '--dname_x', action='store', type = 'str', dest='dname_x')
	# output directory for the per-class probability pickles
	optparser.add_option('-s', '--dname_xsup', action='store', type = 'str', dest='dname_xsup')
	# Laplace smoothing constant forwarded to the classifier
	optparser.add_option('-k', '--value_k', dest='value_k', type='float', action = 'store', default = 1.)
	# number of emotion classes
	optparser.add_option('-y', '--ydim', action='store', type='int', dest='ydim', default = N_EMO)

	opts, args = optparser.parse_args()

	print >> sys.stderr, 'nbdatica: [info] loading data for training NaiveBayes ... ',
	train, valid, test = datica.load_data(opts.dname_x, opts.ydim, valid_rate = 0.)
	print >> sys.stderr, 'OK'

	print >> sys.stderr, 'nbdatica: [info] training NaiveBayes ... ',
	classifier = NaiveBayesClassifier()
	classifier.train(train[0], train[1], opts.value_k)
	print >> sys.stderr, 'OK'

	if not os.path.exists(opts.dname_xsup):
		os.mkdir(opts.dname_xsup)

	pbar = progbar.start(opts.ydim)
	for eid in range(opts.ydim):
		ifname = opts.dname_x + '%d.pkl'%(eid)
		# context manager + binary mode: the original passed open(..., 'r')
		# straight to cPickle, leaking the handle and using text mode
		with open(ifname, 'rb') as ifobj:
			seqs = cPickle.load(ifobj)

		ofname = opts.dname_xsup + '%d.pkl'%(eid)
		proba = [classifier.classify(seq) for seq in seqs]

		with open(ofname, 'wb') as ofobj:
			cPickle.dump(proba, ofobj)
		pbar.update(eid + 1)
	pbar.finish()
Exemplo n.º 2
0
def prepare_above_naivebayes(dname_dataset, idname, odname, n_emo, k = 1, ratio = 0.9):
	"""Train NaiveBayes on the leading `ratio` fraction of each emotion's
	context sequences and export class probabilities for every sequence.

	dname_dataset: dataset folder name under data/blogs/
	idname: input sub-folder with one <emotion-id>.pkl per class
	odname: output sub-folder for the per-class probability pickles
	n_emo: number of emotion classes
	k: Laplace smoothing constant forwarded to the classifier
	ratio: fraction of each class used for training (the rest is only scored)
	"""
	train_x = []
	train_y = []
	dlist = []

	dir_dataset = 'data/blogs/%s/'%(dname_dataset)

	idir = dir_dataset + '%s/'%(idname)
	odir = dir_dataset + '%s/'%(odname)

	init_folders([odir, ])

	print >> sys.stderr, 'contextprocessor: [info] loading data'
	for eid in range(n_emo):
		xlist = []

		ifname = idir + '%d.pkl'%(eid)
		# context manager + binary mode: the original passed open(..., 'r')
		# straight to cPickle, leaking the handle and using text mode
		with open(ifname, 'rb') as ifobj:
			contextu = cPickle.load(ifobj)

		n_train = int(len(contextu) * ratio)

		for i, comms in enumerate(contextu):
			# flatten all token sequences of one sample into a single list
			tokens = []
			for ts, emos in comms:
				tokens.extend(ts)

			xlist.append(tokens)

			# only the leading fraction of each class enters the training set
			if i < n_train:
				train_x.append(tokens)
				train_y.append(eid)

		dlist.append(xlist)

		print >> sys.stderr, '\t%s OK'%(ifname)

	print >> sys.stderr, 'contextprocessor: [info] training naive bayes classifier'
	classifier = NaiveBayesClassifier()
	classifier.train(train_x, train_y, k)

	print >> sys.stderr, 'contextprocessor: [info] exporting naive bayes result'
	for eid, xlist in enumerate(dlist):
		probs = [classifier.classify(tokens) for tokens in xlist]

		ofname = odir + '%d.pkl'%(eid)
		print >> sys.stderr, '\t%s OK'%(ofname)
		with open(ofname, 'wb') as ofobj:
			cPickle.dump(probs, ofobj)
Exemplo n.º 3
0
def main():
	"""Train NaiveBayes on the dataset under --dir_x, classify the test
	split and dump/report the predictions under data/dataset/test/.
	"""
	optparser = OptionParser()

	# necessary
	optparser.add_option('-p', '--prefix', action='store', type = 'str', dest='prefix')
	optparser.add_option('-x', '--dir_x', action='store', type = 'str', dest='dir_x')
	optparser.add_option('-y', '--ydim', action='store', type='int', dest='ydim')

	# Laplace smoothing constant for the classifier
	optparser.add_option('-k', '--value_k', dest='value_k', type='float', action = 'store', default = 1.)

	# debug
	# type='int' so a user-supplied sample count is not forwarded as a string
	optparser.add_option('-n', '--n_samples', action='store', type='int', dest='n_samples', default = None)

	opts, args = optparser.parse_args()

	#################### Preparation of Input ##############
	print >> sys.stderr, 'lstmscript.run: [info] loading dataset ... ',

	n_emo = opts.ydim
	datalen = opts.n_samples
	dataset = datica.load_data(opts.dir_x, opts.ydim, datalen)

	print >> sys.stderr, 'Done'

	def merge_train_valid(dataset):
		# fold the validation split into the training split
		train, valid, test = dataset
		tx, ty = train
		vx, vy = valid
		tx.extend(vx)
		ty.extend(vy)
		return (tx, ty), test

	dataset = merge_train_valid(dataset)
	train, test = dataset

	classifier = NaiveBayesClassifier()
	classifier.train(train[0], train[1], opts.value_k)
	preds = [classifier.classify(x) for x in test[0]]

	fname_test = 'data/dataset/test/%s_test.pkl'%(opts.prefix)
	fname_valid = 'data/dataset/test/%s'%(opts.prefix)

	# context manager + binary mode: the original leaked the file handle
	with open(fname_test, 'wb') as fobj:
		cPickle.dump((test[1], preds), fobj)
	validatica.report(test[1], preds, fname_valid)
Exemplo n.º 4
0
def main():
	optparser = OptionParser()

	# necessary
	optparser.add_option('-p', '--prefix', action='store', type = 'str', dest='prefix')
	optparser.add_option('-k', '--value_k', dest='value_k', type='float', action = 'store', default = 1.)
	optparser.add_option('-u', '--unigram', action='store_true', dest='unigram', default = False)
	optparser.add_option('-d', '--deduplicate', dest='flag_deduplicate', action = 'store_true', default = False)

	# debug
	optparser.add_option('-y', '--ydim', action='store', type='int', dest='ydim', default = N_EMO)
	optparser.add_option('-n', '--n_samples', action='store', dest='n_samples', default = None)

	opts, args = optparser.parse_args()

	if opts.unigram:
		dataset = datica.load_unigram(opts.ydim, opts.n_samples)
	else:
		dataset = datica.load_token(opts.ydim, opts.n_samples)

	def merge_train_valid(dataset):
		train, valid, test = dataset
		tx, ty = train
		vx, vy = valid
		tx.extend(vx)
		ty.extend(vy)
		return (tx, ty), test

	dataset = merge_train_valid(dataset)
	train, test = dataset

	classifier = NaiveBayesClassifier()
	classifier.train(train[0], train[1], opts.value_k, opts.flag_deduplicate)
	
	preds = [classifier.classify(x) for x in train[0]]
	prec = validatica.precision_at_n(train[1], preds)
	print prec
Exemplo n.º 5
0
from naivebayes import NaiveBayesClassifier
import os
import re
import codecs
from segmentor import Segmentor

def corpus_generator(segmentor):
    """Stream (class-label, word-list) pairs from the three hard-coded
    sentiment corpora under sentiment_corpus/.

    segmentor: callable mapping one stripped text line to a word list.
    Blank lines are skipped; hidden entries (names starting with '.')
    under each corpus directory are ignored.
    """
    corpus_dirs = ["sentiment_corpus/" + name
                   for name in ["Ctrip_htl_ba_4000", "Dangdang_Book_4000", "Jingdong_NB_4000"]]
    for corpus in corpus_dirs:
        # ignore hidden entries such as .DS_Store
        classes = [entry for entry in os.listdir(corpus) if entry[0] != "."]
        for cls in classes:
            print("Enumerating for '%s/%s' reviews." % (corpus, cls))
            cls_dir = os.path.join(corpus, cls)
            txt_files = [name for name in os.listdir(cls_dir) if name.endswith(".txt")]
            for filename in txt_files:
                # 'fobj' instead of 'file': the original shadowed the builtin
                with codecs.open(os.path.join(cls_dir, filename), "r", encoding="utf8") as fobj:
                    for line in fobj:
                        text = line.strip()  # strip once, not twice per line
                        if not text:
                            continue
                        yield (cls, segmentor(text))

# Wire the pipeline together: the Chinese word segmentor feeds the corpus
# generator, whose (label, words) pairs train the NaiveBayes classifier.
segmentor = Segmentor()
generator = corpus_generator(segmentor)
classifier = NaiveBayesClassifier()
classifier.train(generator)

# Segment and classify one sample Chinese news sentence and print the
# classification result.
print classifier.classify(segmentor(u"这一地区生鲜奶收购价持续在低位徘徊,导致很多奶户入不敷出,被迫“砍牛”(杀牛或卖牛)。 近期,双鸭山市多地奶农联名向记者反映"))

# Earlier English smoke tests kept for reference:
# print classifier.classify("This is awesome but still I don't like it thisisaweirdwordneveroccurs. ".split(" "))
# print classifier.classify("iqbvajkkjbarjta".split(" "))
# print classifier.classify("I don't recommend.".split(" "))
Exemplo n.º 6
0
from naivebayes import NaiveBayesClassifier
import os
import re


def review_generator(dir):
    """Yield one (class-label, word-list) pair per line of every .txt
    review file under dir/<class>/.

    Only tokens made of three or more word characters are kept; shorter
    or punctuated tokens are dropped.
    NOTE: the parameter name 'dir' shadows the builtin but is kept for
    interface compatibility with existing callers.
    """
    # hoist the token pattern out of the per-word loop
    token_re = re.compile(r'^\w{3,}$')
    for cls in os.listdir(dir):
        cls_dir = os.path.join(dir, cls)
        # skip stray non-directory entries (e.g. a README) that would
        # make the os.listdir() below raise
        if not os.path.isdir(cls_dir):
            continue
        print("Enumerating for '%s' reviews." % cls)
        files = [name for name in os.listdir(cls_dir) if name.endswith(".txt")]
        for filename in files:
            # 'fobj' instead of 'file': the original shadowed the builtin
            with open(os.path.join(cls_dir, filename), "r") as fobj:
                for line in fobj:
                    words = [w for w in line.split() if token_re.match(w)]
                    yield (cls, words)


# Train the classifier by streaming (label, words) pairs from the
# txt_sentoken review corpus (one sub-directory per class label).
generator = review_generator("txt_sentoken")
classifier = NaiveBayesClassifier()
classifier.train(generator)

# Smoke tests: a sentence containing an unseen token, a single unseen
# token, and an empty string ([''] after split) — exercises the
# classifier on out-of-vocabulary input.
print classifier.classify(
    "This is awesome but still I don't like it thisisaweirdwordneveroccurs. ".
    split(" "))
print classifier.classify("iqbvajkkjbarjta".split(" "))
print classifier.classify("".split(" "))
Exemplo n.º 7
0
from naivebayes import NaiveBayesClassifier
import os
import re

def review_generator(dir):
    """Generate (class-label, token-list) pairs from the review corpus
    rooted at *dir*, one pair per text line of each <class>/*.txt file.

    Tokens shorter than three word characters, or containing non-word
    characters, are filtered out.
    NOTE: the parameter name 'dir' shadows the builtin but is kept for
    interface compatibility with existing callers.
    """
    word_pattern = re.compile(r'^\w{3,}$')  # compiled once, used per token
    for label in os.listdir(dir):
        label_dir = os.path.join(dir, label)
        # ignore stray plain files at the top level, which would make the
        # os.listdir() below raise in the original
        if not os.path.isdir(label_dir):
            continue
        print("Enumerating for '%s' reviews." % label)
        for fname in os.listdir(label_dir):
            if not fname.endswith(".txt"):
                continue
            # 'reviews' instead of 'file': the original shadowed the builtin
            with open(os.path.join(label_dir, fname), "r") as reviews:
                for line in reviews:
                    yield (label, [tok for tok in line.split() if word_pattern.match(tok)])

# Train the classifier by streaming (label, words) pairs from the
# txt_sentoken review corpus (one sub-directory per class label).
generator = review_generator("txt_sentoken")
classifier = NaiveBayesClassifier()
classifier.train(generator)

# Smoke tests: a sentence with an unseen token, a single unseen token,
# and an empty string ([''] after split) — exercises out-of-vocabulary
# handling.
print classifier.classify("This is awesome but still I don't like it thisisaweirdwordneveroccurs. ".split(" "))
print classifier.classify("iqbvajkkjbarjta".split(" "))
print classifier.classify("".split(" "))