Exemplo n.º 1
0
def prep(type):
    """Load document labels and per-document word probabilities into globals.

    Reads ``util.projdir + "/corpus/" + type + "-class"``; every non-blank
    line is expected to look like ``<prefix>-<docid> <label>``. For each
    listed document the word counts are then fetched with
    ``util.read_counts`` and normalised by the document's total count.

    Module-level state written:
        classifiers: rebuilt; maps docid -> label.
        probs: rebuilt; maps docid -> {lower-cased word: relative frequency}.
        doclens: updated in place; maps docid -> sum of the counts (float).
        truedocs: appended to; receives each docid whose label is "true",
            or every docid when ``type == "test"``.
        r_n: reset; number of documents labelled "satire".

    ``doclens`` and ``truedocs`` are only mutated here, never rebound, so
    they must already exist at module level before this is called.

    Args:
        type: name used to build the file paths; the value "test" is
            special-cased. It shadows the builtin, but the parameter name
            is kept so existing callers are unaffected.
    """
    global probs
    global classifiers
    global doclens
    global truedocs

    global r_n

    probs = {}
    classifiers = {}

    # read classifiers; the context manager closes the handle promptly
    with open(util.projdir + "/corpus/" + type + "-class", "r") as class_file:
        for line in class_file:
            if not line.strip():
                continue  # tolerate blank lines instead of raising IndexError
            fields = line.split(' ')
            classifiers[fields[0].split('-')[1]] = fields[1].split('\n')[0]

    in_doc = {}
    r_n = 0
    # read counts
    tmp_probs = {}
    # .items() behaves the same on Python 2 and 3 (iteritems is 2-only)
    for docid, label in classifiers.items():
        if label == "satire":
            r_n += 1
        if label == "true" or type == "test":
            truedocs.append(docid)

        # NOTE(review): unlike the class file above, this path is not
        # prefixed with util.projdir -- confirm what read_counts expects.
        counts = util.read_counts("/data/bag/" + type + "/" + type +
                                  "-" + docid)
        tmp_probs[docid] = counts
        doclen = float(sum(counts.values()))
        doclens[docid] = doclen

        # Normalise the raw counts. The count must come from `counts`: the
        # old code looked the word up in the still-empty probs[docid], so
        # every probability came out as 0.0. Case variants of a word share
        # one lower-cased key, so their shares are accumulated.
        doc_probs = {}
        for word, count in counts.items():
            key = word.lower()
            doc_probs[key] = doc_probs.get(key, 0.0) + count / doclen
        probs[docid] = doc_probs

        # NOTE(review): in_doc and tmp_probs are locals that nothing reads
        # after this loop -- candidates for removal.
        in_doc[docid] = dict.fromkeys(doc_probs, True)
Exemplo n.º 2
0
def prep(type):
	"""Load document labels and per-document word probabilities into globals.

	Reads ``util.projdir + "/corpus/" + type + "-class"``; every non-blank
	line is expected to look like ``<prefix>-<docid> <label>``. For each
	listed document the word counts are then fetched with
	``util.read_counts`` and normalised by the document's total count.

	Module-level state written:
		classifiers: rebuilt; maps docid -> label.
		probs: rebuilt; maps docid -> {lower-cased word: relative frequency}.
		doclens: updated in place; maps docid -> sum of the counts (float).
		truedocs: appended to; receives each docid whose label is "true",
			or every docid when ``type == "test"``.
		r_n: reset; number of documents labelled "satire".

	``doclens`` and ``truedocs`` are only mutated here, never rebound, so
	they must already exist at module level before this is called.

	Args:
		type: name used to build the file paths; the value "test" is
			special-cased. It shadows the builtin, but the parameter name
			is kept so existing callers are unaffected.
	"""
	global probs
	global classifiers
	global doclens
	global truedocs

	global r_n

	probs = {}
	classifiers = {}

	# read classifiers; the context manager closes the handle promptly
	with open(util.projdir + "/corpus/"+type+"-class", "r") as class_file:
		for line in class_file:
			if not line.strip():
				continue  # tolerate blank lines instead of raising IndexError
			fields = line.split(' ')
			classifiers[fields[0].split('-')[1]] = fields[1].split('\n')[0]

	in_doc = {}
	r_n = 0
	# read counts
	tmp_probs = {}
	# .items() behaves the same on Python 2 and 3 (iteritems is 2-only)
	for docid, label in classifiers.items():
		if label == "satire":
			r_n += 1
		if label == "true" or type == "test":
			truedocs.append(docid)

		# NOTE(review): unlike the class file above, this path is not
		# prefixed with util.projdir -- confirm what read_counts expects.
		counts = util.read_counts("/data/bag/"+type+"/"+type+"-"+docid)
		tmp_probs[docid] = counts
		doclen = float(sum(counts.values()))
		doclens[docid] = doclen

		# Normalise the raw counts. The count must come from `counts`: the
		# old code looked the word up in the still-empty probs[docid], so
		# every probability came out as 0.0. Case variants of a word share
		# one lower-cased key, so their shares are accumulated.
		doc_probs = {}
		for word, count in counts.items():
			key = word.lower()
			doc_probs[key] = doc_probs.get(key, 0.0) + count / doclen
		probs[docid] = doc_probs

		# NOTE(review): in_doc and tmp_probs are locals that nothing reads
		# after this loop -- candidates for removal.
		in_doc[docid] = dict.fromkeys(doc_probs, True)
Exemplo n.º 3
0
# grooms training data and produces initial probabilities
# keep this file in $proj

# document x^i for i=1,...,m is a sequence of words x_1^i,...,x_n^i
# y^i for i=1,...,m is a hidden boolean. y=true --> x^i is classified as satire
	
import util 

types = ["all", "satire", "true"]

counts = {}	# word counts, keyed by type
sum = {}	# total word count (NOTE(review): shadows the builtin sum() for the rest of this module)
prob = {}	# P(y)

# Load the training word counts for each type.
# Only `import util` is in scope, so projdir has to be qualified as
# util.projdir; the bare name raised NameError on the first iteration.
# NOTE(review): confirm that util.read_counts expects a path that already
# includes the project directory.
for type in types:
	counts[type] = util.read_counts(util.projdir + "/data/bag/training/count-"+type)
#	for w,c in counts[type].iteritems():
#		fout_word.write(w+' %f'%(c/sum[type])+'\n')

# P(y) forall y
#fin_type = open(projdir + "/data/bag/training/prob-type", "r")			# P(y) -- overall probability of satire categorization
#for line in fin_type:
#	sp = line.split(' ')
#	prob[sp[0]] = float(sp[1])
#for type in ["satire", "true", "all"]:
#	prob[type] = prob[type] / prob["all"]

# calculate P(y|x) forall words x in vocabulary
for type in ["satire", "true"]: # for y=true,false
	fout_doc = open(projdir + "/data/bag/training/prob-doc-"+type, "w")	# P(y|x) -- probability of satire categorization given word 
#	const = sum[type] * prob[type] / sum["all"]