Example No. 1
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
import cleanBody  # project helper module (not shown here) that cleans raw article text

# documents (raw text), labels, dictionary (word -> index) and categories are
# assumed to have been built earlier in the script, outside this excerpt
for i in range(len(documents)):
    vector = [0] * len(dictionary)
    # iterate through all words in this document, split by space, and build the word vector
    for word in documents[i].strip().split(' '):
        if len(word) > 0:  # if this is a word (i.e. not an empty string), we add it to the vector
            vector[dictionary[word]] = 1  # the word is in this document, so we set it to 1; the
                                          # index comes from the word dictionary built above
    # update the current document we're processing with its new word vector
    documents[i] = vector

# Create Gaussian NB classifier and fit it to the data
gnb = GaussianNB()	# create the classifier object
gnbfit = gnb.fit(documents, labels)	# fit (/train) the classifier to the data

pred = gnbfit.predict(documents)  # classify the training documents again so we can
                                  # measure how well the classifier fits them

print(len(dictionary), "words in the word dictionary")
print(len(documents), "data points in the training set")
print("f1-score (weighted) of classifier on training set is:", f1_score(labels, pred, average='weighted'))

f = open('test.txt', 'r')
test = cleanBody.cleanBody(f.readline())
f.close()

vector = [0] * len(dictionary)
for word in test.strip().split(' '):
    if len(word) > 0 and word in dictionary:
        vector[dictionary[word]] = 1

# predict() expects a 2D array (one row per sample), so wrap the single vector in a list
prediction = gnbfit.predict([vector])[0]
print("Predicted as category: %s (id:%d)" % (categories[prediction], prediction))
Example No. 2
# category id list (truncated in this excerpt):
# basketball (3)
# nil (4)

import getPage    # project helper (not shown): fetches a page as (category, content)
import cleanBody  # project helper (not shown): strips an article down to clean text

# articles (URL list), data and count are assumed initialised earlier in the script
max_per_category = 100
c = 0

print("Number of articles:", len(articles))

# for each article in articles
for arturl in articles:
    # keep looping until every tracked category has reached max_per_category
    # articles; the nil category is excluded from this check
    print(count[2:4])  # debug: progress of the tracked category counters
    if min(count[2:4]) < max_per_category:
        c += 1
        print(c, "..", arturl)
        category, content = getPage.getPage(arturl)
        if category is False and content is False:  # no content on this page? count it under the nil category
            count[4] += 1
        else:
            if count[category - 1] < max_per_category:
                count[category - 1] += 1
                data.append("%d %s" % (category, cleanBody.cleanBody(content)))
    else:
        break

print(data)
print(count)

out = open("data.txt", "w", encoding="utf-8")
out.write("\n".join(data))
out.close()