-
Notifications
You must be signed in to change notification settings - Fork 0
/
classifier.py
116 lines (100 loc) · 4.17 KB
/
classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from bs4 import BeautifulSoup
from apps.models import Apps
from xml.sax.saxutils import escape
import elementtree.ElementTree as ET
import requests
import base64
# uClassify API keys. writeKey is sent as the `writeApiKey` attribute by
# train(); readKey is sent as the `readApiKey` attribute by getInfo() and read().
# NOTE(review): the '****' values look like redacted placeholders -- presumably
# real keys must be supplied before any request will succeed; confirm.
writeKey = '****'
readKey = '****'
# def create_classifier(classifier, class_id):
# writeCalls = ET.SubElement(root, 'writeCalls', writeApiKey=writeKey, classifierName=classifier)
# create = ET.SubElement(writeCalls, 'create', id=class_id)
# return root
def getInfo():
    """Build the request tree that asks uClassify about the 'subjects' classifier.

    Returns the root <uclassify> element containing a single <readCalls>
    section (authenticated with the module-level readKey) that holds one
    <getInformation> call. Nothing is sent; pass the result to post().
    """
    request = ET.Element(
        'uclassify',
        xmlns='http://api.uclassify.com/1/RequestSchema',
        version='1.01')
    read_calls = ET.SubElement(request, 'readCalls', readApiKey=readKey)
    # The child is attached to read_calls on creation; no handle is needed.
    ET.SubElement(
        read_calls,
        'getInformation',
        id='GetInformation',
        classifierName='subjects')
    return request
def train(text, className, new=0):
    """Build the request tree that trains the 'subjects' classifier on a text.

    text      -- the sample text; it is UTF-8 encoded and base64-wrapped into
                 a <textBase64 id="text1"> element.
    className -- the class the text is an example of.
    new       -- when equal to 1, an <addClass> call for className is emitted
                 ahead of the <train> call so the class is created first.

    Returns the root <uclassify> element. Nothing is sent; pass the result
    to post().
    """
    request = ET.Element(
        'uclassify',
        xmlns='http://api.uclassify.com/1/RequestSchema',
        version='1.01')

    text_node = ET.SubElement(
        ET.SubElement(request, 'texts'), 'textBase64', id='text1')
    text_node.text = base64.b64encode(text.encode('UTF-8'))

    write_calls = ET.SubElement(
        request, 'writeCalls', writeApiKey=writeKey, classifierName='subjects')
    if new == 1:
        # Must precede the <train> element: the class has to exist first.
        ET.SubElement(write_calls, 'addClass', id='add1', className=className)
    ET.SubElement(
        write_calls, 'train', id='train1', className=className, textId='text1')
    return request
def read(text):
    """Build the request tree that classifies a text with the 'subjects' classifier.

    text -- the text to classify; it is UTF-8 encoded and base64-wrapped into
            a <textBase64 id="text1"> element that the <classify> call refers
            to through textId.

    Returns the root <uclassify> element. Nothing is sent; pass the result
    to post().
    """
    request = ET.Element(
        'uclassify',
        xmlns='http://api.uclassify.com/1/RequestSchema',
        version='1.01')

    text_node = ET.SubElement(
        ET.SubElement(request, 'texts'), 'textBase64', id='text1')
    text_node.text = base64.b64encode(text.encode('UTF-8'))

    read_calls = ET.SubElement(request, 'readCalls', readApiKey=readKey)
    ET.SubElement(
        read_calls,
        'classify',
        id='classify1',
        classifierName='subjects',
        textId='text1')
    return request
def post(root, timeout=30):
    """Send a request tree to the uClassify API and return the raw reply body.

    root    -- root element of the request, as built by getInfo(), train()
               or read(); it is serialized as UTF-8 XML.
    timeout -- seconds to wait for the server before giving up. Without a
               timeout, requests waits forever on an unresponsive server.

    Returns the response body (``response.content``). The HTTP status is
    deliberately not checked, so error replies are returned as-is for the
    caller to inspect.

    Raises requests.exceptions.Timeout if the server does not answer within
    `timeout` seconds.
    """
    payload = ET.tostring(root, encoding='UTF-8')
    # NOTE(review): the API keys travel inside this payload over plain HTTP;
    # confirm whether the service offers an HTTPS endpoint.
    response = requests.post(
        'http://api.uclassify.com',
        data=payload,
        headers={'Content-Type': 'text/xml'},
        timeout=timeout)
    return response.content
def classify_subject(text):
    """Classify `text` with uClassify and let the user confirm the subject.

    The text is sent to the classifier, the returned (subject, probability)
    candidates are listed, and the user is prompted until a usable answer is
    given:

      * a number from the list -- the classifier is trained with that subject
        and the subject is returned;
      * 0  -- the user types a new subject name, a new class is created and
        trained with the text, and the name is returned;
      * -1 -- nothing is trained and '' is returned.

    Any other answer prints a hint and asks again. An empty candidate list
    (for example before any class has been added) is not an error: the user
    can still create a class with 0.
    """
    # Pin the parser so results do not depend on which optional parsers are
    # installed. HTML parsing lower-cases attribute names, which is why the
    # attribute is read as 'classname' below.
    soup = BeautifulSoup(post(read(text)), 'html.parser')
    # One (subject, probability) tuple per <class> element in the reply.
    probs = [(result.get('classname'), float(result.get('p')))
             for result in soup.find_all('class')]

    # NOTE: automatic acceptance of the top candidate (p >= 0.7) is disabled;
    # every classification is confirmed interactively.
    print("Do any of these subjects match that of the app?")
    for i, (subject, probability) in enumerate(probs, 1):
        print("%d. Subject: %s; Probability of match: %f." % (i, subject, probability))

    while True:
        try:
            input_choice = int(raw_input("Type the number you choose. If you want to create a new class, enter 0; '-1' if you don't know/want to exit: "))
        except ValueError:
            print("That is not a valid number. Try again.")
            continue

        if input_choice == -1:
            return ''
        if input_choice == 0:
            input_subject = raw_input("Type the name of the subject this app falls under: ")
            post(train(text, input_subject, new=1))  # create the class, then train it
            return input_subject
        if input_choice < -1:
            print("Not a valid choice.")
            continue
        if input_choice > len(probs):
            print("Try another number.")
            continue

        chosen = probs[input_choice - 1][0]
        post(train(text, chosen))  # reinforce the existing class with this text
        print(chosen)
        return chosen
def choice(probs):
    """Placeholder: accepts the candidate list, does nothing, returns None."""
    return None
def main():
    """Interactively re-classify every app that still has a generic subject.

    Selects the Apps rows whose subject is one of the generic values below
    (including the empty string). For each one that has a description, the
    name and description are printed, classify_subject() is run on the
    description, and every row sharing the app's link is updated with the
    chosen subject.

    When classify_subject() returns '' (the user declined to pick a subject)
    the stored subject is left untouched instead of being overwritten with
    an empty value and reported as a success.
    """
    generic_subjects = ['Books', 'Reference', 'Education', 'Business', 'Productivity', 'Medical', '']
    for app in Apps.objects.filter(subject__in=generic_subjects):
        if not app.description:
            continue  # nothing to classify
        print(app.name)
        print(app.description)
        subject = classify_subject(app.description)
        if not subject:
            print("The app '%s' was left unclassified." % app.name)
            continue
        Apps.objects.filter(link=app.link).update(subject=subject)
        print("The app '%s' was successfully classified as '%s'!" % (app.name, subject))
#main()
# def process():
# # pseudocode
# do:
# readCalls()
# if p < 0.7:
# pick a subject or add a new one
# else:
# return the subject
# if thing[1] >= 0.7:
# return thing[0]
# else:
# return ''
# train(thing[0])
# if "Couldn't find any classifier with the name" in q:
# if "Trying to classify a text before any classes have been added!" in q: