def setUp(self):
    self.srv = TestUtil.fork_process("classifier", port)
    self.cli = classifier(host, port)
    method = "AROW"
    self.converter = '''{
        "string_filter_types": {},
        "string_filter_rules": [],
        "num_filter_types": {},
        "num_filter_rules": [],
        "string_types": {},
        "string_rules": [
            {"key": "*", "type": "space",
             "sample_weight": "bin", "global_weight": "bin"}
        ],
        "num_types": {},
        "num_rules": [
            {"key": "*", "type": "num"}
        ]
    }'''
    cd = config_data(method, self.converter)
    self.cli.set_config("name", cd)
class TweetAnalyzer(StreamListener):
    classifier = client.classifier(host, port)

    def __init__(self, highlight):
        super(TweetAnalyzer, self).__init__()
        self.highlight = highlight

    def on_status(self, status):
        if not hasattr(status, 'text'):
            return

        d = types.datum([], [])
        d.string_values = [
            ['text', status.text],
        ]

        result = self.classifier.classify(instance_name, [d])

        if len(result) > 0 and len(result[0]) > 0:
            # sort the result in order of score
            est = sorted(result[0], key=lambda est: est.score, reverse=True)

            print_green(est[0].label, end=" ")
            if est[0].label == self.highlight:
                print_red(status.text)
            else:
                print(status.text)
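# NOTE: print_green() and print_red() above are helpers not defined in this
# snippet. A minimal sketch, assuming plain ANSI color escape codes; the
# names and calling convention come from the calls above, the bodies are
# assumptions.
from __future__ import print_function

def print_green(text, end="\n"):
    # ANSI escape: 32 = green, 0 = reset
    print("\033[32m" + str(text) + "\033[0m", end=end)

def print_red(text, end="\n"):
    # ANSI escape: 31 = red, 0 = reset
    print("\033[31m" + str(text) + "\033[0m", end=end)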
def main():
    args = parse_options()
    client = classifier('127.0.0.1', 9199)

    # train
    num = 0
    if args.traindata:
        with open(args.traindata, 'rU') as traindata:
            for data in traindata:
                # skip empty lines and comments
                if not len(data) or data.startswith('#'):
                    continue
                num += 1
                season, avetemp, maxtemp, mintemp, pressure, humidity = \
                    map(str.strip, data.strip().split(','))
                num_values = [
                    ['avetemp', float(avetemp)],
                    ['maxtemp', float(maxtemp)],
                    ['mintemp', float(mintemp)],
                    ['pressure', float(pressure)],
                    ['humidity', float(humidity)]
                ]
                d = datum([], num_values)
                train_data = [[season, d]]
                # train
                client.train('', train_data)

        # print number of trained examples
        print 'train ...', num

        # save the trained model
        print "save :", client.save('', "weather")

    # analyze
    with open(args.analyzedata, 'r') as analyzedata:
        weather = yaml.load(analyzedata)
        for k, v in weather.iteritems():
            print str(k), "(", str(v['season']), ")"
            num_values = [
                ['avetemp', float(v['avetemp'])],
                ['maxtemp', float(v['maxtemp'])],
                ['mintemp', float(v['mintemp'])],
                ['pressure', float(v['pressure'])],
                ['humidity', float(v['humidity'])]
            ]
            d = datum([], num_values)
            analyze_data = [d]
            results = client.classify('', analyze_data)
            results[0].sort(key=lambda x: x.score, reverse=True)
            for result in results:
                # print the top five labels (or fewer if less were returned)
                for i in range(min(5, len(result))):
                    print result[i].label, result[i].score
                print
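# NOTE: main() above loads the analyze file with yaml.load() and iterates it
# as a mapping of entry name -> feature dict. A minimal sketch of how a
# compatible file could be generated; the entry name ('day1'), the file name
# ('analyze.yml'), and the numbers are made-up illustrations, not values from
# the original example.
import yaml

sample = {
    'day1': {
        'season': 'winter',
        'avetemp': 3.2,
        'maxtemp': 7.5,
        'mintemp': -1.0,
        'pressure': 1021.3,
        'humidity': 55.0,
    }
}

with open('analyze.yml', 'w') as f:
    yaml.safe_dump(sample, f, default_flow_style=False)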
def setUp(self):
    self.config = {
        "method": "AROW",
        "converter": {
            "string_filter_types": {},
            "string_filter_rules": [],
            "num_filter_types": {},
            "num_filter_rules": [],
            "string_types": {},
            "string_rules": [{"key": "*", "type": "str",
                              "sample_weight": "bin", "global_weight": "bin"}],
            "num_types": {},
            "num_rules": [{"key": "*", "type": "num"}],
        },
        "parameter": {"regularization_weight": 1.001},
    }

    TestUtil.write_file("config_classifier.json", json.dumps(self.config))
    self.srv = TestUtil.fork_process("classifier", port, "config_classifier.json")
    self.cli = classifier(host, port)
class Trainer(StreamListener):
    classifier = client.classifier(host, port)

    def __init__(self, locations):
        super(Trainer, self).__init__()
        self.locations = locations

    '''
    Format of 'status' can be found in:
      https://dev.twitter.com/docs/platform-objects/tweets
    '''
    def on_status(self, status):
        if not hasattr(status, 'text'):
            return
        if not hasattr(status, 'coordinates'):
            return
        if not status.coordinates or not 'coordinates' in status.coordinates:
            return

        loc = None
        for l in self.locations:
            coordinates = status.coordinates['coordinates']
            if l.is_inside(coordinates[0], coordinates[1]):
                loc = l
                break
        if not loc:
            # Unknown location
            return

        hashtags = status.entities['hashtags']
        detagged_text = remove_hashtags_from_tweet(status.text, hashtags)

        # Create datum for Jubatus
        d = types.datum([], [])
        d.string_values = [('text', detagged_text)]

        # Send training data to Jubatus
        self.classifier.train(instance_name, [(loc.name, d)])

        # Print trained tweet
        print_green(loc.name, ' ')
        print detagged_text
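# NOTE: remove_hashtags_from_tweet() is called above but not defined in this
# snippet. A minimal sketch, assuming each hashtag entity carries an
# 'indices' pair of character offsets as in Twitter's tweet objects; only
# the name and signature come from the call above, the body is an assumption.
def remove_hashtags_from_tweet(text, hashtags):
    # Cut out each hashtag span, working right-to-left so that earlier
    # offsets stay valid while we slice.
    spans = sorted([h['indices'] for h in hashtags], reverse=True)
    for start, end in spans:
        text = text[:start] + text[end:]
    return text.strip()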
def estimate_location_for(text):
    classifier = client.classifier(host, port)

    # Create datum for Jubatus
    d = types.datum([], [])
    d.string_values = [('text', text)]

    # Send estimation query to Jubatus
    result = classifier.classify(instance_name, [d])

    if len(result[0]) > 0:
        # Sort results by score
        est = sorted(result[0], key=lambda e: e.score, reverse=True)

        # Print the result
        print "Estimated Location for %s:" % text
        for e in est:
            print "  " + e.label + " (" + str(e.score) + ")"
    else:
        # No estimation results; maybe we haven't trained enough
        print "No estimation results available."
        print "Train more tweets or try using another text."
def train_wikipedia_abstract(label, xmlfile):
    # Stream-parse the Wikipedia abstract dump and feed each abstract to the
    # classifier under the given label.
    classifier = client.classifier(host, port)
    parser = xml.sax.make_parser()
    parser.setContentHandler(Handler(classifier, label))
    parser.parse(xmlfile)
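# NOTE: Handler is referenced above but not shown. A minimal sketch of a SAX
# content handler that trains one labeled example per <abstract> element;
# the class name and constructor arguments come from the call above, while
# the element name and the method bodies are assumptions.
import xml.sax.handler

class Handler(xml.sax.handler.ContentHandler):
    def __init__(self, classifier, label):
        xml.sax.handler.ContentHandler.__init__(self)
        self.classifier = classifier
        self.label = label
        self.in_abstract = False
        self.buf = []

    def startElement(self, name, attrs):
        if name == 'abstract':
            self.in_abstract = True
            self.buf = []

    def characters(self, content):
        if self.in_abstract:
            self.buf.append(content)

    def endElement(self, name):
        if name == 'abstract':
            self.in_abstract = False
            text = ''.join(self.buf).strip()
            if text:
                # Train one labeled example per abstract
                d = types.datum([('text', text)], [])
                self.classifier.train(instance_name, [(self.label, d)])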
    return result

def parse_args():
    from optparse import OptionParser
    p = OptionParser()
    p.add_option('-s', '--server_ip', action='store', dest='server_ip',
                 type='string', default='192.168.170.129')
    # default must be an int; optparse does not coerce string defaults
    p.add_option('-p', '--server_port', action='store', dest='server_port',
                 type='int', default=9199)
    p.add_option('-n', '--name', action='store', dest='name',
                 type='string', default='tutorial')
    return p.parse_args()

if __name__ == '__main__':
    options, remainder = parse_args()
    classifier = client.classifier(options.server_ip, options.server_port)
    pname = options.name
    print classifier.get_config(pname)
    print classifier.get_status(pname)
    # UCI "adult" census dataset: one comma-separated record per line
    for line in open('adult.data'):
        (age, workclass, fnlwgt, education, education_num, marital_status,
         occupation, relationship, race, sex, capital_gain, capital_loss,
         hours_per_week, native_country, income) = line[:-1].split(',')
        datum = types.datum(
            [('workclass', workclass), ('sex', sex), ('occupation', occupation),
             ('education', education), ('marital_status', marital_status),
             ('native_country', native_country), ('race', race),
             ('relationship', relationship)],
            [('age', float(age)), ('hours_per_week', float(hours_per_week)),
             ('education_num', float(education_num))])
        classifier.train(pname, [(income, datum)])
    print classifier.get_status(pname)
    print classifier.save(pname, "tutorial")
#!/usr/bin/env python
import subprocess
import random
from jubatus.classifier import client
from jubatus.classifier import types

NAME = "a"

classifier = client.classifier("127.0.0.1", 9199)

# Collect the training files, one per label, named <label>_train.txt
file_list = subprocess.check_output("ls | grep _train.txt",
                                    shell=True).split('\n')[0:-1]
fds = map(lambda x: [x.replace("_train.txt", ""), open(x, "r")], file_list)

# Feed the files line by line, picking a label at random each time, until
# every file is exhausted
while fds != []:
    [label, fd] = random.choice(fds)
    text = fd.readline()
    if text == "":
        fds.remove([label, fd])
        print("finished train of label %s \n" % (label))
        continue
    text_strip = text.rstrip()
    datum = types.datum([["text", text_strip]], [])
    print("train %s : %s ..." % (label, text_strip))
    classifier.train(NAME, [(label, datum)])
#!/usr/bin/env python
from jubatus.classifier import client
from jubatus.classifier import types

# Connect once, then classify whatever the user types until an empty line
classifier = client.classifier("127.0.0.1", 9199)

while True:
    buf = raw_input("> ")
    if buf == "":
        break
    datum = types.datum([["text", buf.rstrip()]], [])
    result = classifier.classify("", [datum])
    if len(result[0]) == 0:
        print("nothing")
        continue
    # Show every candidate label, best score first
    result[0].sort(key=lambda x: x.score, reverse=True)
    for res in result[0]:
        print(res.label + " -> " + str(res.score))