Example #1
    def setUp(self):
        self.srv = TestUtil.fork_process("classifier", port)
        self.cli = classifier(host, port)
        method = "AROW"
        self.converter = '''{
            "string_filter_types": {}, "string_filter_rules": [],
            "num_filter_types": {}, "num_filter_rules": [],
            "string_types": {},
            "string_rules": [{"key": "*", "type": "space",
                              "sample_weight": "bin", "global_weight": "bin"}],
            "num_types": {}, "num_rules": [{"key": "*", "type": "num"}]
        }'''
        cd = config_data(method, self.converter)
        self.cli.set_config("name", cd)
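The converter embedded above is Jubatus feature-extraction (fv_converter) JSON: every string value is split on whitespace ("type": "space") with binary weights, and every numeric value is used as-is. One quick way to sanity-check such an embedded string before handing it to config_data is to round-trip it through the json module (a minimal sketch, not part of the original test):

import json

# json.loads() raises ValueError if the embedded converter is malformed.
converter = (
    '{"string_filter_types": {}, "string_filter_rules": [],'
    ' "num_filter_types": {}, "num_filter_rules": [],'
    ' "string_types": {},'
    ' "string_rules": [{"key": "*", "type": "space",'
    ' "sample_weight": "bin", "global_weight": "bin"}],'
    ' "num_types": {}, "num_rules": [{"key": "*", "type": "num"}]}'
)
print(json.loads(converter)["string_rules"][0]["type"])  # -> space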
Example #2

class TweetAnalyzer(StreamListener):
    classifier = client.classifier(host, port)

    def __init__(self, highlight):
        super(TweetAnalyzer, self).__init__()
        self.highlight = highlight

    def on_status(self, status):
        if not hasattr(status, 'text'):
            return

        d = types.datum([], [])
        d.string_values = [
            ['text', status.text],
        ]
        result = self.classifier.classify(instance_name, [d])

        if len(result) > 0 and len(result[0]) > 0:
            # sort the result in order of score
            est = sorted(result[0], key=lambda est: est.score, reverse=True)

            print_green(est[0].label, end=" ")
            if est[0].label == self.highlight:
                print_red(status.text)
            else:
                print(status.text)
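TweetAnalyzer only defines the listener; it still has to be attached to a stream. A minimal sketch of the wiring, assuming the pre-4.0 tweepy streaming API and placeholder credentials and label:

import tweepy

auth = tweepy.OAuthHandler('<consumer_key>', '<consumer_secret>')
auth.set_access_token('<access_token>', '<access_token_secret>')

# Classify the public sample stream as it arrives, highlighting one label.
stream = tweepy.Stream(auth, TweetAnalyzer('positive'))
stream.sample()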
Example #3
def main():
  args = parse_options()

  client = classifier('127.0.0.1', 9199)

  # train
  num = 0
  if args.traindata:
    with open(args.traindata, 'rU') as traindata:
      for data in traindata:

        # skip blank lines and comments
        if not data.strip() or data.startswith('#'):
          continue
        num += 1

        season, avetemp, maxtemp, mintemp, pressure, humidity = map(str.strip, data.strip().split(','))
        num_values = [
          ['avetemp', float(avetemp)],
          ['maxtemp', float(maxtemp)],
          ['mintemp', float(mintemp)],
          ['pressure', float(pressure)],
          ['humidity', float(humidity)]
        ]
        d = datum([], num_values)
        train_data = [[season, d]]

        # train
        client.train('', train_data)

    # report the number of trained samples
    print 'train ...', num

    # save the trained model
    print "save :", client.save('', "weather")

  # analyze
  with open(args.analyzedata, 'r') as analyzedata:
    weather = yaml.load(analyzedata)
    for k, v in weather.iteritems():
      print str(k), "(", str(v['season']), ")"
      num_values = [
        ['avetemp', float(v['avetemp'])],
        ['maxtemp', float(v['maxtemp'])],
        ['mintemp', float(v['mintemp'])],
        ['pressure', float(v['pressure'])],
        ['humidity', float(v['humidity'])]
      ]
      d = datum([], num_values)
      analyze_data = [d]
      results = client.classify('', analyze_data)
      # sort each result by score and print the top five labels
      for result in results:
        result.sort(key=lambda x: x.score, reverse=True)
        for est in result[:5]:
          print est.label, est.score
        print
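The script never shows what args.analyzedata looks like. Judging from the keys it reads back (season plus the five numeric features), a compatible YAML file could be generated like this (the layout is an assumption, not the original data):

import yaml

sample = {
    '2013-01-15': {'season': 'winter', 'avetemp': 3.2, 'maxtemp': 7.1,
                   'mintemp': -0.4, 'pressure': 1021.0, 'humidity': 45.0},
}
with open('analyze.yml', 'w') as f:
    yaml.dump(sample, f, default_flow_style=False)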
Example #4
    def setUp(self):
        self.config = {
            "method": "AROW",
            "converter": {
                "string_filter_types": {},
                "string_filter_rules": [],
                "num_filter_types": {},
                "num_filter_rules": [],
                "string_types": {},
                "string_rules": [{"key": "*", "type": "str", "sample_weight": "bin", "global_weight": "bin"}],
                "num_types": {},
                "num_rules": [{"key": "*", "type": "num"}],
            },
            "parameter": {"regularization_weight": 1.001},
        }

        TestUtil.write_file("config_classifier.json", json.dumps(self.config))
        self.srv = TestUtil.fork_process("classifier", port, "config_classifier.json")
        self.cli = classifier(host, port)
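Unlike Example #1, which pushes the configuration over RPC with set_config, this test writes it to a JSON file and hands it to the forked server at startup. A quick round-trip check of the written file (a sketch, not part of the original test):

import json

with open('config_classifier.json') as f:
    config = json.load(f)
assert config['method'] == 'AROW'
assert config['parameter']['regularization_weight'] == 1.001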
Example #5
class Trainer(StreamListener):
    classifier = client.classifier(host, port)

    def __init__(self, locations):
        super(Trainer, self).__init__()
        self.locations = locations

    '''
    Format of 'status' can be found in:
        https://dev.twitter.com/docs/platform-objects/tweets
    '''

    def on_status(self, status):
        if not hasattr(status, 'text'):
            return
        if not hasattr(status, 'coordinates'):
            return
        if not status.coordinates or 'coordinates' not in status.coordinates:
            return

        loc = None
        for l in self.locations:
            coordinates = status.coordinates['coordinates']
            if l.is_inside(coordinates[0], coordinates[1]):
                loc = l
                break
        if not loc:
            # Unknown location
            return
        hashtags = status.entities['hashtags']
        detagged_text = remove_hashtags_from_tweet(status.text, hashtags)

        # Create datum for Jubatus
        d = types.datum([], [])
        d.string_values = [('text', detagged_text)]

        # Send training data to Jubatus
        self.classifier.train(instance_name, [(loc.name, d)])

        # Print trained tweet
        print_green(loc.name, ' ')
        print detagged_text
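The Location objects passed to Trainer are never defined in this example; all it requires is a name attribute and an is_inside(longitude, latitude) method. A hypothetical bounding-box implementation consistent with that usage:

class Location(object):
    '''Hypothetical reconstruction: the real class is not shown.'''

    def __init__(self, name, west, south, east, north):
        self.name = name
        self.west, self.south = west, south
        self.east, self.north = east, north

    def is_inside(self, longitude, latitude):
        # status.coordinates['coordinates'] is GeoJSON, i.e. [lon, lat].
        return (self.west <= longitude <= self.east and
                self.south <= latitude <= self.north)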
Example #6
def estimate_location_for(text):
    classifier = client.classifier(host, port)

    # Create datum for Jubatus
    d = types.datum([], [])
    d.string_values = [('text', text)]

    # Send estimation query to Jubatus
    result = classifier.classify(instance_name, [d])

    if len(result[0]) > 0:
        # Sort results by score
        est = sorted(result[0], key=lambda e: e.score, reverse=True)

        # Print the result
        print "Estimated Location for %s:" % text
        for e in est:
            print "  " + e.label + " (" + str(e.score) + ")"
    else:
        # No estimation results; maybe we haven't trained enough
        print "No estimation results available."
        print "Train more tweets or try using another text."
Example #7
def estimate_location_for(text):
    classifier = client.classifier(host, port)

    # Create datum for Jubatus
    d = types.datum([], [])
    d.string_values = [('text', text)]

    # Send estimation query to Jubatus
    result = classifier.classify(instance_name, [d])

    if len(result[0]) > 0:
        # Sort results by score
        est = sorted(result[0], key=lambda e: e.score, reverse=True)

        # Print the result
        print "Estimated Location for %s:" % text
        for e in est:
            print "  " + e.label + " (" + str(e.score) + ")"
    else:
        # No estimation results; maybe we haven't trained enough
        print "No estimation results available."
        print "Train more tweets or try using another text."
def train_wikipedia_abstract(label, xmlfile):
    classifier = client.classifier(host, port)

    parser = xml.sax.make_parser()
    parser.setContentHandler(Handler(classifier, label))
    parser.parse(xmlfile)

def parse_args():
    from optparse import OptionParser, OptionValueError
    p = OptionParser()
    p.add_option('-s', '--server_ip', action='store',
                 dest='server_ip', type='string', default='192.168.170.129')
    p.add_option('-p', '--server_port', action='store',
                 dest='server_port', type='int', default=9199)
    p.add_option('-n', '--name', action='store',
                 dest='name', type='string', default='tutorial')
    return p.parse_args()

if __name__ == '__main__':
    options, remainder = parse_args()
    classifier = client.classifier(options.server_ip, options.server_port)
    pname = options.name

    print classifier.get_config(pname)
    print classifier.get_status(pname)

    for line in open('adult.data'):
        (age, workclass, fnlwgt, education, education_num, marital_status,
         occupation, relationship, race, sex, capital_gain, capital_loss,
         hours_per_week, native_country, income) = line[:-1].split(',')
        datum = types.datum(
            [('workclass', workclass), ('sex', sex),
             ('occupation', occupation), ('education', education),
             ('marital_status', marital_status),
             ('native_country', native_country), ('race', race),
             ('relationship', relationship)],
            [('age', float(age)), ('hours_per_week', float(hours_per_week)),
             ('education_num', float(education_num))])
        classifier.train(pname, [(income, datum)])

    print classifier.get_status(pname)

    print classifier.save(pname, "tutorial")
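After training, the same connection can serve queries. A hypothetical follow-up that reuses classifier and pname from above to classify one made-up record (field values are not from adult.data):

d = types.datum(
    [('workclass', 'Private'), ('sex', 'Male'), ('occupation', 'Sales'),
     ('education', 'Bachelors'), ('marital_status', 'Never-married'),
     ('native_country', 'United-States'), ('race', 'White'),
     ('relationship', 'Not-in-family')],
    [('age', 39.0), ('hours_per_week', 40.0), ('education_num', 13.0)])
for e in classifier.classify(pname, [d])[0]:
    print '%s -> %f' % (e.label, e.score)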
Example #10
#!/usr/bin/env python

import random
import subprocess

from jubatus.classifier import client
from jubatus.classifier import types

NAME = "a"
classifier = client.classifier("127.0.0.1", 9199)

file_list = subprocess.check_output("ls | grep _train.txt",
                                    shell=True).split('\n')[0:-1]

fds = map(lambda x: [x.replace("_train.txt", ""), open(x, "r")], file_list)
while fds != []:
    [label, fd] = random.choice(fds)
    text = fd.readline()
    if text == "":
        fds.remove([label, fd])
        print("finished train of label %s \n" % (label))
        continue
    text_strip = text.rstrip()
    datum = types.datum([["text", text_strip]], [])
    print("train %s : %s ..." % (label, text_strip))
    classifier.train(NAME, [(label, datum)])
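Since the training order is randomized across the *_train.txt files, it can be worth persisting the result once the loop finishes; assuming the same save RPC used in Example #3, the model could be kept with:

# Persist the trained model under an id of our choosing (assumed workflow).
print(classifier.save(NAME, "trained"))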
Example #11
#!/usr/bin/env python

from jubatus.classifier import client
from jubatus.classifier import types

# connect once, outside the prompt loop
classifier = client.classifier("127.0.0.1", 9199)

while True:
    buf = raw_input("> ")
    if buf == "":
        break
    datum = types.datum([["text", buf.rstrip()]], [])
    result = classifier.classify("", [datum])
    if len(result[0]) == 0:
        print("nothing")
        continue
    result[0].sort(key=lambda x:x.score, reverse=True)
    for res in result[0]:
        print(res.label + " -> " + str(res.score))
Example #12
def train_wikipedia_abstract(label, xmlfile):
    classifier = client.classifier(host, port)

    parser = xml.sax.make_parser()
    parser.setContentHandler(Handler(classifier, label))
    parser.parse(xmlfile)
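Handler is referenced here and in Example #7 but never shown. A plausible minimal version for a Wikipedia abstract dump (a hypothetical reconstruction, reusing the surrounding types and instance_name globals) would buffer each <abstract> element and train on it:

import xml.sax

class Handler(xml.sax.ContentHandler):
    def __init__(self, classifier, label):
        xml.sax.ContentHandler.__init__(self)
        self.classifier = classifier
        self.label = label
        self.in_abstract = False
        self.buf = []

    def startElement(self, name, attrs):
        if name == 'abstract':
            self.in_abstract = True
            self.buf = []

    def characters(self, content):
        if self.in_abstract:
            self.buf.append(content)

    def endElement(self, name):
        if name != 'abstract':
            return
        self.in_abstract = False
        text = ''.join(self.buf).strip()
        if text:
            d = types.datum([('text', text)], [])
            self.classifier.train(instance_name, [(self.label, d)])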