import pprint
import random
import time
from math import isnan, sqrt

import featurestream as fs


def separator_test(n=2000):
    print 'separator test', 'n=', n
    stream = fs.start_stream(learner='rf_classifier', target='anomaly')
    print 'stream_id=', stream.stream_id
    # train on two disjoint size ranges so 'anomaly' is separable on size
    for _ in xrange(n):
        stream.train({'size': random.randint(1, 49),
                      'type': random.choice(['a', 'b', 'c']),
                      'anomaly': 'true'})
        stream.train({'size': random.randint(50, 100),
                      'type': random.choice(['d', 'a', 'f']),
                      'anomaly': 'false'})
    # predictions
    ok = True
    x = stream.predict({'size': 99})['prediction']
    print x, 'expected false'
    ok &= (x == 'false')
    x = stream.predict({'size': 40, 'type': 'a'})['prediction']
    print x, 'expected true'
    ok &= (x == 'true')
    x = stream.predict({'type': 'f'})['prediction']
    print x, 'expected false'
    ok &= (x == 'false')
    stream.close()
    return ok
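# Example run (a sketch; assumes the endpoint has already been configured,
# e.g. with fs.set_endpoint as in the scripts further below). Expected
# output once the model has learned the separator:
#
#   >>> separator_test(n=2000)
#   false expected false
#   true expected true
#   false expected false
#   True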
def train_test(train_filename, test_filename=None, target=None, learner=None,
               types=None, error=None, verbose=False):
    # csv_iterator (defined elsewhere in these utilities) yields one event
    # dict per CSV row; types maps field names to their declared types.
    types = types or {}
    stream = fs.start_stream(learner=learner, target=target)
    print 'stream_id=', stream.stream_id
    print 'training...', train_filename
    for event in csv_iterator(train_filename, types=types):
        ok = stream.train(event)
        if not ok:
            break
    print 'stats='
    pprint.pprint(stream.stats())
    if test_filename is not None:
        # give the server time to finish processing the training events
        print 'sleeping'
        time.sleep(15)
        print 'testing...', test_filename
        stream.clear_stats()
        n_correct = 0
        n_total = 0
        se = 0.0
        for event in csv_iterator(test_filename, types=types):
            prediction = stream.predict(event)['prediction']
            actual = event[target]
            if verbose:
                print 'actual=', actual, 'predicted=', prediction, 'event=', event
            if error == 'rmse':  # for regression
                if not isnan(float(prediction)):
                    se += (float(prediction) - float(actual)) ** 2
            if error == 'accuracy':  # for classification
                if prediction == actual:
                    n_correct += 1
            n_total += 1
        print 'n_total=', n_total
        if error == 'accuracy':
            print 'n_correct=', n_correct, 'accuracy=', n_correct / float(n_total)
        if error == 'rmse':
            print 'rmse=', sqrt(se / n_total)
    stream.close()
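# Example invocations (sketches; the file names, the types mapping, and the
# 'rf_regressor' learner name are hypothetical, not taken from this code):
#
#   # classification: report accuracy on a held-out file
#   train_test('iris_train.csv', 'iris_test.csv', target='species',
#              learner='rf_classifier', error='accuracy')
#
#   # regression: report RMSE
#   train_test('house_train.csv', 'house_test.csv', target='price',
#              learner='rf_regressor', error='rmse')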
def classify_test(f, n=4000, mink=0, maxk=100):
    stream = fs.start_stream(learner='rf_classifier', target='f')
    print 'n=', n, 'stream_id=', stream.stream_id
    # train phase: label each random integer with f(j)
    for _ in xrange(n):
        j = random.randint(mink, maxk)
        event = {'value': j, 'f': f(j)}
        ok = stream.train(event)
        if not ok:
            return
    # test phase
    # todo
    accuracy = stream.stats('accuracy')
    stream.close()
    return accuracy
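# Example: a labelling function that classify_test can learn from the single
# 'value' feature (a sketch; needs a live endpoint):
#
#   def band(j):
#       return 'low' if j < 50 else 'high'
#
#   print classify_test(band, n=4000)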
    # constructor for a thin wrapper around a featurestream stream
    # (the enclosing class definition is elided here)
    def __init__(self, learner, target='target'):
        self.target = target
        self.stream = fs.start_stream(learner=learner, target=self.target)
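# A minimal sketch of how such a wrapper class might look in full; the class
# name and the delegate methods are hypothetical, not from the original.
import featurestream as fs


class StreamWrapper(object):  # hypothetical name

    def __init__(self, learner, target='target'):
        self.target = target
        self.stream = fs.start_stream(learner=learner, target=self.target)

    def train(self, event):
        # forward one labelled event dict to the underlying stream
        return self.stream.train(event)

    def predict(self, event):
        # return the bare prediction for an unlabelled event dict
        return self.stream.predict(event)['prediction']

    def close(self):
        self.stream.close()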
import json
import sys

import featurestream as fs


def extract_hashtags(s):
    # '#tag' tokens in the tweet text, with the leading '#' stripped
    return set(part[1:] for part in s.split() if part.startswith('#'))


def train(stream, filename):
    i = 0
    with open(filename, 'rb') as file_in:
        for line in file_in:
            try:
                tweet = json.loads(line)
                text = tweet['text']
                print text
                hashtags = extract_hashtags(text)
                for hashtag in hashtags:
                    event = {'text': text, 'hashtag': hashtag}
                    ok = stream.train(event, types={'text': 'TEXT'})
                    if not ok:
                        return  # stop training entirely, not just this tweet
                i += 1
                if i % 1000 == 0:  # progress
                    print i
            except KeyError:
                # skip lines without a 'text' field
                pass


filename = sys.argv[1]

fs.set_endpoint('http://vm:8088/mungio/api')
stream = fs.start_stream(targets={'hashtag': 'CATEGORIC'})
print 'training from', filename
train(stream, filename)
print 'getting stream info'
print stream.get_info()
stream.close()
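# A hypothetical follow-up, to run before stream.close() above: ask the
# trained stream to suggest a hashtag for unseen text (assumes predict
# accepts the same 'text' field used in training):
#
#   result = stream.predict({'text': 'great coffee this morning'})
#   print 'suggested hashtag:', result['prediction']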
import json
import sys

import featurestream as fs
from featurestream import transform

filename = sys.argv[1]


def train(stream, filename):
    i = 0
    with open(filename, 'rb') as file_in:
        for line in file_in:
            try:
                tweet = json.loads(line)
                coords = tweet['geo']['coordinates']
                event = {'lat': coords[0], 'long': coords[1]}
                ok = stream.train(event)
                if not ok:
                    break
                i += 1
                if i % 1000 == 0:
                    print i
            except (KeyError, TypeError):
                # skip tweets whose 'geo' field is missing or null
                pass


clusterer = fs.start_stream(learner='clustering')
print 'got stream_id =', clusterer.stream_id
print 'training from', filename
train(clusterer, filename)
print 'getting stream info'
print clusterer.get_info()
clusterer.close()
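# A hypothetical follow-up, to run before clusterer.close() above; this
# assumes the clustering learner answers predict like the classifiers do,
# returning a cluster assignment:
#
#   result = clusterer.predict({'lat': 37.77, 'long': -122.42})
#   print 'cluster:', result['prediction']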
import gzip
import json
import sys

import featurestream as fs

# 'mapping' is the field-extraction spec for ExtractFieldsTransform (both
# come from the featurestream transform utilities); its definition is
# truncated in the source, ending with:
#     ...
#     {'name': 'statuses-count',
#      'source': '[twitter][retweet][user][statuses_count]'},
# ]}


def train(stream, filename):
    transform = ExtractFieldsTransform(mapping)
    with gzip.open(filename, 'rb') as file_in:
        for event in file_in:
            try:
                training_data = transform.transform(json.loads(event))
                print training_data
                # from unicode, strip punctuation and go lowercase
                # (now done in stringtokenizer transformer on server)
                # s = string.lower(training_data['content'].encode('ascii', 'ignore'))
                # training_data['content'] = s.translate(string.maketrans("", ""), string.punctuation)
                training_data['positive'] = '+' if training_data['sentiment'] >= 0 else '-'
                del training_data['sentiment']  # target leak
                ok = stream.train(training_data)
                if not ok:
                    break
            except KeyError:
                # either a tick or missing gender/content
                pass


filename = sys.argv[1]

stream = fs.start_stream('rf_classifier', 'positive')
print 'training from', filename
train(stream, filename)
print 'getting stream info'
print stream.get_stats()
stream.close()
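# A hypothetical spot check, to run before stream.close() above; the
# 'content' field name follows the commented-out preprocessing in train():
#
#   result = stream.predict({'content': 'what a fantastic day'})
#   print 'sentiment:', result['prediction']  # '+' or '-'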
import json
import sys

import featurestream as fs

# sentiment_pos / sentiment_neg (lists of positive / negative sentiment
# tokens) and the ExtractFieldsTransform instance 'transform' are defined
# above; their definitions are elided in the source.

filename = sys.argv[1]


def train(stream, filename):
    i = 0
    with open(filename, 'rb') as file_in:
        for line in file_in:
            try:
                event = transform.transform(json.loads(line))
                # from unicode, strip punctuation and go lowercase
                # (now done in stringtokenizer transformer on server)
                # s = string.lower(training_data['text'].encode('ascii','ignore'))
                # event['text'] = s.translate(string.maketrans("",""),string.punctuation)
                # count sentiment tokens; only train on tweets that carry a label
                pos = sum(c in event['text'] for c in sentiment_pos)
                neg = sum(c in event['text'] for c in sentiment_neg)
                if pos + neg > 0:
                    # event['sentiment'] = str(pos - neg)
                    event['positive'] = 'true' if pos > 0 else 'false'
                    ok = stream.train(event)
                    if not ok:
                        break
                i += 1
                if i % 1000 == 0:
                    print i
            except KeyError:
                pass


stream = fs.start_stream(learner='rf_classifier', target='positive')
print 'got stream_id =', stream.stream_id
print 'training from', filename
train(stream, filename)
print 'getting stream info'
print stream.get_stats()
stream.close()
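# A hypothetical accuracy check, mirroring the classify_test helper above
# (which queries stats by metric name); run before stream.close():
#
#   print 'training accuracy:', stream.stats('accuracy')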