Example #1

import random
import featurestream as fs  # assumed import name for the featurestream client

def separator_test(n=2000):
    print 'separator test', 'n=', n
    stream = fs.start_stream(learner='rf_classifier', target='anomaly')
    print 'stream_id=', stream.stream_id
    # two easily separable classes: small sizes are labelled 'true',
    # large sizes 'false'
    for _ in xrange(n):
        stream.train({
            'size': random.randint(1, 50),
            'type': random.choice(['a', 'b', 'c']),
            'anomaly': 'true'
        })
        stream.train({
            'size': random.randint(50, 100),
            'type': random.choice(['d', 'a', 'f']),
            'anomaly': 'false'
        })
    # predictions
    ok = True
    x = stream.predict({'size': 99})['prediction']
    print x, 'expected false'
    ok &= (x == 'false')
    x = stream.predict({'size': 40, 'type': 'a'})['prediction']
    print x, 'expected true'
    ok &= (x == 'true')
    x = stream.predict({'type': 'f'})['prediction']
    print x, 'expected false'
    ok &= (x == 'false')
    stream.close()
    return ok
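
A hypothetical driver; separator_test returns True only if all three
spot-check predictions come back as expected:

print 'separator test passed:', separator_test(n=1000)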
Example #2

import pprint
import time
from math import isnan, sqrt

import featurestream as fs  # assumed import name

# csv_iterator is a helper from the featurestream client that yields one
# {column: value} dict per CSV row (exact import path not shown here).

def train_test(train_filename,
               test_filename=None,
               target=None,
               learner=None,
               types={},
               error=None,
               verbose=False):

    stream = fs.start_stream(learner=learner, target=target)
    print 'stream_id=', stream.stream_id

    print 'training...', train_filename
    for event in csv_iterator(train_filename, types=types):
        ok = stream.train(event)
        if not ok: break

    print 'stats='
    pprint.pprint(stream.stats())

    if test_filename is not None:
        print 'sleeping'
        time.sleep(15)  # give the server a moment to finish ingesting the training events
        print 'testing...', test_filename
        stream.clear_stats()
        n_correct = 0
        n_total = 0
        n_valid = 0  # non-NaN predictions, used as the rmse denominator
        se = 0.0
        for event in csv_iterator(test_filename, types=types):
            #            print event
            prediction = stream.predict(event)['prediction']

            actual = event[target]
            if verbose:
                print 'actual=', actual, 'predicted=', prediction, 'event=', event
            if error == 'rmse':
                # for regression; NaN predictions are skipped
                if not isnan(float(prediction)):
                    se += (float(prediction) - float(actual))**2
                    n_valid += 1
            if error == 'accuracy':
                # for classification
                if prediction == actual:
                    n_correct += 1
            n_total += 1

        print 'n_total=', n_total
        if error == 'accuracy':
            print 'n_correct=', n_correct, 'accuracy=', n_correct / float(n_total)
        if error == 'rmse' and n_valid > 0:
            print 'rmse=', sqrt(se / n_valid)

    stream.close()
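
A hypothetical invocation; the file names and target column are made up, while
'rf_classifier' and the 'accuracy' error mode appear elsewhere in these
examples:

train_test('events_train.csv',
           test_filename='events_test.csv',
           target='label',
           learner='rf_classifier',
           error='accuracy')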
Example #3

import random
import featurestream as fs  # assumed import name

# f maps an integer to a class label; the stream should learn to recover f
def classify_test(f, n=4000, mink=0, maxk=100):
    stream = fs.start_stream(learner='rf_classifier', target='f')
    print 'n=', n, 'stream_id=', stream.stream_id
    # train phase
    for _ in xrange(n):
        j = random.randint(mink, maxk)
        event = {}
        event['value'] = j
        event['f'] = f(j)
        ok = stream.train(event)
        if not ok: return
    # test phase
    # todo

    accuracy = stream.stats('accuracy')
    stream.close()
    return accuracy
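
A hypothetical call, training the stream to recover a parity labelling:

parity = lambda j: 'even' if j % 2 == 0 else 'odd'
print 'accuracy =', classify_test(parity, n=2000)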
Example #4

import json
import sys
import featurestream as fs  # assumed import name

filename = sys.argv[1]  # JSON tweets, one per line (assumed, as in the clustering example below)

def extract_hashtags(s):
    # return the set of '#'-stripped hashtags appearing in s
    return set(part[1:] for part in s.split() if part.startswith('#'))


def train(stream, filename):
    i = 0
    with open(filename, 'rb') as file_in:
        for line in file_in:
            try:
                tweet = json.loads(line)
                text = tweet['text']
                print text
                hashtags = extract_hashtags(text)
                for hashtag in hashtags:
                    event = {'text': text, 'hashtag': hashtag}
                    ok = stream.train(event, types={'text': 'TEXT'})
                    if not ok: break
            except KeyError:
                pass


fs.set_endpoint('http://vm:8088/mungio/api')
stream = fs.start_stream(targets={'hashtag': 'CATEGORIC'})

print 'training from ', filename
train(stream, filename)

print 'getting stream info'
print stream.get_info()

stream.close()
Example #5

    def __init__(self, learner, target='target'):
        self.target = target
        self.stream = fs.start_stream(learner=learner, target=self.target)
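
The fragment above is the constructor of a small wrapper class. A sketch of
how such a wrapper might be completed; the class name and the pass-through
methods are assumptions, not part of the original source:

class StreamWrapper(object):
    def __init__(self, learner, target='target'):
        self.target = target
        self.stream = fs.start_stream(learner=learner, target=self.target)

    def train(self, event):
        # forward one {feature: value} event to the stream
        return self.stream.train(event)

    def predict(self, event):
        return self.stream.predict(event)['prediction']

    def close(self):
        self.stream.close()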
Example #6

import gzip
import json
import sys
import featurestream as fs  # assumed import name
from featurestream.transform import ExtractFieldsTransform  # assumed module path

# tail of the field mapping fed to ExtractFieldsTransform; the rest of the
# mapping is truncated in the original:
#   {'name': 'statuses-count', 'source': '[twitter][retweet][user][statuses_count]'}
# ]}

file = sys.argv[1]  # gzipped JSON events (assumed; the original definition is truncated)

def train(stream, file):

    transform = ExtractFieldsTransform(mapping)
    with gzip.open(file, 'rb') as file_in:
        for event in file_in:
            try:
                training_data = transform.transform(json.loads(event))
                print training_data
                # from unicode, strip punctuation and go lowercase
                # (now done in stringtokenizer transformer on server)
                # s = string.lower(training_data['content'].encode('ascii','ignore'))
                # training_data['content'] = s.translate(string.maketrans("",""),string.punctuation)
                training_data['positive'] = '+' if training_data['sentiment'] >= 0 else '-'
                del training_data['sentiment']  # remove the raw score: it would leak the target
                ok = stream.train(training_data)
                if not ok: break
            except KeyError:
                # either a tick or missing gender/content
                pass

stream = fs.start_stream('rf_classifier', 'positive')

print 'training from ', file
train(stream, file)

print 'getting stream info'
print stream.get_stats()
Example #7

import json
import sys
import featurestream as fs  # assumed import name
from featurestream import transform

filename = sys.argv[1]

def train(stream, filename):
    i = 0
    with open(filename, 'rb') as file_in:
        for line in file_in:
            try:
                tweet = json.loads(line)
                coords = tweet['geo']['coordinates']
                event = {'lat': coords[0], 'long': coords[1]}
                ok = stream.train(event)
                if not ok: break
                i += 1
                if i % 1000 == 0: print i  # progress marker
            except KeyError:
                # skip tweets without geo information
                pass

clusterer = fs.start_stream(learner='clustering')
print 'got stream_id =', clusterer.stream_id

print 'training from ', filename
train(clusterer, filename)

print 'getting stream info'
print clusterer.get_info()

clusterer.close()
Example #8

import json
import sys
import featurestream as fs  # assumed import name

# sentiment_pos and sentiment_neg are collections of positive/negative marker
# strings, and transform a field transformer, both defined earlier in the
# (truncated) original script.

filename = sys.argv[1]

def train(stream, filename):
    i = 0
    with open(filename, 'rb') as file_in:
        for line in file_in:
            try:
                event = transform.transform(json.loads(line))
                # from unicode, strip punctuation and go lowercase
                # (now done in stringtokenizer transformer on server)
                # s = string.lower(training_data['text'].encode('ascii','ignore'))
                # event['text'] = s.translate(string.maketrans("",""),string.punctuation)

                # label sentiment when any marker string matches
                pos = sum([c in event['text'] for c in sentiment_pos])
                neg = sum([c in event['text'] for c in sentiment_neg])
                if pos + neg > 0:
                    # event['sentiment'] = str(pos-neg)
                    event['positive'] = 'true' if pos > 0 else 'false'
                ok = stream.train(event)
                if not ok: break
                i += 1
                if i % 1000 == 0: print i  # progress marker
            except KeyError:
                pass

stream = fs.start_stream(learner='rf_classifier', target='positive')
print 'got stream_id =', stream.stream_id

print 'training from ', filename
train(stream, filename)

print 'getting stream info'
print stream.get_stats()

stream.close()