def train(stream, file):
    """Train `stream` on labelled events read from a gzipped JSON-lines file.

    Every line of the file is decoded with ``json.loads`` and passed through
    ``ExtractFieldsTransform(mapping)``.  The event's ``sentiment`` value is
    turned into a ``positive`` label (``'+'`` when ``sentiment >= 0``,
    otherwise ``'-'``) and then removed, so the target is not also presented
    as an input feature.  Events for which the transform or the
    ``sentiment`` lookup raises ``KeyError`` are skipped.  Reading stops as
    soon as ``stream.train`` returns a falsy value.

    Args:
        stream: object whose ``train(event)`` method returns a truthy value
            on success.
        file: argument handed to ``gzip.open``.  The name shadows a builtin
            but is kept so existing keyword callers continue to work.
    """
    transform = ExtractFieldsTransform(mapping)
    with gzip.open(file, 'rb') as file_in:
        for line in file_in:
            # Only the field extraction is guarded: a KeyError here means the
            # event is a tick or lacks a required field and is skipped.  A
            # KeyError raised inside stream.train is a real failure and must
            # not be mistaken for a malformed event.
            try:
                training_data = transform.transform(json.loads(line))
                # Trace of each transformed event.  Written as a call with a
                # single argument so it behaves the same on Python 2 and 3.
                print(training_data)
                # Lower-casing and punctuation stripping are now done by the
                # stringtokenizer transformer on the server.
                sentiment = training_data['sentiment']
                training_data['positive'] = '+' if sentiment >= 0 else '-'
                del training_data['sentiment']  # target leak
            except KeyError:
                continue
            if not stream.train(training_data):
                break
def train(stream, file):
    """Train `stream` on labelled events read from a gzipped JSON-lines file.

    Every line of the file is decoded with ``json.loads`` and passed through
    ``ExtractFieldsTransform(mapping)``.  The event's ``sentiment`` value is
    turned into a ``positive`` label (``'+'`` when ``sentiment >= 0``,
    otherwise ``'-'``) and then removed, so the target is not also presented
    as an input feature.  Events for which the transform or the
    ``sentiment`` lookup raises ``KeyError`` are skipped.  Reading stops as
    soon as ``stream.train`` returns a falsy value.

    Args:
        stream: object whose ``train(event)`` method returns a truthy value
            on success.
        file: argument handed to ``gzip.open``.  The name shadows a builtin
            but is kept so existing keyword callers continue to work.
    """
    transform = ExtractFieldsTransform(mapping)
    with gzip.open(file, 'rb') as file_in:
        for line in file_in:
            # Only the field extraction is guarded: a KeyError here means the
            # event is a tick or lacks a required field and is skipped.  A
            # KeyError raised inside stream.train is a real failure and must
            # not be mistaken for a malformed event.
            try:
                training_data = transform.transform(json.loads(line))
                # Trace of each transformed event.  Written as a call with a
                # single argument so it behaves the same on Python 2 and 3.
                print(training_data)
                # Lower-casing and punctuation stripping are now done by the
                # stringtokenizer transformer on the server.
                sentiment = training_data['sentiment']
                training_data['positive'] = '+' if sentiment >= 0 else '-'
                del training_data['sentiment']  # target leak
            except KeyError:
                continue
            if not stream.train(training_data):
                break
示例#3
0
def train(stream, filename):
    """Train `stream` on events from a gzipped JSON-lines file, labelling by marker.

    Every line of the file is decoded with ``json.loads`` and passed through
    ``ExtractFieldsTransform(mapping)``.  The event's ``text`` is then
    checked against the module-level ``sentiment_pos`` and ``sentiment_neg``
    collections:

    * if any ``sentiment_pos`` entry occurs in the text, ``positive`` is set
      to ``'true'`` (this also covers texts containing entries from both
      collections);
    * otherwise, if any ``sentiment_neg`` entry occurs, ``positive`` is set
      to ``'false'``;
    * otherwise the event is trained without a ``positive`` field.

    Events for which the transform or the ``text`` lookup raises
    ``KeyError`` are skipped.  Reading stops as soon as ``stream.train``
    returns a falsy value.  A running count is printed after every 1000
    successfully trained events.

    Args:
        stream: object whose ``train(event)`` method returns a truthy value
            on success.
        filename: argument handed to ``gzip.open``.
    """
    transform = ExtractFieldsTransform(mapping)
    trained = 0
    with gzip.open(filename, 'rb') as file_in:
        for line in file_in:
            # Only extraction and labelling are guarded; a KeyError raised
            # inside stream.train must surface instead of being treated as
            # a malformed event.
            try:
                event = transform.transform(json.loads(line))
                # Lower-casing and punctuation stripping are now done by the
                # stringtokenizer transformer on the server.
                # any() stops at the first hit; only the presence of a
                # marker matters for the label, not how many there are.
                if any(marker in event['text'] for marker in sentiment_pos):
                    event['positive'] = 'true'
                elif any(marker in event['text'] for marker in sentiment_neg):
                    event['positive'] = 'false'
            except KeyError:
                continue
            if not stream.train(event):
                break
            trained += 1
            if trained % 1000 == 0:
                # Single-argument call form prints identically on Python 2
                # and Python 3.
                print(trained)