def train(stream, file):
    """Feed labelled events from a gzipped JSON-lines file into ``stream``.

    Every line read from ``file`` is parsed with ``json.loads`` and passed
    through ``ExtractFieldsTransform(mapping)``.  The ``sentiment`` field of
    the result is converted into a ``positive`` label (``'+'`` when
    ``sentiment >= 0``, otherwise ``'-'``) and then deleted so the target
    does not leak into the features.  The run stops as soon as
    ``stream.train`` returns a falsy value.

    Args:
        stream: object exposing ``train(event)``; a falsy return value
            ends the run early.
        file: handed to ``gzip.open(file, 'rb')``; iterated line by line.

    Returns:
        None.
    """
    transform = ExtractFieldsTransform(mapping)
    with gzip.open(file, 'rb') as file_in:
        for event in file_in:
            try:
                training_data = transform.transform(json.loads(event))
                # Parenthesised so the module also parses on Python 3; with
                # a single argument the Python 2 output is unchanged.
                print(training_data)
                # Lower-casing / punctuation stripping of 'content' used to
                # happen here; per the original note it is now done by the
                # stringtokenizer transformer on the server.
                training_data['positive'] = (
                    '+' if training_data['sentiment'] >= 0 else '-'
                )
                del training_data['sentiment']  # target leak
                ok = stream.train(training_data)
                if not ok:
                    break
            except KeyError:
                # either a tick or missing gender/content
                pass
def train(stream, file):
    """Train ``stream`` on sentiment-labelled events read from ``file``.

    ``file`` is opened with ``gzip.open(file, 'rb')`` and consumed one line
    at a time.  Each line is decoded with ``json.loads`` and run through
    ``ExtractFieldsTransform(mapping)``; the transformed record then gets a
    ``positive`` key (``'+'`` if its ``sentiment`` is ``>= 0``, else
    ``'-'``) and loses its ``sentiment`` key before being handed to
    ``stream.train``.

    Args:
        stream: object with a ``train(event)`` method.  When that method
            returns a falsy value the loop is abandoned.
        file: argument forwarded to ``gzip.open``.

    Returns:
        None.
    """
    transform = ExtractFieldsTransform(mapping)
    with gzip.open(file, 'rb') as file_in:
        for event in file_in:
            try:
                training_data = transform.transform(json.loads(event))
                # print(...) with one argument behaves the same on Python 2
                # and, unlike the print statement, is valid on Python 3.
                print(training_data)
                # Text normalisation (ASCII, lower-case, no punctuation) is
                # no longer done client-side; the original note says the
                # server's stringtokenizer transformer handles it.
                sentiment = training_data['sentiment']
                training_data['positive'] = '+' if sentiment >= 0 else '-'
                del training_data['sentiment']  # target leak
                ok = stream.train(training_data)
                if not ok:
                    break
            except KeyError:
                # either a tick or missing gender/content
                pass
def train(stream, filename):
    """Train ``stream`` on events from a gzipped JSON-lines file.

    Each line of ``filename`` is parsed with ``json.loads`` and passed
    through ``ExtractFieldsTransform(mapping)``.  The number of entries of
    ``sentiment_pos`` and ``sentiment_neg`` found in ``event['text']`` is
    counted; when at least one is present the event gets a ``positive``
    key of ``'true'`` (any positive marker found) or ``'false'``.  The
    event is then sent to ``stream.train``; a falsy result stops the run.
    A progress counter is printed after every 1000 trained events.

    Args:
        stream: object exposing ``train(event)``.
        filename: forwarded to ``gzip.open(filename, 'rb')``.

    Returns:
        None.
    """
    i = 0
    transform = ExtractFieldsTransform(mapping)
    with gzip.open(filename, 'rb') as file_in:
        for line in file_in:
            try:
                event = transform.transform(json.loads(line))
                # Text normalisation (lower-case, strip punctuation) is not
                # done here; per the original note the server's
                # stringtokenizer transformer takes care of it.

                # add sentiment if exists
                # event['text'] is looked up per marker, exactly as before,
                # so a missing 'text' only raises KeyError when there is at
                # least one marker to test.
                pos = sum(c in event['text'] for c in sentiment_pos)
                neg = sum(c in event['text'] for c in sentiment_neg)
                if pos + neg > 0:
                    event['positive'] = 'true' if pos > 0 else 'false'
                # NOTE(review): the source line had lost its indentation;
                # training is assumed to apply to every event, labelled or
                # not ("add sentiment if exists") -- confirm.
                ok = stream.train(event)
                if not ok:
                    break
                i += 1
                if i % 1000 == 0:
                    # Single-argument print(...) is valid on Python 2 and 3.
                    print(i)
            except KeyError:
                # Event lacks a required field; skip it.
                pass