from sentence import Sentence
import text_utility


class Review(object):
    # shared stop-word set, built once for all reviews
    StopWords = text_utility.make_stop_words()

    def __init__(self, id=None, text=None, is_positive=None):
        self.id = id
        # tokenize the raw text (with negation marking) unless no text was given
        self.sent = None if text is None else Sentence.from_raw(
            text, Review.StopWords, neg_mark=True)
        if self.sent is not None:
            self.sent.sentiment = is_positive

    def to_dict(self):
        # serialize into a plain dict for storage
        return {
            '_id': self.id,
            'text': self.sent.raw,
            'words': self.sent.words,
            'is_positive': self.sent.sentiment
        }

    @staticmethod
    def from_dict(d):
        # rebuild a Review from a dict produced by to_dict()
        r = Review()
        r.id = d.get("_id", None)
        r.sent = Sentence()
        r.sent.raw = d.get('text', None)
        r.sent.words = d.get('words', None)
        r.sent.sentiment = d.get('is_positive', None)
        return r
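# --- usage sketch (illustrative only, not part of the original module) ---
# Assumes the sentence/text_utility modules above are importable; it simply
# round-trips a Review through to_dict()/from_dict(), as one would do around
# a document store.
review = Review(id=1, text="I don't love her any more", is_positive=False)
doc = review.to_dict()            # plain dict keyed with '_id', 'text', 'words', 'is_positive'
restored = Review.from_dict(doc)  # rebuild an equivalent Review from that dict
print restored.sent.words         # tokenized words survive the round trip
print restored.sent.sentiment     # so does the sentiment label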
def test_sentence():
    stopwords = text_utility.make_stop_words()
    texts = [
        "can't is a contraction",
        "she isn't my wife any more",
        "I am not in USA right now",
        "I'm a Chinese",
        "1630 NE Valley Rd, Pullman, WA, 99163, Apt X103",
        "I should've done that thing I didn't do",
        "I don't love her any more",
        "I want to divorce without hesitation",
        "bye, Pullman, bye, USA"
    ]
    for index, text in enumerate(texts):
        sent = Sentence.from_raw(text, stopwords, True)
        print "\n******************** {}".format(index + 1)
        print sent.raw
        print "===>"
        print sent.words
import cPickle

import numpy as np
import pandas as pd

from sentence import Sentence
import text_utility

StopWords = text_utility.make_stop_words()


def tokenize_merge(row):
    # merge all headline columns of one day into a single token list
    allwords = []
    for text in row.iloc[1:].dropna():
        # drop leading b / quote characters left over from byte-string reprs
        text = text.lstrip("b\'").lstrip("b\"").lstrip("b\'''")
        s = Sentence.from_raw(text, StopWords, neg_mark=True)
        allwords += s.words
    print allwords  # show progress
    return allwords


def load_tokenize():
    alldata = pd.read_csv("datas/Combined_News_DJIA.csv")
    alldata['Date'] = pd.to_datetime(alldata.Date)
    alldata.set_index('Date', inplace=True)
    allwords = alldata.apply(tokenize_merge, axis=1)
    return pd.concat([alldata.loc[:, 'Label'], allwords], axis=1,
                     keys=['label', 'words'])


if __name__ == "__main__":
    df = load_tokenize()
    cutoff_dt = pd.to_datetime('2015-01-01')
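# --- illustrative continuation (assumption, not the original code) ---
# The __main__ block above ends right after cutoff_dt is defined. Given the
# cPickle import, a plausible next step is to split the frame at that date and
# pickle both halves; the helper name and output path below are invented
# placeholders, not the author's code.
def split_and_pickle(df, cutoff_dt, path="datas/train_test.pkl"):
    train = df.loc[df.index < cutoff_dt]    # trading days before the cutoff
    test = df.loc[df.index >= cutoff_dt]    # trading days from the cutoff on
    with open(path, "wb") as outf:
        cPickle.dump((train, test), outf, protocol=2)
    return train, test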