Example #1
File: review.py Project: stasi009/MyKaggle
from sentence import Sentence
import text_utility


class Review(object):

    # stop-word set shared by every Review, built once at class-definition time
    StopWords = text_utility.make_stop_words()

    def __init__(self, id=None, text=None, is_positive=None):
        self.id = id
        # tokenize the raw text into a Sentence, marking negated words
        self.sent = None if text is None else Sentence.from_raw(
            text, Review.StopWords, neg_mark=True)
        if self.sent is not None:
            self.sent.sentiment = is_positive

    def to_dict(self):
        # serialize the review to a plain dict ('_id' matches the stored document key)
        return {
            '_id': self.id,
            'text': self.sent.raw,
            'words': self.sent.words,
            'is_positive': self.sent.sentiment
        }

    @staticmethod
    def from_dict(d):
        # rebuild a Review from a dict produced by to_dict
        r = Review()
        r.id = d.get("_id", None)

        r.sent = Sentence()
        r.sent.raw = d.get('text', None)
        r.sent.words = d.get('words', None)
        r.sent.sentiment = d.get('is_positive', None)

        return r
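A minimal usage sketch of the Review class above (not from the project): build a review from raw text, serialize it with to_dict, and restore it with from_dict. It assumes review.py, sentence.py and text_utility.py from stasi009/MyKaggle are importable; the sample text and id are made up.

from review import Review

# hypothetical sample review, purely for illustration
r = Review(id=1, text="I don't love this movie any more", is_positive=False)

d = r.to_dict()                  # plain dict, ready to store or dump
restored = Review.from_dict(d)   # round-trip back into a Review

assert restored.id == r.id
assert restored.sent.raw == r.sent.raw
assert restored.sent.words == r.sent.words
assert restored.sent.sentiment == r.sent.sentiment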
Example #2
File: tests.py Project: stasi009/MyKaggle
from sentence import Sentence
import text_utility


def test_sentence():
    stopwords = text_utility.make_stop_words()

    # raw sentences covering contractions, negation and address-like text
    texts = [
        "can't is a contraction", "she isn't my wife any more",
        "I am not in USA right now", "I'm a Chinese",
        "1630 NE Valley Rd, Pullman, WA, 99163, Apt X103",
        "I should've done that thing I didn't do", "I don't love her any more",
        "I want to divorce without hesitation", "bye, Pullman, bye, USA"
    ]

    for index, text in enumerate(texts):
        # the third positional argument is neg_mark
        sent = Sentence.from_raw(text, stopwords, True)
        print "\n******************** {}".format(index + 1)

        print sent.raw
        print "===>"
        print sent.words
Example #3
import cPickle
import numpy as np
import pandas as pd
from sentence import Sentence
import text_utility

StopWords = text_utility.make_stop_words()


def tokenize_merge(row):
    # tokenize every headline column of the row (skipping the Label in position 0)
    # and merge all tokens into one list
    allwords = []
    for text in row.iloc[1:].dropna():
        # lstrip drops any leading b / ' / " characters, i.e. the b'...'
        # bytes-literal prefix left over in the scraped headlines
        text = text.lstrip("b\'").lstrip("b\"").lstrip("b\'''")
        s = Sentence.from_raw(text, StopWords, neg_mark=True)
        allwords += s.words

    print allwords  # show progress
    return allwords


def load_tokenize():
    # load the Combined_News_DJIA dataset and index it by date
    alldata = pd.read_csv("datas/Combined_News_DJIA.csv")
    alldata['Date'] = pd.to_datetime(alldata.Date)
    alldata.set_index('Date', inplace=True)

    allwords = alldata.apply(tokenize_merge, axis=1)
    return pd.concat([alldata.loc[:, 'Label'], allwords],
                     axis=1, keys=['label', 'words'])


if __name__ == "__main__":
    df = load_tokenize()

    cutoff_dt = pd.to_datetime('2015-01-01')
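A hedged sketch, not the project's code, of how cutoff_dt and the cPickle import might plausibly be used next: split the tokenized frame by date into train and test sets and pickle each part. The output file names are made up; df and cutoff_dt are the names from the example above.

# hypothetical follow-up, not from stasi009/MyKaggle
train_df = df[df.index < cutoff_dt]    # rows before the cutoff date
test_df = df[df.index >= cutoff_dt]    # rows on or after the cutoff date

with open("datas/train_words.pkl", "wb") as outf:
    cPickle.dump(train_df, outf)
with open("datas/test_words.pkl", "wb") as outf:
    cPickle.dump(test_df, outf)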