Exemplo n.º 1
0
tokens_filtered = []
for token in tokens:
    if token in stopwords.words('english'):
        tokens_filtered += ["*"]
    else:
        tokens_filtered += [ token ]

# stemming
#normalized_tokens = [stemmer.stem(token) for token in tokens if token not in stopwords.words('english')]
normalized_tokens = [stemmer.stem(token) for token in tokens_filtered]

print "Tokens set filtered and stemmed :", normalized_tokens

window_size = 10
matrix = WordMatrix()

win_start = 0
while win_start + window_size <= len(normalized_tokens):
    window = normalized_tokens[win_start:win_start + window_size]
    first = 0
    second = 1
    while first < len(window):
        second = first + 1
        while second < len(window):
            matrix.add(window[first], window[second], window_size - second + first + 1)
            second += 1
        first += 1
    win_start += 1

print "Co-occurence counted"
Exemplo n.º 2
0
tokens_filtered = []
for token in tokens:
    if token in stopwords.words('english'):
        tokens_filtered += ["*"]
    else:
        tokens_filtered += [token]

# stemming
#normalized_tokens = [stemmer.stem(token) for token in tokens if token not in stopwords.words('english')]
normalized_tokens = [stemmer.stem(token) for token in tokens_filtered]

print "Tokens set filtered and stemmed :", normalized_tokens

window_size = 10
matrix = WordMatrix()

win_start = 0
while win_start + window_size <= len(normalized_tokens):
    window = normalized_tokens[win_start:win_start + window_size]
    first = 0
    second = 1
    while first < len(window):
        second = first + 1
        while second < len(window):
            matrix.add(window[first], window[second],
                       window_size - second + first + 1)
            second += 1
        first += 1
    win_start += 1
Exemplo n.º 3
0
tokens_filtered = []
for token in tokens:
    if token in stopwords.words('english'):
        tokens_filtered += ["*"]
    else:
        tokens_filtered += [ token ]

# stemming
#normalized_tokens = [stemmer.stem(token) for token in tokens if token not in stopwords.words('english')]
normalized_tokens = [stemmer.stem(token) for token in tokens_filtered]

print "Tokens set filtered and stemmed :", normalized_tokens

window_size = 10
matrix = WordMatrix()

win_start = 0
while win_start + window_size <= len(normalized_tokens):
    window = normalized_tokens[win_start:win_start + window_size]
    first = 0
    second = 1
    while first < len(window):
        second = first + 1
        while second < len(window):
            matrix.add(window[first], window[second], window_size - second + first + 1)
            second += 1
        first += 1
    win_start += 1

print "Co-occurence counted"
Exemplo n.º 4
0
print

# maybe should first do sent_tokenize, then word_tokenize
tokens = word_tokenize(text)

# Porter stemmer: picked as the commonly known option
stemmer = PorterStemmer()

# Stem every token, keeping the original order
normalized_tokens = [stemmer.stem(token) for token in tokens]

window_size = 5

matrix = WordMatrix()

# Slide a window of exactly window_size tokens over the stemmed text, one
# token per step; nothing is processed if the text is shorter than the window.
for start in range(len(normalized_tokens) - window_size + 1):
    window = normalized_tokens[start:start + window_size]
    # Every pair of positions (left < right) inside the current window
    for left in range(len(window)):
        for right in range(left + 1, len(window)):
            # Third argument: window_size for neighbouring positions, falling
            # by one for each additional step between them
            matrix.add(window[left], window[right],
                       window_size - right + left + 1)

# todo: tabs stuff, cool printing
Exemplo n.º 5
0
print

# maybe should first do sent_tokenize, then word_tokenize
tokens = word_tokenize(text)

# Porter stemmer, chosen because it is the best-known one
stemmer = PorterStemmer()

# Run each token through the stemmer, preserving order
normalized_tokens = [stemmer.stem(token) for token in tokens]

window_size = 5

matrix = WordMatrix()

# Advance a full window of window_size tokens across the stemmed text by one
# token at a time; a text shorter than the window gives no iterations.
window_count = len(normalized_tokens) - window_size + 1
for offset in range(window_count):
    window = normalized_tokens[offset:offset + window_size]
    # Pair each position with every later position in the same window
    for i in range(len(window)):
        for j in range(i + 1, len(window)):
            # Third argument equals window_size for adjacent positions and
            # drops by one per extra position of separation
            matrix.add(window[i], window[j], window_size - j + i + 1)