from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
# Build the stop word set once; calling stopwords.words('english') per token is slow.
stop_words = set(stopwords.words('english'))

# Replace stop words with a "*" placeholder instead of dropping them,
# so token positions (and therefore window distances) are preserved.
# "tokens" is assumed to come from word_tokenize(text), as in the second snippet below.
tokens_filtered = []
for token in tokens:
    if token in stop_words:
        tokens_filtered += ["*"]
    else:
        tokens_filtered += [token]

# stemming
#normalized_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]
normalized_tokens = [stemmer.stem(token) for token in tokens_filtered]
print("Token set filtered and stemmed:", normalized_tokens)

# Slide a window over the token stream and count every pair inside it,
# weighted by window_size - distance + 1, so adjacent tokens score highest
# and the farthest pair in a window scores 2.
window_size = 10
matrix = WordMatrix()
win_start = 0
while win_start + window_size <= len(normalized_tokens):
    window = normalized_tokens[win_start:win_start + window_size]
    first = 0
    while first < len(window):
        second = first + 1
        while second < len(window):
            matrix.add(window[first], window[second], window_size - second + first + 1)
            second += 1
        first += 1
    win_start += 1
print("Co-occurrence counted")
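# WordMatrix itself is not shown in this section. Below is a minimal sketch
# of what both snippets assume: an accumulator whose add(first, second, weight)
# sums weights per word pair. The name and call signature come from the code
# above; the internals (including symmetric storage) are an assumption.
from collections import defaultdict

class WordMatrix:
    def __init__(self):
        # counts[w1][w2] holds the accumulated co-occurrence weight
        self.counts = defaultdict(lambda: defaultdict(int))

    def add(self, first, second, weight):
        # Stored symmetrically here so lookups are order-independent;
        # the original class may instead keep directed counts.
        self.counts[first][second] += weight
        self.counts[second][first] += weight

    def get(self, first, second):
        return self.counts[first][second]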
print()
# maybe we should first do sent_tokenize, then word_tokenize
# "text" is the raw input string, assumed to be defined earlier.
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

tokens = word_tokenize(text)

normalized_tokens = []
# Porter: I chose the stemmer everybody knows.
stemmer = PorterStemmer()
# tokenization and stemming
for token in tokens:
    normalized_tokens += [stemmer.stem(token)]

# Same sliding-window counting as above, with a smaller window.
window_size = 5
matrix = WordMatrix()
win_start = 0
while win_start + window_size <= len(normalized_tokens):
    window = normalized_tokens[win_start:win_start + window_size]
    first = 0
    while first < len(window):
        second = first + 1
        while second < len(window):
            matrix.add(window[first], window[second], window_size - second + first + 1)
            second += 1
        first += 1
    win_start += 1
# todo: tab alignment, nicer printing
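# Minimal smoke test of the window/weight logic, using the WordMatrix sketch
# above. The sentence is made up; nltk plus its 'punkt' tokenizer data must be
# installed for word_tokenize to work. "quick" and "fox" sit two positions
# apart and fall together into two length-5 windows; each window adds
# weight 5 - 2 + 1 = 4, so the expected total is 8.
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

text = "the quick brown fox jumps over the lazy dog"
stemmer = PorterStemmer()
toks = [stemmer.stem(t) for t in word_tokenize(text)]

window_size = 5
matrix = WordMatrix()
for win_start in range(len(toks) - window_size + 1):
    window = toks[win_start:win_start + window_size]
    for first in range(len(window)):
        for second in range(first + 1, len(window)):
            matrix.add(window[first], window[second], window_size - second + first + 1)

print("weight(quick, fox):", matrix.get("quick", "fox"))  # expect 8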