import collections
import os
import re
import string
from collections import defaultdict

import nltk
import numpy as np
# NOTE: "EnglishStemmer" below is NLTK's Snowball English stemmer; some of the
# functions further down alias PorterStemmer to the same name locally.
from nltk.stem.snowball import EnglishStemmer
from nltk.tokenize import wordpunct_tokenize


def tiny_tokenize_xml(text, stem=False, stop_words=[]):
    """Tokenize text extracted from XML: strip punctuation and non-ASCII
    characters, drop digits and stop words, and optionally stem each token."""
    # Round-tripping through ASCII drops non-ASCII characters; passing the raw
    # bytes returned by .encode() straight to re.sub() would fail under Python 3.
    text = text.encode('ascii', errors='ignore').decode('ascii')
    stemmer = EnglishStemmer() if stem else None
    return [
        stemmer.stem(token) if stem else token
        for token in wordpunct_tokenize(
            re.sub('[%s]' % re.escape(string.punctuation), ' ', text))
        if not token.isdigit() and token not in stop_words
    ]
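# A small illustrative call (the sample sentence and stop-word list are made
# up for this sketch, not taken from the original project):
#
#   tiny_tokenize_xml("the 2 cats sat on the mat", stem=True,
#                     stop_words=["the", "on"])
#   # -> ['cat', 'sat', 'mat']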
def _bow(train, test, max_features=1000):
    """Bag-of-words encoding helper function."""
    from sklearn.feature_extraction.text import CountVectorizer
    from nltk.stem import WordNetLemmatizer
    from nltk.stem.porter import PorterStemmer as EnglishStemmer
    from nltk.tokenize import wordpunct_tokenize

    x_train, y_train = train
    x_test, y_test = test

    # Normalize the training documents: strip punctuation, lowercase,
    # then stem and lemmatize every token.
    stemmer = EnglishStemmer()
    lemmatizer = WordNetLemmatizer()
    for i in range(len(x_train)):
        x_train[i] = " ".join([
            lemmatizer.lemmatize(stemmer.stem(token.lower()))
            for token in wordpunct_tokenize(
                re.sub("[%s]" % re.escape(string.punctuation), "", x_train[i]))
        ])

    # Fit the vectorizer on the training set and reuse its vocabulary for the
    # test set so both share the same feature space.
    vectorizer_train = CountVectorizer(
        strip_accents="ascii",
        stop_words="english",
        token_pattern=r"(?u)\b\w[a-z]\w+[a-z]\b",
        max_features=max_features,
        vocabulary=None,
        dtype=np.float32,
    )
    x_train = vectorizer_train.fit_transform(x_train).toarray()
    # get_feature_names_out() replaces get_feature_names(), which was removed
    # in scikit-learn 1.2.
    vocab_train = vectorizer_train.get_feature_names_out()

    vectorizer_test = CountVectorizer(
        strip_accents="ascii",
        stop_words="english",
        token_pattern=r"(?u)\b\w[a-z]\w+[a-z]\b",
        max_features=max_features,
        vocabulary=vocab_train,
        dtype=np.float32,
    )
    x_test = vectorizer_test.fit_transform(x_test).toarray()

    # Remove documents with no words.
    r = np.where(x_train.sum(axis=1) > 0.0)[0]
    x_train = x_train[r, :]
    y_train = y_train[r]
    r = np.where(x_test.sum(axis=1) > 0.0)[0]
    x_test = x_test[r, :]
    y_test = y_test[r]

    return (x_train, y_train), (x_test, y_test), vocab_train
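# A minimal usage sketch for _bow. The toy documents and labels below are
# invented for illustration (they are not part of the original project), and
# running it assumes scikit-learn plus the NLTK WordNet data are installed.
def _bow_demo():
    train = (["the cats sat on the mats quietly today",
              "several dogs were barking loudly outside"],
             np.array([0, 1]))
    test = (["another cat sat down quietly", "a dog barked outside"],
            np.array([0, 1]))
    (x_tr, y_tr), (x_te, y_te), vocab = _bow(train, test, max_features=20)
    # Shapes of the resulting document-term matrices and vocabulary size.
    print(x_tr.shape, x_te.shape, len(vocab))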
def tiny_tokenize(text, stem=False, stop_words=[]):
    """Split text on word boundaries, dropping digits and stop words, and
    stem the surviving tokens when requested."""
    words = []
    stemmer = EnglishStemmer() if stem else None
    for token in wordpunct_tokenize(
            re.sub('[%s]' % re.escape(string.punctuation), ' ', text)):
        if not token.isdigit() and token not in stop_words:
            if stem:
                try:
                    w = stemmer.stem(token)
                except Exception:
                    # Fall back to the raw token if stemming fails.
                    w = token
            else:
                w = token
            words.append(w)
    return words
def tiny_tokenize(text, stem=False, stop_words=[]):
    """Variant of tiny_tokenize that also accepts byte strings, decoding them
    as UTF-8 (ignoring errors) before tokenizing."""
    # Decode byte input so re.sub always receives text, not bytes.
    if isinstance(text, bytes):
        text = text.decode('utf-8', errors='ignore')
    words = []
    stemmer = EnglishStemmer() if stem else None
    for token in wordpunct_tokenize(
            re.sub('[%s]' % re.escape(string.punctuation), ' ', text)):
        if not token.isdigit() and token not in stop_words:
            if stem:
                try:
                    w = stemmer.stem(token)
                except Exception:
                    w = token
            else:
                w = token
            words.append(w)
    return words
def __stem_doc(doc_details):
    # Import NLTK tools.
    from nltk.tokenize import wordpunct_tokenize
    # from nltk.stem.snowball import EnglishStemmer
    from nltk.stem.porter import PorterStemmer as EnglishStemmer

    idx, doc = doc_details
    if idx % 100 == 0:
        print("Processed doc " + str(idx))
    if doc.endswith('.txt'):
        # Read as bytes and decode, ignoring characters that are not valid UTF-8.
        d = open(doc, 'rb').read().decode(encoding='UTF-8', errors='ignore')
        stemmer = EnglishStemmer()  # This method only works for English documents.
        # Stem, lowercase, substitute all punctuation, remove stopwords.
        attribute_names = [
            stemmer.stem(token.lower())
            for token in wordpunct_tokenize(
                re.sub('[%s]' % re.escape(string.punctuation), '', d))
            if token.lower() not in stopwords.get_stopwords()
        ]
        # `s` (a pickle-like serializer) and `stopwords` are modules provided
        # elsewhere in the project.
        s.dump(attribute_names, open(doc.replace(".txt", ".p"), "wb"))
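# __stem_doc takes an (index, path) pair, which suggests it is mapped over a
# list of document paths. A hedged sketch of such a call; the file names are
# invented and the worker-pool usage is an assumption, not the original driver.
def _stem_docs_demo(doc_paths=("docs/a.txt", "docs/b.txt")):
    from multiprocessing import Pool
    with Pool() as pool:
        pool.map(__stem_doc, list(enumerate(doc_paths)))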
def clean_and_tokenize(stem):
    # readingfiles(), is_number() and stop_words are provided by the
    # surrounding module.
    para = readingfiles()

    # Shuffle the paragraphs into a random order.
    index = np.arange(len(para))
    np.random.shuffle(index)
    sents = [para[i] for i in index.tolist()]

    word_freqs = collections.Counter()
    sent_lens = []
    parsed_sentences = []
    stemmer = EnglishStemmer()

    for sent in sents:
        # Keep only alphabetic characters and collapse repeated spaces.
        sent = re.sub('[^a-zA-Z]', ' ', str(sent))
        sent = re.sub(' +', ' ', sent)
        parsed_words = []
        for word in nltk.word_tokenize(
                re.sub('[%s]' % re.escape(string.punctuation), '', sent)):
            # Skip numeric tokens, stop words, non-alphabetic tokens and
            # words of three characters or fewer.
            if not is_number(word) and word.strip().lower() not in stop_words \
                    and word.isalpha() and len(word) > 2:
                if stem:
                    try:
                        w = stemmer.stem(word)
                        if w in stop_words:
                            w = ''
                        w = re.sub(' +', '', w)
                    except Exception:
                        w = word.strip().lower()
                else:
                    w = word.strip().lower()
                word_freqs[w] += 1
                parsed_words.append(w)
        # Keep only sentences that still contain more than three tokens.
        if len(parsed_words) > 3:
            sent_lens.append(len(parsed_words))
            parsed_sentences.append(" ".join(parsed_words))

    return sent_lens, parsed_sentences, word_freqs
def load_data(corpus_path, recursive=False):
    word_freq = defaultdict(lambda: 0)   # times a word appears in the whole corpus
    doc_word_freq = defaultdict(dict)    # times a word appears in each document
    # get_all_files() and load_stopwords() are helpers defined elsewhere in
    # the project.
    files = get_all_files(corpus_path, recursive)
    stemmer = EnglishStemmer()  # This method only works for English text.

    # Prefer the project's own stop-word list, falling back to NLTK's.
    try:
        stopword_path = 'patterns/english_stopwords.txt'
        cached_stop_words = load_stopwords(
            os.path.join(os.path.split(__file__)[0], stopword_path))
        print('Loaded %s' % stopword_path)
    except Exception:
        from nltk.corpus import stopwords
        cached_stop_words = stopwords.words("english")
        print('Loaded nltk.corpus.stopwords')

    for filename in files:
        try:
            # Read as bytes and decode, ignoring invalid UTF-8 sequences.
            with open(filename, 'rb') as fp:
                text = fp.read().decode(encoding='UTF-8', errors='ignore').lower()
            # Remove punctuation, stop words and digits, then stem.
            words = [stemmer.stem(token) for token in wordpunct_tokenize(
                         re.sub('[%s]' % re.escape(string.punctuation), ' ', text))
                     if not token.isdigit() and token not in cached_stop_words]
            for i in range(len(words)):
                # Doc-word frequency, keyed by "<parent dir>_<file name>".
                parent_name, child_name = os.path.split(filename)
                doc_name = os.path.split(parent_name)[-1] + '_' + child_name
                try:
                    doc_word_freq[doc_name][words[i]] += 1
                except KeyError:
                    doc_word_freq[doc_name][words[i]] = 1
                # Corpus-wide word frequency.
                word_freq[words[i]] += 1
        except Exception as e:
            raise e

    return word_freq, doc_word_freq
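# A minimal usage sketch for load_data. The corpus path is a placeholder, not
# a path from the original project, and the call assumes get_all_files() and
# load_stopwords() are available as in the function above.
def _load_data_demo(corpus_path="path/to/corpus"):
    word_freq, doc_word_freq = load_data(corpus_path, recursive=True)
    # Print the ten most frequent stems across the whole corpus.
    for stem_, count in sorted(word_freq.items(),
                               key=lambda kv: kv[1], reverse=True)[:10]:
        print(stem_, count)
    print('%d documents indexed' % len(doc_word_freq))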
def stem_acceptance_list(path):
    """
    Stem the acceptance list given by the path. This should be done before
    data preparation for that specific list.

    @param path: The path to the acceptance list.
    """
    # Rebind the module-level EnglishStemmer name to NLTK's Porter stemmer.
    global EnglishStemmer
    from nltk.stem.porter import PorterStemmer as ES
    EnglishStemmer = ES

    acceptance_lst = open(path).read().replace(" ", "").split("\n")
    stemmer = EnglishStemmer()
    acceptance_lst_stemmed = [stemmer.stem(word.lower()) for word in acceptance_lst]

    # Write the stemmed list back out, one word per line, to the location
    # given by the project's env_paths helper.
    f = open(env_paths.get_acceptance_lst_path(), 'w')
    for w in acceptance_lst_stemmed[:-1]:
        f.write(w + "\n")
    f.write(acceptance_lst_stemmed[-1])
    f.close()