import re
import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import SnowballStemmer

# (These steps run for each `document` inside the per-document cleaning loop
# started earlier; `stopset`, `df`, `X`, and `doc_length` are defined there.)

# Remove all single-character words
document = re.sub(r'\s[a-zA-Z]\s', ' ', document)

# Substitute multiple spaces with a single space
document = re.sub(r'\s+', ' ', document, flags=re.I)

# Tokenize
document = WordPunctTokenizer().tokenize(document)

# Remove stopwords
document = [word for word in document if word not in stopset]

# Stem each token
stemmer = SnowballStemmer('english')
document = [stemmer.stem(t) for t in document]
doc_length.append(len(document))
document = ' '.join(document)

# Remove single characters that may have been created by tokenization
document = re.sub(r'\s[a-zA-Z]\s', ' ', document)

# Normalize some words of interest
document = document.replace('bp', 'bloodpressure')
document = document.replace('blood pressure', 'bloodpressure')
document = document.replace('ordered', 'order')

# Substitute multiple spaces with a single space
document = re.sub(r'\s+', ' ', document, flags=re.I)

X.append(document)

# After the loop: store the cleaned text back in the dataframe
df['incident'] = X

# Most common features after the stemming pre-processing
tokens = df.incident.str.cat(sep=' ')
tokens = WordPunctTokenizer().tokenize(tokens)  # shows there are 1,297,146 words in this corpus

# Count how many unique words there are
unique_words = nltk.FreqDist(tokens)  # shows 21,116 unique words
top_words = unique_words.most_common(50)

# Plotting the most common words
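# The plotting code itself is not shown in the original; below is a minimal
# sketch using matplotlib (an assumption - the original may have used a
# different library or styling) to chart the 50 most common words:
import matplotlib.pyplot as plt

words, counts = zip(*top_words)
plt.figure(figsize=(12, 6))
plt.bar(words, counts)
plt.xticks(rotation=90)
plt.xlabel('Word')
plt.ylabel('Frequency')
plt.title('50 most common words after pre-processing')
plt.tight_layout()
plt.show()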