def lemmatizer(report):
    """Lemmatize every token in *report* using its detected part of speech.

    Args:
        report: iterable of word tokens (strings).

    Returns:
        list[str]: the lemmatized tokens, in input order.
    """
    # Distinct local name: the original shadowed the function's own name
    # (`lemmatizer = nltk.WordNetLemmatizer()`), which confuses readers and
    # breaks any later reference to the function inside its own body.
    wordnet_lemmatizer = nltk.WordNetLemmatizer()
    return [
        wordnet_lemmatizer.lemmatize(token, get_part_of_speech(token))
        for token in report
    ]
def count_words(text):
    """Count normalized words in *text* as a bag of words.

    Pipeline: strip punctuation, lowercase, tokenize, drop English
    stopwords, lemmatize each remaining token by its part of speech,
    then tally frequencies.

    Args:
        text: raw input string.

    Returns:
        collections.Counter: lemmatized word -> occurrence count.
    """
    # Raw string: '\W' in a plain string literal is an invalid escape
    # sequence (DeprecationWarning, SyntaxWarning on newer Pythons).
    cleaned = re.sub(r'\W+', ' ', text).lower()
    tokenized = word_tokenize(cleaned)
    # Set membership is O(1); the original tested against a list, O(n)
    # per token.
    stop_words = set(stopwords.words('english'))
    filtered = [word for word in tokenized if word not in stop_words]
    normalizer = WordNetLemmatizer()
    normalized = [
        normalizer.lemmatize(token, get_part_of_speech(token))
        for token in filtered
    ]
    bag_of_looking_glass_words = Counter(normalized)
    return bag_of_looking_glass_words
# NOTE(review): this chunk used `re.sub` and `word_tokenize` without
# importing them — added the imports so the script actually runs.
import re

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

###### grabbing a part of speech function:
from part_of_speech import get_part_of_speech

text = "So many squids are jumping out of suitcases these days that you can barely go anywhere without seeing one burst forth from a tightly packed valise. I went to the dentist the other day, and sure enough I saw an angry one jump out of my dentist's bag within minutes of arriving. She hardly even noticed."

# Raw string: '\W' in a plain string literal is an invalid escape sequence.
cleaned = re.sub(r'\W+', ' ', text)
tokenized = word_tokenize(cleaned)

# Stemming: crude suffix chopping, no dictionary lookup.
stemmer = PorterStemmer()
stemmed = [stemmer.stem(token) for token in tokenized]

# Lemmatization: dictionary-based, guided by each token's part of speech.
lemmatizer = WordNetLemmatizer()
lemmatized = [
    lemmatizer.lemmatize(token, get_part_of_speech(token))
    for token in tokenized
]

print("Stemmed text:")
print(stemmed)
print("\nLemmatized text:")
print(lemmatized)

#####
print('\n##################################################\n')
#####

# NOTE(review): the following prose was an unterminated triple-quoted
# string in the original (syntax error as seen here) — converted to
# comments, which preserves runtime behavior.
#
# It may be helpful to know how the words relate to each other and the
# underlying syntax (grammar). Parsing is a stage of NLP concerned with
# segmenting text based on syntax.
#
# Part-of-speech tagging (POS tagging) identifies parts of speech (verbs,
# nouns, adjectives, etc.). NLTK can do it faster (and maybe more
# accurately) than your grammar teacher!
# regex for removing punctuation!
import re
# nltk preprocessing magic
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# grabbing a part of speech function:
from part_of_speech import get_part_of_speech

text = "So many squids are jumping out of suitcases these days that you can barely go anywhere without seeing one burst forth from a tightly packed valise. I went to the dentist the other day, and sure enough I saw an angry one jump out of my dentist's bag within minutes of arriving. She hardly even noticed."

# Raw string: '\W' in a plain string literal is an invalid escape sequence.
cleaned = re.sub(r'\W+', ' ', text)
tokenized = word_tokenize(cleaned)

# Stemming: rule-based suffix stripping (no vocabulary lookup).
stemmer = PorterStemmer()
stemmed = [stemmer.stem(token) for token in tokenized]

# Lemmatization: WordNet lookup, guided by each token's part of speech.
lemmatizer = WordNetLemmatizer()
lemmatized = [
    lemmatizer.lemmatize(token, get_part_of_speech(token))
    for token in tokenized
]

print("Stemmed text:")
print(stemmed)
print("\nLemmatized text:")
print(lemmatized)
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from part_of_speech import get_part_of_speech

lemmatizer = WordNetLemmatizer()

populated_island = 'Indonesia was founded in 1945. It contains the most populated island in the world, Java, with over 140 million people.'
tokenized_string = word_tokenize(populated_island)

# Lemmatize each token using its detected part of speech.
lemmatized_pos = [
    lemmatizer.lemmatize(token, get_part_of_speech(token))
    for token in tokenized_string
]

# Catch only NameError (the variable being undefined): the original bare
# `except:` would also swallow KeyboardInterrupt and SystemExit, hiding
# unrelated failures behind the hint message.
try:
    print(f'The lemmatized words are: {lemmatized_pos}')
except NameError:
    print('Expected a variable called `lemmatized_pos`')
# importing regex and nltk
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# importing Counter to get word counts for bag of words
from collections import Counter
# importing part-of-speech function for lemmatization
from part_of_speech import get_part_of_speech

# Context manager guarantees the file handle is closed; the original
# `open(...).read()` leaked it.
with open("iliad.txt", encoding='utf-8') as source_file:
    text = source_file.read().lower()

# Raw string: '\W' in a plain string literal is an invalid escape sequence.
cleaned = re.sub(r'\W+', ' ', text).lower()
tokenized = word_tokenize(cleaned)

# Set membership is O(1); the original tested against a list, O(n) per token.
stop_words = set(stopwords.words('english'))
filtered = [word for word in tokenized if word not in stop_words]

normalizer = WordNetLemmatizer()
normalized = [
    normalizer.lemmatize(token, get_part_of_speech(token))
    for token in filtered
]

# Define bag_of_looking_glass_words & print:
# NOTE(review): the name says "looking glass" but the input is iliad.txt —
# presumably copied from a Through-the-Looking-Glass exercise; confirm intent.
bag_of_looking_glass_words = Counter(normalized)
print(bag_of_looking_glass_words)