def natural_language_parser(text):
    """Run the full chunking pipeline over the corpus file named ``text``.

    Reads ``{text}.txt``, sentence/word-tokenizes it, POS-tags every
    sentence, chunks noun phrases and verb phrases with regex grammars,
    and prints the most common chunks of each kind.

    Relies on the module-level helpers ``word_sentence_tokenize``,
    ``pos_tag``, ``RegexpParser``, ``np_chunk_counter`` and
    ``vp_chunk_counter``.
    """
    # Fix: the original called open() without closing the handle — a
    # resource leak. `with` guarantees the file is closed.
    with open(f"{text}.txt", encoding='utf-8') as corpus:
        text = corpus.read().lower()

    # Split the text into individual sentences and then individual words:
    word_tokenized_text = word_sentence_tokenize(text)
    # Print any tokenized word sentence:
    print(word_tokenized_text[100])

    # Create a list to hold part-of-speech tagged sentences:
    pos_tagged_text = [pos_tag(word) for word in word_tokenized_text]
    # Print any part-of-speech tagged sentence:
    print(pos_tagged_text[100])

    # Define noun phrase chunk grammar (optional determiner, any number
    # of adjectives, then a noun):
    np_chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"
    # Create noun phrase RegexpParser object:
    np_chunk_parser = RegexpParser(np_chunk_grammar)

    # Define verb phrase chunk grammar (an NP followed by a verb and an
    # optional adverb):
    vp_chunk_grammar = "VP: {<DT>?<JJ>*<NN><VB.*><RB.?>?}"
    # Create verb phrase RegexpParser object:
    vp_chunk_parser = RegexpParser(vp_chunk_grammar)

    # Chunk every tagged sentence with both parsers in a single pass:
    np_chunked_text = []
    vp_chunked_text = []
    for sentence in pos_tagged_text:
        np_chunked_text.append(np_chunk_parser.parse(sentence))
        vp_chunked_text.append(vp_chunk_parser.parse(sentence))

    # Most common chunks:
    most_common_np_chunks = np_chunk_counter(np_chunked_text)
    most_common_vp_chunks = vp_chunk_counter(vp_chunked_text)
    print(most_common_np_chunks)
    print(most_common_vp_chunks)
def process_vader():
    """Tokenize, POS-tag and chunk the module-level ``vader`` text, then
    print a sample sentence at each stage and the most frequently used
    noun- and verb-phrases."""
    # Stage 1: sentence/word tokenization, with sentence 27 as a sample.
    vader_tokenized = word_sentence_tokenize(vader)
    single_sentence_tokenized = vader_tokenized[27]
    print()
    print(f"Vader's single tokenized sentence: {single_sentence_tokenized}")

    # Stage 2: POS-tag every tokenized sentence.
    pos_tagged_vader = [pos_tag(tokens) for tokens in vader_tokenized]
    pos_tagged_sentence = pos_tagged_vader[27]
    print()
    print(
        f"Vader's single part-of-speech tagged sentence: {pos_tagged_sentence}"
    )

    # Stage 3: build the two chunkers straight from their grammars.
    np_chunk_parser = RegexpParser('NP: {<DT>?<JJ.?>*<NN>}')
    vp_chunk_parser = RegexpParser(
        'VP: {<DT>?<JJ.?>*<NN><VB.?>((<RB.?>)|(<DT>?<JJ.?>*<NN>)|(<IN><DT>?<JJ.?>*<NN>))*}'
    )

    # Stage 4: parse every tagged sentence with each chunker.
    np_chunked_vader = [np_chunk_parser.parse(tagged) for tagged in pos_tagged_vader]
    vp_chunked_vader = [vp_chunk_parser.parse(tagged) for tagged in pos_tagged_vader]

    # Stage 5: tally and report the most common chunks.
    top_np_chunks = np_chunk_counter(np_chunked_vader)
    top_vp_chunks = vp_chunk_counter(vp_chunked_vader)
    print()
    print("Vader's most-commonly used noun-phrases:")
    print(*top_np_chunks, sep='\n')
    print()
    print(f"Vader's most-commonly used verb-phrases:")
    print(*top_vp_chunks, sep='\n')
from nltk import pos_tag, RegexpParser
from tokenize_words import word_sentence_tokenize
from chunk_counters import np_chunk_counter, vp_chunk_counter

# Import text of choice here.
# Fix: the original open() never closed its handle — use `with` so the
# file is released once read.
with open('dorian_gray.txt', encoding='utf-8') as book:
    text = book.read().lower()

# sentence and word tokenize text here
word_tokenized_text = word_sentence_tokenize(text)

# store and print any word tokenized sentence here
single_word_tokenized_sentence = word_tokenized_text[115]
#print(single_word_tokenized_sentence)

# part-of-speech tag each sentence and collect the tagged sentences
# (comprehension replaces the original append loop; same result)
pos_tagged_text = [pos_tag(word_tokenized_sentence)
                   for word_tokenized_sentence in word_tokenized_text]

# store and print any part-of-speech tagged sentence here
single_pos_sentence = pos_tagged_text[150]
#print(single_pos_sentence)

# define noun phrase chunk grammar here (optional determiner, any number
# of adjectives, then a noun)
np_chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"

# create noun phrase RegexpParser object here
np_chunk_parser = RegexpParser(np_chunk_grammar)
# Persist the three gender-partitioned corpora (built earlier in the file —
# `male_text`, `female_text`, `genderless_text` are not defined in this chunk)
# so later stages can re-read them from disk.
with open(r"data/male_text.txt", "w+") as maletext:
    maletext.write(male_text)
with open(r"data/female_text.txt", "w+") as femaletext:
    femaletext.write(female_text)
with open(r"data/genderless_text.txt", "w+") as genderlesstext:
    genderlesstext.write(genderless_text)
# with open(r"data/combined_text.txt") as combinedtext:
#     for line in combinedtext:
#         text += line + '\n'

# Sentence-then-word tokenize each corpus.
word_tokenized_male_text = word_sentence_tokenize(
    male_text
)  #this function is from tokenize_words.py, used to tokenize words.
word_tokenized_female_text = word_sentence_tokenize(female_text)
word_tokenized_genderless_text = word_sentence_tokenize(genderless_text)
print("Word Tokenization Done" + '\n')

# Accumulators for the POS-tagged sentences of each corpus.
pos_tagged_male_text = []
pos_tagged_female_text = []
pos_tagged_genderless_text = []
# NOTE(review): `i` is unused in this chunk — presumably a progress counter
# used further down the file; confirm before removing.
i = 0
# POS-tag the male corpus.
# NOTE(review): `maletxt` is opened for writing but nothing is written to it
# in the visible code — the loop body likely continues past this chunk.
with open(r"data/male_tagged.txt", "w+") as maletxt:
    for sentence in word_tokenized_male_text:
        tagged = pos_tag(sentence)
        pos_tagged_male_text.append(tagged)