Example #1
from nltk import pos_tag, RegexpParser
from tokenize_words import word_sentence_tokenize
from chunk_counters import np_chunk_counter, vp_chunk_counter

def natural_language_parser(text):
    # Read and lowercase the source file named after the argument, closing it when done:
    with open(f"{text}.txt", encoding='utf-8') as f:
        text = f.read().lower()
    # Split the text into individual sentences and then individual words:
    word_tokenized_text = word_sentence_tokenize(text)
    # Print a sample word-tokenized sentence:
    print(word_tokenized_text[100])
    # Create a list to hold part-of-speech tagged sentences:
    pos_tagged_text = [pos_tag(word) for word in word_tokenized_text]
    # Print a sample part-of-speech tagged sentence:
    print(pos_tagged_text[100])
    # Define noun phrase chunk grammar (an optional determiner, any adjectives, then a singular noun):
    np_chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"
    # Create noun phrase RegexpParser object:
    np_chunk_parser = RegexpParser(np_chunk_grammar)
    # Define verb phrase chunk grammar (a noun phrase followed by a verb and an optional adverb):
    vp_chunk_grammar = "VP: {<DT>?<JJ>*<NN><VB.*><RB.?>?}"
    # Create verb phrase RegexpParser object:
    vp_chunk_parser = RegexpParser(vp_chunk_grammar)
    # List of noun phrase chunked sentences:
    np_chunked_text = []
    # List of verb phrase chunked sentences:
    vp_chunked_text = []
    for sentence in pos_tagged_text:
        # Chunk each sentence into noun phrases and verb phrases:
        np_chunked_text.append(np_chunk_parser.parse(sentence))
        vp_chunked_text.append(vp_chunk_parser.parse(sentence))
    # Most common chunks:
    most_common_np_chunks = np_chunk_counter(np_chunked_text)
    most_common_vp_chunks = vp_chunk_counter(vp_chunked_text)
    print(most_common_np_chunks)
    print(most_common_vp_chunks)
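
A minimal usage sketch, assuming a text file such as the_iliad.txt sits alongside the script (the file stem is an assumption, not part of the original):

# Hypothetical call; pass the file stem, not the full filename.
natural_language_parser("the_iliad")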
Example #2
from nltk import pos_tag, RegexpParser
from tokenize_words import word_sentence_tokenize
from chunk_counters import np_chunk_counter, vp_chunk_counter

def process_vader():
    # `vader` is expected to hold the raw text, defined at module level before this call:
    vader_tokenized = word_sentence_tokenize(vader)

    single_sentence_tokenized = vader_tokenized[27]
    print()
    print(f"Vader's single tokenized sentence: {single_sentence_tokenized}")

    pos_tagged_vader = list()

    for sentence in vader_tokenized:
        pos_tagged_vader.append(pos_tag(sentence))

    pos_tagged_sentence = pos_tagged_vader[27]
    print()
    print(
        f"Vader's single part-of-speech tagged sentence: {pos_tagged_sentence}"
    )

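    # NP grammar: an optional determiner, any adjectives (<JJ.?> also matches JJR/JJS), then a singular noun.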
    np_chunk_grammar = 'NP: {<DT>?<JJ.?>*<NN>}'
    np_chunk_parser = RegexpParser(np_chunk_grammar)

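    # VP grammar: a noun phrase, a verb, then any mix of adverbs, noun phrases, or prepositional phrases.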
    vp_chunk_grammar = 'VP: {<DT>?<JJ.?>*<NN><VB.?>((<RB.?>)|(<DT>?<JJ.?>*<NN>)|(<IN><DT>?<JJ.?>*<NN>))*}'
    vp_chunk_parser = RegexpParser(vp_chunk_grammar)

    np_chunked_vader = list()
    vp_chunked_vader = list()

    for sentence in pos_tagged_vader:
        np_chunked_vader.append(np_chunk_parser.parse(sentence))
        vp_chunked_vader.append(vp_chunk_parser.parse(sentence))

    top_np_chunks = np_chunk_counter(np_chunked_vader)
    top_vp_chunks = vp_chunk_counter(vp_chunked_vader)

    print()
    print("Vader's most-commonly used noun-phrases:")
    print(*top_np_chunks, sep='\n')
    print()
    print(f"Vader's most-commonly used verb-phrases:")
    print(*top_vp_chunks, sep='\n')
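
A sketch of how the module-level `vader` variable might be populated before calling the function, assuming the dialogue lives in a local vader.txt (the filename is an assumption):

# Hypothetical setup; vader.txt is an assumed filename.
with open('vader.txt', encoding='utf-8') as f:
    vader = f.read().lower()

process_vader()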
Example #3
from nltk import pos_tag, RegexpParser
from tokenize_words import word_sentence_tokenize
from chunk_counters import np_chunk_counter, vp_chunk_counter

# import text of choice here
text = open('dorian_gray.txt', encoding='utf-8').read().lower()

# sentence and word tokenize text here
word_tokenized_text = word_sentence_tokenize(text)

# store and print any word tokenized sentence here
single_word_tokenized_sentence = word_tokenized_text[115]
#print(single_word_tokenized_sentence)

# create a list to hold part-of-speech tagged sentences here
pos_tagged_text = list()

# create a for loop through each word tokenized sentence here
for word_tokenized_sentence in word_tokenized_text:
    # part-of-speech tag each sentence and append to the list of pos-tagged sentences
    pos_tagged_text.append(pos_tag(word_tokenized_sentence))

# store and print any part-of-speech tagged sentence here
single_pos_sentence = pos_tagged_text[150]
#print(single_pos_sentence)

# define noun phrase chunk grammar here
np_chunk_grammar = "NP: {<DT>?<JJ>*<NN>}"

# create noun phrase RegexpParser object here
np_chunk_parser = RegexpParser(np_chunk_grammar)
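
The two helper modules imported above (tokenize_words and chunk_counters) are not shown in any of these examples. A minimal sketch of what they might contain, assuming NLTK's stock tokenizers and a collections.Counter tally; the most_common(30) cutoff is an assumption:

# tokenize_words.py (assumed implementation)
from nltk.tokenize import sent_tokenize, word_tokenize

def word_sentence_tokenize(text):
    # Split the text into sentences, then each sentence into word tokens.
    return [word_tokenize(sentence) for sentence in sent_tokenize(text)]

# chunk_counters.py (assumed implementation)
from collections import Counter

def np_chunk_counter(chunked_sentences):
    # Tally every subtree labeled NP across the chunked sentences.
    chunks = [tuple(subtree)
              for chunked_sentence in chunked_sentences
              for subtree in chunked_sentence.subtrees(filter=lambda t: t.label() == 'NP')]
    return Counter(chunks).most_common(30)

def vp_chunk_counter(chunked_sentences):
    # Same tally for subtrees labeled VP.
    chunks = [tuple(subtree)
              for chunked_sentence in chunked_sentences
              for subtree in chunked_sentence.subtrees(filter=lambda t: t.label() == 'VP')]
    return Counter(chunks).most_common(30)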
Example #4
with open(r"data/male_text.txt", "w+") as maletext:
    maletext.write(male_text)

with open(r"data/female_text.txt", "w+") as femaletext:
    femaletext.write(female_text)

with open(r"data/genderless_text.txt", "w+") as genderlesstext:
    genderlesstext.write(genderless_text)

# with open(r"data/combined_text.txt") as combinedtext:
#     for line in combinedtext:
#         text += line + '\n'

# word_sentence_tokenize comes from tokenize_words.py and returns a list of word-tokenized sentences.
word_tokenized_male_text = word_sentence_tokenize(male_text)
word_tokenized_female_text = word_sentence_tokenize(female_text)
word_tokenized_genderless_text = word_sentence_tokenize(genderless_text)
print("Word Tokenization Done" + '\n')

pos_tagged_male_text = []
pos_tagged_female_text = []
pos_tagged_genderless_text = []

i = 0

with open(r"data/male_tagged.txt", "w+") as maletxt:
    for sentence in word_tokenized_male_text:
        tagged = pos_tag(sentence)
        pos_tagged_male_text.append(tagged)