Example #1
def test_tokenise_no(norwegian_language_model, text, expectedoutput):
    # arrange - not needed

    # act
    tokens = tokenise(norwegian_language_model, text)

    # assert
    assert tokens == expectedoutput
Example #2
def test_tokenise_en(english_language_model, text, expectedoutput):
    # arrange - not needed

    # act
    tokens = tokenise(english_language_model, text)

    # assert
    assert tokens == expectedoutput
Example #3
def test_retainspaces(english_language_model, text, expectedoutput):
    # arrange - not needed

    # act
    tokens = tokenise(english_language_model, text)
    raw_str = ' '.join(tokens)
    new_text = retain_spaces(raw_str)

    # assert
    assert new_text == expectedoutput
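Each test above takes text and expectedoutput as arguments, so pytest has to supply them as fixtures or parameters. The original fixtures are not shown here; one common way to provide them (a sketch with made-up cases, not the project's own test data) is pytest.mark.parametrize:

import pytest

# Hypothetical test cases; the expected token lists are illustrative only.
@pytest.mark.parametrize("text, expectedoutput", [
    ("Hello world!", ["Hello", "world", "!"]),
    ("", []),
])
def test_tokenise_en(english_language_model, text, expectedoutput):
    tokens = tokenise(english_language_model, text)
    assert tokens == expectedoutput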
Example #4
"""
Visualise the frequency distribution of the words in the text.
"""

import matplotlib.pyplot as plt
import spacy

import enlp.processing.stdtools as nlp
import enlp.understanding.distributions as dists
import enlp.visualisation.freq_distribution as viz

plt.close('all')  # important for Read the Docs builds, which otherwise crash due to memory use

###############################################################################
# Load spaCy's Norwegian language model and the example text
langmodel = spacy.load('nb_dep_ud_sm')
with open("example_data/no_den_stygge_andungen.txt", "r") as file:
    text = file.read()

###############################################################################
# Tokenise the string into a list of tokens and count their frequencies
word_list = nlp.tokenise(langmodel, text)
counts = dists.freq_dist(word_list)
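
###############################################################################
# Peek at the most frequent tokens. Judging by the slicing in the next step
# and the ['token', 'count'] columns in Example #6, freq_dist appears to
# return (token, count) pairs sorted by frequency; this check rests on that
# assumption.
print(counts[:5])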

###############################################################################
# Visualise
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
viz.dist_plot_detailed(counts[:25],
                       ax=ax1)  # Detailed distribution of top 25 tokens
viz.dist_plot(counts, log=True, ax=ax2)  # Full distribution of corpus
plt.show()
Example #5
# langmodel and text are assumed to be the Norwegian language model and
# example text loaded in Example #4; get_stopwords, NLPPipeline, tokenise and
# wordcloud_plot are assumed to be imported from enlp.

# Get a list of stopwords to be removed from the text
all_stopwords, stopwords_nb, stopwords_en = get_stopwords()
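
###############################################################################
# Quick sanity check on the three stopword lists (combined, Norwegian,
# English); the lengths printed here are purely illustrative.
print(len(all_stopwords), len(stopwords_nb), len(stopwords_en))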

###############################################################################
# Use the NLPPipeline class to create a processing workflow. The pipeline will:
# - remove punctuation
# - remove stopwords
# - stem the remaining words

# Initialise object
processed_text = NLPPipeline(langmodel, text)

# Run processing as a pipeline
processed_text.rm_punctuation().rm_stopwords(
    stopwords=all_stopwords).nltk_stem_no()
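
###############################################################################
# The chained call above works because each NLPPipeline processing method
# evidently returns the pipeline object itself. A minimal sketch of that
# fluent-interface pattern (MiniPipeline is a hypothetical stand-in, not part
# of enlp):


class MiniPipeline:
    def __init__(self, text):
        self.text = text

    def lowercase(self):
        self.text = self.text.lower()
        return self  # returning self is what makes chaining possible

    def squeeze_spaces(self):
        self.text = ' '.join(self.text.split())
        return self


print(MiniPipeline("  Hello   World  ").lowercase().squeeze_spaces().text)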

###############################################################################
# Compare the first 80 characters of the original and the processed text.

print('Original: ', text[:80], '...')
print('Processed: ', processed_text.text[:80], '...')

###############################################################################
# Compare wordclouds of the most common words in the original and processed
# texts.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
wordcloud_plot(tokenise(langmodel, text), ax=ax1)
wordcloud_plot(processed_text.tokenise().tokens, ax=ax2)
plt.tight_layout()
Example #6
# text is assumed to be loaded from file as in Example #4. The imports below
# follow the earlier examples; the location of get_stopwords and rm_stopwords
# in stdtools is an assumption.
import pandas as pd
import spacy

import enlp.understanding.distributions as dists
from enlp.processing.stdtools import tokenise, get_stopwords, rm_stopwords

print(text)

# Load spaCy's English language model
langmodel = spacy.load('en_core_web_md')


###############################################################################
# Get stopwords: the get_stopwords function returns all English and Norwegian
# stopwords

all_stopwords, stopwords_nb, stopwords_en = get_stopwords()
print(stopwords_en[:5])

###############################################################################
# Remove English stopwords

processed_text = rm_stopwords(langmodel, text, stopwords_en)
print(processed_text)

###############################################################################
# With the stopwords removed, the sentences no longer read naturally. An
# alternative to viewing the raw text output is to look at the distribution
# of the remaining words.


orig_dist = pd.DataFrame(dists.freq_dist(tokenise(langmodel, text)),
                         columns=['token', 'count'])
pr_dist = pd.DataFrame(dists.freq_dist(tokenise(langmodel, processed_text)),
                       columns=['token', 'count'])

print ("ORIGINAL - top 10 words")
print (orig_top10.head(10))
print (" ")
print ("PROCESSED - top 10 words")
print (pr_top10.head(10))