def test_tokenise_no(norwegian_language_model, text, expectedoutput):
    # arrange - not needed
    # act
    tokens = tokenise(norwegian_language_model, text)
    # assert
    assert tokens == expectedoutput
def test_tokenise_en(english_language_model, text, expectedoutput):
    # arrange - not needed
    # act
    tokens = tokenise(english_language_model, text)
    # assert
    assert tokens == expectedoutput
def test_retainspaces(english_language_model, text, expectedoutput):
    # arrange - not needed
    # act
    tokens = tokenise(english_language_model, text)
    raw_str = ' '.join(tokens)
    new_text = retain_spaces(raw_str)
    # assert
    assert new_text == expectedoutput
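# The fixtures used above (``norwegian_language_model``, ``english_language_model``,
# ``text``, ``expectedoutput``) are defined elsewhere in the test suite. A minimal
# sketch of what they could look like in a conftest.py follows; the model name and
# the example values are illustrative assumptions, not the project's actual test data.

import pytest
import spacy


@pytest.fixture(scope="session")
def english_language_model():
    # load the model once per session so repeated tests stay fast
    return spacy.load('en_core_web_md')


@pytest.fixture
def text():
    return "Hello, world!"  # hypothetical example input


@pytest.fixture
def expectedoutput():
    return ['Hello', ',', 'world', '!']  # hypothetical expected tokens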
distribution of the words in the text.
"""

import spacy
import matplotlib.pyplot as plt

import enlp.processing.stdtools as nlp
import enlp.understanding.distributions as dists
import enlp.visualisation.freq_distribution as viz

plt.close('all')  # important on Read the Docs to avoid crashing due to memory usage

###############################################################################
# Load spaCy's Norwegian language model and the example text

langmodel = spacy.load('nb_dep_ud_sm')

with open("example_data/no_den_stygge_andungen.txt", "r") as file:
    text = file.read()

###############################################################################
# Turn the string into a list of tokens and count them

word_list = nlp.tokenise(langmodel, text)
counts = dists.freq_dist(word_list)

###############################################################################
# Visualise

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
viz.dist_plot_detailed(counts[:25], ax=ax1)  # detailed distribution of top 25 tokens
viz.dist_plot(counts, log=True, ax=ax2)      # full distribution of the corpus
plt.show()
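###############################################################################
# For intuition, ``freq_dist`` behaves like a sorted token counter: it maps the
# token list to (token, count) pairs ordered from most to least frequent. The
# equivalent below is a sketch inferred from how ``counts`` is used above, not
# the library's actual implementation.

from collections import Counter


def freq_dist_sketch(tokens):
    # count each token, then return pairs sorted by descending frequency
    return Counter(tokens).most_common()


# e.g. freq_dist_sketch(['a', 'b', 'a']) -> [('a', 2), ('b', 1)]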
# Get a list of stopwords to be removed from the text

# Get stopwords
all_stopwords, stopwords_nb, stopwords_en = get_stopwords()

###############################################################################
# Use the NLPPipeline class to create a processing workflow. The pipeline will:
#
#   - remove punctuation
#   - remove stopwords
#   - stem the remaining words

# Initialise object
processed_text = NLPPipeline(langmodel, text)

# Run processing as a pipeline
processed_text.rm_punctuation().rm_stopwords(stopwords=all_stopwords).nltk_stem_no()

###############################################################################
# Compare the first 80 characters of the original and processed text strings.

print('Original:  ', text[:80], '...')
print('Processed: ', processed_text.text[:80], '...')

###############################################################################
# Wordcloud comparison of the most common words

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
wordcloud_plot(tokenise(langmodel, text), ax=ax1)
wordcloud_plot(processed_text.tokenise().tokens, ax=ax2)
plt.tight_layout()
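###############################################################################
# The chained calls above work because each processing method returns the
# pipeline object itself. The class below is a minimal sketch of that pattern,
# not the actual NLPPipeline implementation; its filtering logic is purely
# illustrative.


class ChainedPipelineSketch:

    def __init__(self, langmodel, text):
        self.langmodel = langmodel
        self.text = text

    def rm_punctuation(self):
        # drop non-alphanumeric characters (illustrative, not enlp's logic)
        self.text = ''.join(c for c in self.text if c.isalnum() or c.isspace())
        return self  # returning self is what enables method chaining

    def rm_stopwords(self, stopwords):
        # drop whole words found in the stopword list
        self.text = ' '.join(w for w in self.text.split()
                             if w.lower() not in stopwords)
        return self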
print(text)

# Load spaCy language model
langmodel = spacy.load('en_core_web_md')

###############################################################################
# Get stopwords: the get_stopwords function returns all English and Norwegian
# stopwords

all_stopwords, stopwords_nb, stopwords_en = get_stopwords()
print(stopwords_en[:5])

###############################################################################
# Remove English stopwords

processed_text = rm_stopwords(langmodel, text, stopwords_en)
print(processed_text)

###############################################################################
# Now that the stopwords have been removed, the sentences no longer make much
# sense. Instead of viewing the text output, we can therefore look at the
# distribution of the remaining words.

orig_top10 = pd.DataFrame(dists.freq_dist(tokenise(langmodel, text)),
                          columns=['token', 'count'])
pr_top10 = pd.DataFrame(dists.freq_dist(tokenise(langmodel, processed_text)),
                        columns=['token', 'count'])

print("ORIGINAL - top 10 words")
print(orig_top10.head(10))
print(" ")
print("PROCESSED - top 10 words")
print(pr_top10.head(10))
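###############################################################################
# Conceptually, ``rm_stopwords`` can be thought of as: tokenise the text, drop
# tokens found in the stopword list, and rejoin. The sketch below expresses
# that idea using the ``tokenise`` and ``retain_spaces`` helpers seen earlier;
# it mirrors the behaviour and is not necessarily the library's actual
# implementation.


def rm_stopwords_sketch(langmodel, text, stopwords):
    tokens = tokenise(langmodel, text)
    # keep only tokens that are not stopwords (case-insensitive)
    kept = [t for t in tokens if t.lower() not in stopwords]
    # retain_spaces fixes the spacing around punctuation after the join
    return retain_spaces(' '.join(kept))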