def test_get_stopwords_norwegian():
    """The second value returned by get_stopwords() must equal list(STOP_WORDS_NB)."""
    # arrange - reference list built directly from the constant
    expected_nb = list(STOP_WORDS_NB)

    # act - only the second returned value is checked here
    _, actual_nb, _ = get_stopwords()

    # assert
    assert expected_nb == actual_nb
def test_get_stopwords_english():
    """The third value returned by get_stopwords() must equal list(STOP_WORDS_EN)."""
    # arrange - reference list built directly from the constant
    expected_en = list(STOP_WORDS_EN)

    # act - only the third returned value is checked here
    _, _, actual_en = get_stopwords()

    # assert
    assert expected_en == actual_en
def test_get_stopwords_english():
    """The third value returned by get_stopwords() must equal spaCy's English STOP_WORDS as a list."""
    # arrange - take the stopwords straight from spaCy, independent of the function
    from spacy.lang.en.stop_words import STOP_WORDS as spacy_en_stops

    expected_en = list(spacy_en_stops)

    # act - only the third returned value is checked here
    _, _, actual_en = get_stopwords()

    # assert
    assert expected_en == actual_en
def test_get_stopwords_full():
    """The first value returned by get_stopwords() must equal the English list followed by the Norwegian list.

    The reference is list(STOP_WORDS_EN) + list(STOP_WORDS_NB) with every
    entry passed through str().
    """
    # arrange - concatenate English then Norwegian, stringifying each entry
    combined = list(STOP_WORDS_EN) + list(STOP_WORDS_NB)
    expected_all = list(map(str, combined))

    # act - only the first returned value is checked here
    actual_all, _, _ = get_stopwords()

    # assert
    assert expected_all == actual_all
def test_get_stopwords_full():
    """The first value returned by get_stopwords() must equal spaCy's English then Norwegian stopwords.

    The reference is built from the two spaCy STOP_WORDS collections, each
    converted to a list, concatenated (English first) and passed through str().
    """
    # arrange - import both spaCy collections under distinct aliases so that
    # neither import rebinds the other's name
    from spacy.lang.en.stop_words import STOP_WORDS as spacy_en_stops
    from spacy.lang.nb.stop_words import STOP_WORDS as spacy_nb_stops

    combined = list(spacy_en_stops) + list(spacy_nb_stops)
    expected_all = list(map(str, combined))

    # act - only the first returned value is checked here
    actual_all, _, _ = get_stopwords()

    # assert
    assert expected_all == actual_all
def test_keyphrase_extraction():
    """The first keyphrase extracted from the example text must be "word embeddings".

    The text is read from the example data file and passed to
    kw.keyphrase_list together with the third value returned by
    stdt.get_stopwords(), with scores disabled.
    """
    # arrange - the path is relative to the current working directory
    example_path = './examples/example_data/en_historynlp.txt'
    with open(example_path, "r") as handle:
        text = handle.read()
    _, _, english_stops = stdt.get_stopwords()

    # act
    keyphrases = kw.keyphrase_list(
        text,
        stopwords=english_stops,
        with_scores=False,
    )

    # assert
    top_phrase = keyphrases[0]
    assert top_phrase == "word embeddings"
NOTE: This is an example to show how to run the procedure however due to the small dataset used the results are likely to be non-sensical. """ import enlp.understanding.topics as tp import enlp.processing.stdtools as stdt import spacy ############################################################################### # Load example text and get stopwords with open("example_data/en_nlptexts.txt", "r") as file: text=file.read() all_stopwords, stopwords_nb, stopwords_en = stdt.get_stopwords() ############################################################################### # Preprocess text - for this example we have a very small corpus to allow the documentation # to build therefore we will split the single document into paragraphs for processing to # imitate multiple document input and we will also remove stopwords and punctuation as the text is too small. # Split text into paragraphs to imitate documents docs = text.split('\n\n') # Remove \n and replace with space docs = [d.replace('\n',' ') for d in docs] # Because example text is small, remove stopwords and punctuation en = spacy.load('en_core_web_md') stopwords, stops_nb, stops_en = stdt.get_stopwords()
"""
Removing Stopwords
==================
Fetch the stopword lists returned by ``get_stopwords`` and print the first
five entries of the third (English) list.
"""
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as pltgs

from enlp.processing.stdtools import get_stopwords

# Close every open figure: very important for Read the Docs, to avoid it
# crashing due to memory.
plt.close('all')

###############################################################################
# Get the function's idea of the stopwords; only the third returned value is
# used below.

_, _, english_stopwords = get_stopwords()
print(english_stopwords[:5])
"""
Keyphrase Extraction (English)
==============================
Extract keyphrases from an example text using a Python implementation of the
Rapid Automatic Keyword Extraction algorithm.
"""
import pandas as pd

from enlp.processing.stdtools import get_stopwords
from enlp.understanding.keywords import keyphrase_list

###############################################################################
# Read the example text and fetch the stopword lists; only the third returned
# value is passed on to the extraction step.

with open("example_data/en_historynlp.txt", "r") as source:
    text = source.read()

_, _, english_stopwords = get_stopwords()

###############################################################################
# Extract keyphrases and print the first ten rows as a score/keyphrase table

keyphrases = keyphrase_list(text, stopwords=english_stopwords)
keyphrase_table = pd.DataFrame(keyphrases, columns=['score', 'keyphrase'])
print(keyphrase_table.head(10))
def all_stopwords():
    """Return the first of the three values produced by get_stopwords()."""
    # Unpack all three values so an unexpected result shape still raises here.
    combined, _nb, _en = get_stopwords()
    return combined
def norwegian_stopwords():
    """Return the second of the three values produced by get_stopwords()."""
    # Unpack all three values so an unexpected result shape still raises here.
    _combined, norwegian, _en = get_stopwords()
    return norwegian
def english_stopwords():
    """Return the third of the three values produced by get_stopwords()."""
    # Unpack all three values so an unexpected result shape still raises here.
    _combined, _nb, english = get_stopwords()
    return english