예제 #1
0
def test_get_stopwords_norwegian():
    """Norwegian list returned by get_stopwords matches spaCy's directly."""
    # arrange: take the reference Norwegian stopwords straight from spaCy
    expected_nb = list(STOP_WORDS_NB)

    # act: ask the helper for its view of the stopword lists
    _, actual_nb, _ = get_stopwords()

    # assert: both lists agree element-for-element
    assert expected_nb == actual_nb
예제 #2
0
def test_get_stopwords_english():
    """English list returned by get_stopwords matches spaCy's directly."""
    # arrange: take the reference English stopwords straight from spaCy
    expected_en = list(STOP_WORDS_EN)

    # act: ask the helper for its view of the stopword lists
    _, _, actual_en = get_stopwords()

    # assert: both lists agree element-for-element
    assert expected_en == actual_en
예제 #3
0
def test_get_stopwords_english():
    """English list from get_stopwords equals spaCy's STOP_WORDS."""
    # arrange: import the reference list directly from spaCy's English data
    from spacy.lang.en.stop_words import STOP_WORDS
    expected_en = list(STOP_WORDS)

    # act: ask the helper for its view of the stopword lists
    _, _, actual_en = get_stopwords()

    # assert: both lists agree element-for-element
    assert expected_en == actual_en
예제 #4
0
def test_get_stopwords_full():
    """Combined list from get_stopwords is English followed by Norwegian."""
    # arrange: build the expected combined list ourselves, stringifying
    # each entry exactly as the helper is expected to do
    combined = list(STOP_WORDS_EN) + list(STOP_WORDS_NB)
    expected_all = [str(word) for word in combined]

    # act: ask the helper for its combined stopword list
    actual_all, _, _ = get_stopwords()

    # assert: both lists agree element-for-element
    assert expected_all == actual_all
예제 #5
0
def test_get_stopwords_full():
    """Combined list from get_stopwords equals English + Norwegian spaCy lists."""
    # arrange: fetch both reference lists from spaCy, aliased so the two
    # STOP_WORDS names do not shadow one another
    from spacy.lang.en.stop_words import STOP_WORDS as EN_STOPS
    from spacy.lang.nb.stop_words import STOP_WORDS as NB_STOPS

    combined = list(EN_STOPS) + list(NB_STOPS)
    expected_all = [str(word) for word in combined]

    # act: ask the helper for its combined stopword list
    actual_all, _, _ = get_stopwords()

    # assert: both lists agree element-for-element
    assert expected_all == actual_all
예제 #6
0
def test_keyphrase_extraction():
    """Top-ranked keyphrase of the sample English text is 'word embeddings'."""
    # arrange: read the sample document and fetch the English stopwords
    with open('./examples/example_data/en_historynlp.txt', "r") as fh:
        sample_text = fh.read()

    _, _, stops_en = stdt.get_stopwords()

    # act: extract the ranked keyphrases, dropping the scores
    phrases = kw.keyphrase_list(
        sample_text,
        stopwords=stops_en,
        with_scores=False,
    )

    # assert: the highest-ranked phrase is the expected one
    assert phrases[0] == "word embeddings"
예제 #7
0
NOTE: This is an example to show how to run the procedure however due to the small dataset used
the results are likely to be non-sensical.
"""

import enlp.understanding.topics as tp
import enlp.processing.stdtools as stdt
import spacy

###############################################################################
# Load the example text and fetch the stopword lists

# Read the whole example document into a single string
with open("example_data/en_nlptexts.txt", "r") as file:
    text=file.read()

# get_stopwords returns three lists: combined, Norwegian-only, English-only
all_stopwords, stopwords_nb, stopwords_en = stdt.get_stopwords()


###############################################################################
# Preprocess text - for this example we have a very small corpus to allow the
# documentation to build, therefore we split the single document into
# paragraphs to imitate multiple-document input, and we also remove stopwords
# and punctuation because the text is too small.

# Split text on blank lines: each paragraph stands in for one "document"
docs = text.split('\n\n')
# Replace remaining newlines with spaces so each document is one line of text
docs = [d.replace('\n',' ') for d in docs]

# Because the example text is small, remove stopwords and punctuation.
# Load spaCy's medium English model for tokenisation/processing.
en = spacy.load('en_core_web_md')
stopwords, stops_nb, stops_en = stdt.get_stopwords()
예제 #8
0
"""
Removing Stopwords
==================
XXX
"""

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as pltgs

from enlp.processing.stdtools import get_stopwords

plt.close(
    'all'
)  # very important for read the docs to avoid it crashing due to memory

###############################################################################
# Some text

# act - get functions idea of stopwords
stopwords_func, stopwords_nb_func, stopwords_en_func = get_stopwords()

print(stopwords_en_func[:5])
예제 #9
0
"""
Keyphrase Extraction (English)
==============================
The following example uses a python implementation of the Rapid Automatic Keyword Extraction algorithm to extract
keyphrases from a text.
"""

import pandas as pd
from enlp.processing.stdtools import get_stopwords
from enlp.understanding.keywords import keyphrase_list

###############################################################################
# Load example text and get stopwords

with open("example_data/en_historynlp.txt", "r") as file:
    text = file.read()

all_stopwords, stopwords_nb, stopwords_en = get_stopwords()

###############################################################################
# Extract keyphrases

keyphrases = keyphrase_list(
    text,
    stopwords=stopwords_en,
)

print(pd.DataFrame(keyphrases, columns=['score', 'keyphrase']).head(10))
예제 #10
0
def all_stopwords():
    """Return the combined (English + Norwegian) stopword list."""
    combined, _, _ = get_stopwords()
    return combined
예제 #11
0
def norwegian_stopwords():
    """Return only the Norwegian stopword list from get_stopwords."""
    _, nb_stops, _ = get_stopwords()
    return nb_stops
예제 #12
0
def english_stopwords():
    """Return only the English stopword list from get_stopwords."""
    _, _, en_stops = get_stopwords()
    return en_stops