Example #1
def test_use_fallback_if_confidence_is_low():
    # without a threshold, the low-confidence guess is kept as-is
    nlp_without_fallback = spacy.blank("xx")
    nlp_without_fallback.add_pipe(LanguageDetector())
    doc = nlp_without_fallback(poor_quality_text)

    assert doc._.language_score < 0.5

    # with a threshold, low-confidence predictions fall back to the
    # pipeline language ("xx" here)
    nlp_with_fallback = spacy.blank("xx")
    nlp_with_fallback.add_pipe(LanguageDetector(threshold=0.5))
    doc = nlp_with_fallback(poor_quality_text)

    assert doc._.language == "xx"
    assert doc._.language_score < 0.5
Example #2
def test_detect_doc_language():
    nlp = spacy.blank("xx")
    nlp.add_pipe(LanguageDetector())
    doc = nlp(en_text)

    assert doc._.language == "en"
    assert doc._.language_score >= 0.8
Example #3
def test_use_custom_fallback():
    nlp = spacy.blank("xx")
    # the 0.99 threshold rejects even a confident "en" prediction, so the
    # custom default language "fr" is used instead
    nlp.add_pipe(LanguageDetector(threshold=0.99, default_language="fr"))
    doc = nlp(en_text)

    assert doc._.language == "fr"
    assert doc._.language_score >= 0.8
Example #4
def test_use_fallback_value_if_language_not_supported():
    nlp = spacy.blank("xx")
    # "en" is detected with high confidence but is not in supported_languages,
    # so the pipeline language "xx" is used instead
    nlp.add_pipe(LanguageDetector(supported_languages=["fr"]))
    doc = nlp(en_text)

    assert doc._.language == "xx"
    assert doc._.language_score >= 0.8
Example #5
def test_use_custom_model():
    nlp = spacy.blank("xx")
    nlp.add_pipe(
        LanguageDetector(model_path=os.path.realpath(
            os.path.join(__file__, "..", "..", "spacy_fastlang",
                         "lid.176.ftz"))))
    doc = nlp(en_text)

    assert doc._.language == "en"
    assert doc._.language_score >= 0.8
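For reference, lid.176.ftz is fastText's compressed language-identification model. A minimal sketch of fetching it once and passing an explicit local path to LanguageDetector (the models/ directory is an assumption, not part of the original example):

import os
import urllib.request

import spacy
from spacy_fastlang import LanguageDetector

# fastText's published compressed language-ID model
MODEL_URL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz'
model_path = os.path.join('models', 'lid.176.ftz')  # hypothetical local path

# download the model once if it is not already cached locally
if not os.path.exists(model_path):
    os.makedirs('models', exist_ok=True)
    urllib.request.urlretrieve(MODEL_URL, model_path)

nlp = spacy.blank('xx')
nlp.add_pipe(LanguageDetector(model_path=model_path))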
Example #6
import re

import spacy
from bs4 import BeautifulSoup
from spacy_fastlang import LanguageDetector


def book_scraping(html_source):
    # take the HTML content of a book page and return a list with the useful info
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(LanguageDetector())

    soup = BeautifulSoup(
        html_source,
        features='lxml')  # instantiate a BeautifulSoup object for HTML parsing

    bookTitle = soup.find_all(
        'h1', id='bookTitle')[0].contents[0].strip()  # get the book title

    # if bookSeries is not present, then set it to the empty string
    try:
        bookSeries = soup.find_all(
            'h2', id='bookSeries')[0].contents[1].contents[0].strip()[1:-1]
    except Exception:
        bookSeries = ''

    # if bookAuthors is not present, then set it to the empty string
    try:
        bookAuthors = soup.find_all('span',
                                    itemprop='name')[0].contents[0].strip()
    except Exception:
        bookAuthors = ''

    # the plot of the book is essential; if anything goes wrong here, let the
    # error propagate to the caller
    Plot = soup.find_all(
        'div', id='description'
    )[0].contents  # get the main tag where the plot is found
    filter_plot = list(
        filter(lambda i: i != '\n', Plot)
    )  # filter the plot by removing tags that don't contain the description
    if len(filter_plot) == 1:
        Plot = filter_plot[0].text
    else:  # the full plot text lives in the second (expanded) tag
        Plot = filter_plot[1].text
    doc = nlp(Plot)
    if doc._.language != 'en':
        raise ValueError('the plot is not in English')

    # if NumberofPages is not present, then set it to the empty string
    try:
        NumberofPages = soup.find_all(
            'span', itemprop='numberOfPages')[0].contents[0].split()[0]
    except Exception:
        NumberofPages = ''

    # if ratingValue is not present, then set it to the empty string
    try:
        ratingValue = soup.find_all(
            'span', itemprop='ratingValue')[0].contents[0].strip()
    except Exception:
        ratingValue = ''

    # if ratingCount/reviewCount are not present, leave them as empty strings;
    # they must be initialised here because they appear in the returned list
    # even when the loop below never sets them
    ratingCount = ''
    reviewCount = ''
    try:
        ratings_reviews = soup.find_all('a', href='#other_reviews')
        for i in ratings_reviews:
            if i.find_all('meta', itemprop='ratingCount'):
                ratingCount = i.contents[2].split()[0]
            if i.find_all('meta', itemprop='reviewCount'):
                reviewCount = i.contents[2].split()[0]
    except Exception:
        pass

    # if Published is not present, then set it to the empty string
    try:
        pub = soup.find_all('div', class_='row')[1].contents[0].split()[1:4]
        Published = ' '.join(pub)  # join the publication-date parts
    except Exception:
        Published = ''

    # if Character is not present, then set it to the empty string
    try:
        char = soup.find_all(
            'a', href=re.compile('characters')
        )  # find the regular expression(re) 'characters' within the attribute href
        if len(char) == 0:
            Characters = ''  # no characters in char
        else:
            Characters = ', '.join([i.contents[0] for i in char])
    except Exception:
        Characters = ''  # something went wrong with char

    # if Setting is not present, then set it to the empty string
    try:
        sett = soup.find_all(
            'a', href=re.compile('places')
        )  # find the regular expression(re) 'places' within the attribute href
        if len(sett) == 0:
            Setting = ''
        else:
            Setting = ', '.join([i.contents[0] for i in sett])
    except Exception:
        Setting = ''  # something went wrong with Setting

    # get the URL to the page
    Url = soup.find_all('link', rel='canonical')[0].get('href')

    return [
        bookTitle, bookSeries, bookAuthors, ratingValue, ratingCount,
        reviewCount, Plot, NumberofPages, Published, Characters, Setting, Url
    ]
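The function above reads like a Goodreads-style book-page scraper. A minimal usage sketch, assuming the HTML is fetched with requests (the URL is hypothetical) and that a page with a missing or non-English plot is skipped, since the function raises in that case:

import requests

url = 'https://www.goodreads.com/book/show/1'  # hypothetical book page
html_source = requests.get(url).text

try:
    record = book_scraping(html_source)
    print(record[0])  # bookTitle
except Exception:
    print('skipped: plot missing or not in English')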
Example #7
File: ft.py Project: Lulzx/sumitup
import spacy
from spacy_fastlang import LanguageDetector

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe(LanguageDetector())
doc = nlp(
    'Life is like a box of chocolates. You never know what you are gonna get.')

assert doc._.language == 'en'
assert doc._.language_score >= 0.8
Example #8
def test_batch_predictions():
    nlp = spacy.blank("xx")
    nlp.add_pipe(LanguageDetector())
    for doc in nlp.pipe([en_text, en_text]):
        assert doc._.language == "en"
        assert doc._.language_score >= 0.8
# specify max length
nlp.max_length = 50000

# expand on spaCy's stopwords
# --+ my stopwords
my_stopwords = [
    '\x1c', 'ft', 'wsj', 'time', 'sec', 'say', 'says', 'said', 'mr.', 'mister',
    'mr', 'miss', 'ms', 'inc'
]
# --+ add them to spaCy's stopwords
for stopword in my_stopwords:
    nlp.vocab[stopword].is_stop = True

# filter out non-English pages
language_detector = LanguageDetector()
nlp.add_pipe(language_detector)

# containers
docs_id_tokens = []
docs_ln = []

for _id, doc in zip(df_5_50._id, docs):
    doc = nlp(doc)
    ln = doc._.language
    docs_ln.append([_id, ln])
    if doc._.language == 'en':
        tmp_tokens = [
            token.lemma_ for token in doc
            if not token.is_stop and not token.is_punct and not token.like_num
            and not token.like_url and not token.like_email
        ]
        docs_id_tokens.append([_id, tmp_tokens])
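As a follow-up (a sketch, assuming pandas is available), docs_ln can be tabulated to see how many pages were kept as English versus filtered out:

import pandas as pd

# one row per document id with its detected language
lang_df = pd.DataFrame(docs_ln, columns=['_id', 'language'])
print(lang_df['language'].value_counts())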