Пример #1
0
def prepare_stopwords():
    """Return a copy of ``STOP_WORDS`` with negation terms removed.

    Every entry of ``STOP_WORDS`` that also appears in the negation list
    below (for example "not", "never", "don't", plus related words such as
    "rarely", "seldom", "despite" and "without") is dropped from the copy.
    ``STOP_WORDS`` itself is not modified.

    Returns:
        The copied stop-word collection without the negation terms.
    """
    negations = (
        "aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
        "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
        "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
        "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
        "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere", "no",
        "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent",
        "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
        "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite",
    )

    filtered = STOP_WORDS.copy()
    # Decide what to drop by scanning the untouched original, then remove
    # those entries from the copy only.
    to_drop = [term for term in STOP_WORDS if term in negations]
    for term in to_drop:
        filtered.remove(term)
    return filtered
Пример #2
0
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
# Load the 'en_core_web_md' spaCy pipeline once at import time; nlp0 and nlp1
# below both call this shared object.
nlp = spacy.load('en_core_web_md')
# Extra tokens to treat as stop words in addition to spaCy's English list.
domain_stop_words = ['chapter', '<', '>', ';', 'vinegar', 'of', '%']
# NOTE: this adds to the STOP_WORDS object imported from spaCy in place, so
# any other code in the process that uses the same object sees these entries.
# NOTE(review): whether additions made after spacy.load() also change
# token.is_stop (used by nlp0) depends on the spaCy version -- confirm.
for word in domain_stop_words:
    STOP_WORDS.add(word)
# Second stop list that is guaranteed not to contain 'other'. nlp1 filters
# with this set and then checks the surviving lemmas for 'other'.
STOP_WORDS1 = STOP_WORDS.copy()
# discard() is a no-op if 'other' is absent, so this never raises.
STOP_WORDS1.discard('other')


def nlp0(sentence):
    """Lower-case *sentence* and return the lemmas of its kept tokens.

    The lower-cased text is run through the module-level ``nlp`` pipeline.
    Tokens flagged as stop words (``token.is_stop``) or as punctuation
    (``token.is_punct``) are skipped.

    Args:
        sentence: Text to process.

    Returns:
        list: ``token.lemma_`` of every remaining token, in document order.
    """
    doc = nlp(sentence.lower())

    lemmas = []
    for tok in doc:
        # Skip anything that is a stop word or punctuation.
        if tok.is_stop or tok.is_punct:
            continue
        lemmas.append(tok.lemma_)
    return lemmas


def nlp1(sentence):
    sentence = sentence.lower()
    word_list = [
        str(token.lemma_) for token in nlp(sentence)
        if str(token) not in STOP_WORDS1 and not token.is_punct
    ]
    word_list1 = []
    flag = 0
    for i in word_list:
        if i == 'other':