Example #1
from nltk.corpus import stopwords

def remove_stopwords(words):
    """Remove stop words from a list of tokenized words."""
    stop_words = set(stopwords.words('english'))  # build the set once, not once per word
    return [word for word in words if word not in stop_words]
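A quick usage sketch, assuming the stop-word corpus is available (via nltk.download('stopwords')); the token list here is made up:

tokens = ['this', 'is', 'a', 'simple', 'sentence']
print(remove_stopwords(tokens))  # ['simple', 'sentence']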
Example #2
import re
from nltk.corpus import stopwords

def remove_punc_stopwords_lower(s):
    """
    INPUT: string
    OUTPUT: string

    Lower-cases the string, splits it on non-word characters (dropping
    punctuation), and removes English stop words.
    """
    stop = set(stopwords.words('english'))  # set lookup is O(1) per token
    regex = r"\W+"
    # re.split can yield empty strings at the edges, hence the `if i` guard
    return " ".join(i for i in re.split(regex, s.lower()) if i and i not in stop)
Example #3
import string
from nltk.corpus import stopwords

def generate_unigrams(text):
    """Strip punctuation from the text, then return its non-stop-word tokens."""
    punctuation_chars = set(string.punctuation)
    stop_words = set(stopwords.words('english'))

    # Drop punctuation characters, then split the remainder on whitespace.
    punctuation_removed = ''.join(
        char for char in text if char not in punctuation_chars
    )
    return [
        word for word in punctuation_removed.split()
        if word.lower() not in stop_words
    ]
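For illustration, a usage sketch with a made-up sentence; note that the original casing of surviving tokens is preserved, since lower() is only applied for the stop-word check:

sentence = "Hello, world: stop words are removed here."
print(generate_unigrams(sentence))  # ['Hello', 'world', 'stop', 'words', 'removed']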
Example #4
# Constructor fragment from a text-processing class; it needs:
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
def __init__(self, data):
    self.stop = stopwords.words("english")
    self.data = [word_tokenize(doc) for doc in data]
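Since the snippet is only a constructor, here is a minimal sketch of a surrounding class it could belong to; the class name TokenizedCorpus is an assumption, and running it requires nltk.download('punkt') and nltk.download('stopwords'):

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

class TokenizedCorpus:  # hypothetical name; the source shows only __init__
    def __init__(self, data):
        self.stop = stopwords.words("english")
        self.data = [word_tokenize(doc) for doc in data]

docs = TokenizedCorpus(["This is a test.", "Stop words get stored separately."])
print(docs.data[0])  # ['This', 'is', 'a', 'test', '.']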
Example #5
from nltk.corpus import stopwords

# word_list is assumed to be an already-tokenized list of words
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in word_list if word not in stop_words]

def comp(list1, list2):
    """Return the fraction of items in list1 that also appear in list2,
    together with the items that were not found."""
    match_count = 0
    nomatches = []
    for i in list1:
        if i in list2:
            match_count += 1
        else:
            nomatches.append(i)
    return match_count / len(list1), nomatches
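A small, made-up usage sketch for comp:

ratio, missing = comp(['a', 'b', 'c', 'd'], ['b', 'd'])
print(ratio)    # 0.5
print(missing)  # ['a', 'c']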