def modelTrainingLexicon(trainingData, testData):
    print("--Lexicon Model--")
    tab = []
    dataLexiconFeature = []
    dataLexiconFeatureT = []
    for data in trainingData:
        booleanNeg = False
        pos_score = neg_score = obj_score = 0
        tagData = pos_tag(data[0])
        negationData = mark_negation(data[0])
        pos_score, neg_score, obj_score = tagCount(
            data, tagData, negationData, pos_score, neg_score, obj_score, booleanNeg)
        total = int(pos_score) - int(neg_score)
        if total < 0:
            overall = 'neg'
        elif total > 0:
            overall = 'pos'
        else:
            overall = 'neutre'
        tab.append(pos_score)
        tab.append(neg_score)
        tab.append(obj_score)
        feats = ({'positive': pos_score, 'negative': neg_score}, data[1])
        dataLexiconFeature.append(feats)
    for dataT in testData:
        booleanNegT = False
        pos_scoreT = neg_scoreT = obj_scoreT = 0
        tagData = pos_tag(dataT[0])
        negationDataT = mark_negation(dataT[0])
        pos_scoreT, neg_scoreT, obj_scoreT = tagCount(
            dataT, tagData, negationDataT, pos_scoreT, neg_scoreT, obj_scoreT, booleanNegT)
        total = int(pos_scoreT) - int(neg_scoreT)
        tab.append(pos_scoreT)
        tab.append(neg_scoreT)
        tab.append(obj_scoreT)
        featsT = ({'positive': pos_scoreT, 'negative': neg_scoreT}, dataT[1])
        dataLexiconFeatureT.append(featsT)
    classifier = NaiveBayesClassifier.train(dataLexiconFeature)
    realSet = collections.defaultdict(set)
    testSet = collections.defaultdict(set)
    tabPr = []
    tabOut = []
    for i, (feat, ovAll) in enumerate(dataLexiconFeatureT):
        realSet[ovAll].add(i)
        predicted = classifier.classify(feat)
        tabOut.append(predicted)
        tabPr.append(predicted)
        testSet[predicted].add(i)
    print("Accuracy Naive Bayes for Lexicon Model : ",
          nltk.classify.util.accuracy(classifier, dataLexiconFeatureT))
    return realSet, testSet, tabPr, tabOut
def bow_freq(data, token):
    positive = readwords('positive-words.txt')
    negative = readwords('negative-words.txt')
    # Drop the header entries at the top of the lexicon files
    negative = negative[36:]
    positive = positive[36:]
    stemmer = PorterStemmer()
    negative = [stemmer.stem(w) for w in negative]
    positive = [stemmer.stem(w) for w in positive]
    # Deduplicate while preserving order
    negative = list(dict.fromkeys(negative))
    positive = list(dict.fromkeys(positive))
    ps = {k: v for v, k in enumerate(positive)}
    ns = {k: v for v, k in enumerate(negative)}
    sample = []
    for i in range(len(data)):
        if '\n' in data.iloc[i]['Summary and Review']:
            temp = ''.join(data.iloc[i]['Summary and Review'].split('\n'))
        else:
            temp = data.iloc[i]['Summary and Review']
        temp = " . ".join(temp.split('.')).split()
        sample.append(temp)
    # Mark negation in place (shallow=True modifies the token lists)
    for i in range(len(sample)):
        mark_negation(sample[i], shallow=True)
    for i in range(len(sample)):
        temp = []
        for w in sample[i]:
            s = stemmer.stem(w)
            if 'NEG' in w:
                temp.append('NEG')
            elif s in ps or s in ns:
                temp.append(w)
        sample[i] = ' '.join(temp)
    ns['NEG'] = 0
    simple = []
    for r in range(len(sample)):
        tN = 0
        tP = 0
        l = len(data['Summary and Review'][r])
        row = sample[r].split()
        for c in range(len(row)):
            if stemmer.stem(row[c]) in ps:
                tP += 1
            else:
                tN += 1
        simple.append([tP, tN, l])
    return simple
def filterNegation(linein):
    assert type(linein) == unicode
    parts = linein.split(u' ')
    marked_negation = mark_negation(parts)
    lineout = u' '.join(marked_negation)
    return lineout
def make_unigram_feature_set(documents, min_freq=1, mark_negation=False):
    """
    This function goes through a corpus and retains all candidate unigram features,
    making a feature set. Optionally, it can also preprocess the corpus, annotating
    words that are in the scope of a negation with _NEG (using NLTK helper functions).

    :param documents: all documents, each a list of words
    :param min_freq: minimum frequency of a token for it to be part of the feature set
    :param mark_negation: whether to preprocess the document using NLTK's
        nltk.sentiment.util.mark_negation
        (see the documentation: `nltk.sentiment.util.mark_negation?`)
    :returns: unigram feature set
    """
    counter = Counter()
    for doc in documents:
        if mark_negation:
            # The parameter shadows the imported helper, so call it via the module.
            doc = util.mark_negation(doc)
        counter.update(doc)
    features = []
    for f, n in counter.most_common():
        if n >= min_freq:
            features.append(f)
        else:
            # most_common() is sorted by frequency, so we can stop early
            break
    return frozenset(features)
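# A minimal usage sketch (toy documents, not from the original source); it assumes the
# imports the function relies on, shown here for completeness.
from collections import Counter
from nltk.sentiment import util

toy_docs = [
    "i did not like this movie .".split(),
    "i really like this movie .".split(),
]
feats = make_unigram_feature_set(toy_docs, min_freq=1, mark_negation=True)
# With mark_negation=True, the tokens after "not" (up to the ".") show up as
# 'like_NEG', 'this_NEG', 'movie_NEG' in the returned feature set.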
def count_emotions(self, text):
    # Clean up the string of characters
    temp0 = break_contractions(text)  # Break up contractions
    temp1 = lemmatize_words(temp0.split())  # Split string to words, then lemmatize
    temp2 = mark_negation(temp1, double_neg_flip=True)  # Account for negations
    temp3 = remove_stopwords(temp2)  # Remove any stopwords
    # check_spelling(temp2)  # Function is no longer useful

    # Count number of emotional associations for each valid word
    bank = []
    wordcount = 0
    for word in temp3:
        if word in self.associations:
            bank.extend(self.associations[word])
            wordcount += 1

    # Returns a tuple of integers for negative, positive, anger, fear, anticipation,
    # surprise, trust, sadness, joy, disgust, and total word count, respectively.
    return (bank.count('negative'), bank.count('positive'), bank.count('anger'),
            bank.count('fear'), bank.count('anticipation'), bank.count('surprise'),
            bank.count('trust'), bank.count('sadness'), bank.count('joy'),
            bank.count('disgust'), wordcount)
def preprocess(s, lowercase=False, tokenizer=tokenize, word_transformation='', handle_negation=True):
    """
    Improve tokenization with different options

    :param s: sentence to tokenize
    :param lowercase: whether to lowercase tokens
    :param tokenizer: which tokenizer to use
    :param word_transformation: 'stem', 'lemmatize' or empty string for no transformation
    :param handle_negation: whether to mark tokens in the scope of a negation
    :return: tokens
    """
    tokens = tokenizer(s)
    if word_transformation == 'stem':
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]
    elif word_transformation == 'lemmatize':
        lemmatizer = WordNetLemmatizer()
        tagged = pos_tag(tokens)
        tokens = []
        for word, tag in tagged:
            wntag = get_wordnet_pos(tag)
            if wntag is None:
                lemma = lemmatizer.lemmatize(word)
            else:
                lemma = lemmatizer.lemmatize(word, pos=wntag)
            tokens.append(lemma)
    if lowercase:
        tokens = [token if emoticon_RE.search(token) else token.lower() for token in tokens]
    if handle_negation:
        tokens = mark_negation(tokens)
    return tokens
def tokenize_with_negation(text):
    """
    Split a text into lower-case tokens, removing all punctuation tokens and stopwords

    :param text: input text
    :return: lowercase word tokens, without punctuation or stopwords
    """
    # Set of stop words in English
    english_stopwords = set(stopwords.words('english'))
    # Set of negated stopwords
    negated_stopwords = set(word + "_NEG" for word in english_stopwords)
    # All stopwords, including negated forms
    all_stopwords = english_stopwords.union(negated_stopwords)

    tokens = []
    for sent in sent_tokenize(text):
        pretokens = word_tokenize(sent.lower())
        # exclude punctuation
        pretokens = [token for token in pretokens
                     if any(char.isalpha() for char in token)]
        # mark tokens in the scope of a negation, then drop (possibly negated) stop words
        pretokens = mark_negation(pretokens)
        tokens.extend(token for token in pretokens if token not in all_stopwords)
    return tokens
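# Illustrative call only (hypothetical input, not from the original source); it assumes
# the NLTK imports used by tokenize_with_negation above (sent_tokenize, word_tokenize,
# stopwords, mark_negation). Stopwords such as "i", "did", "not" drop out, and tokens
# inside the negation scope keep the _NEG suffix added by mark_negation.
print(tokenize_with_negation("I did not like this movie. It was bad."))
# expected, approximately (exact output depends on the NLTK stopword list):
# ['like_NEG', 'movie_NEG', 'bad']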
def clean(comment):
    """Return processed tokens for a given comment."""
    # Split into "words"
    tokens = comment.split()
    # Remove punctuation
    re_punc = re.compile(f"[{re.escape(string.punctuation)}]")
    tokens = [re_punc.sub('', word) for word in tokens]
    # Remove non-alphabetic tokens
    tokens = [word for word in tokens if word.isalpha()]
    # Remove short tokens (making sure not to remove four letter words)
    tokens = [word for word in tokens if len(word) > 2]
    # Remove long tokens (possibly URLs)
    tokens = [word for word in tokens if len(word) < 20]
    # Make tokens lowercase
    tokens = [word.lower() for word in tokens]
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization with POS to account for things like plurals
    lem = WordNetLemmatizer()
    tokens = [lem.lemmatize(token, lemm_pos(token)) for token in tokens]
    # Add negations
    tokens = mark_negation(tokens)
    # Fill with blank if a comment is totally removed by processing
    if len(tokens) == 0:
        tokens = [" "]
    # Remove "talk" typical at the end of certain wiki comments
    if tokens[-1] == 'talk':
        clean_comment = " ".join(tokens[:-1])
    else:
        clean_comment = " ".join(tokens)
    return clean_comment
def superTokenize(text, mark_neg, remSW, lem, stem):
    tokens = TweetTokenizer().tokenize(text)
    if mark_neg:
        tokens = mark_negation(tokens)
    if remSW:
        tokens = remStopWords(tokens)
    if lem:
        tokens = lemmatize(tokens)
    if stem:
        tokens = stemmize(tokens)
    return tokens
def make_feature_map(document, feature_set, binary=True, mark_negation=False):
    """
    This function takes a document, possibly pre-processes it by marking words in the
    scope of negation, and constructs a dict indicating which features in `feature_set`
    fire. Features may be binary, flagging occurrence, or integer, indicating the number
    of occurrences. If no feature can be extracted, a special feature is fired,
    namely 'EMPTY()'.

    :param document: a list of words
    :param feature_set: set of features we are looking for
    :param binary: whether we are indicating presence or counting features in feature_set
    :param mark_negation: whether we should apply NLTK's mark_negation to document
        before applying the feature function
    :returns: dict with entries 'contains(f)=1/0' for binary features
        or 'count(f)=n' for count features
    """
    if mark_negation:
        # The parameter shadows the imported helper, so call it via the module.
        document = util.mark_negation(document)
    dic = defaultdict(float)
    for i in feature_set:
        if i in document:
            if binary:
                dic[f"contains({i})"] = 1.0
            else:
                # count every occurrence of the feature in the document
                dic[f"count({i})"] = float(document.count(i))
    if len(dic) == 0:
        dic["EMPTY()"] = 1.0
    return dic
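# A hedged end-to-end sketch (toy data, not from the original source) showing how
# make_unigram_feature_set and make_feature_map can be combined: build a feature set
# from a small corpus, then map one document onto count features. It assumes the same
# Counter / defaultdict / nltk.sentiment.util imports as the two functions.
corpus = [
    "this movie was not good .".split(),
    "this movie was great .".split(),
]
fs = make_unigram_feature_set(corpus, min_freq=1, mark_negation=True)
fmap = make_feature_map("this movie was not good .".split(), fs,
                        binary=False, mark_negation=True)
# fmap holds entries such as 'count(movie)': 1.0 and 'count(good_NEG)': 1.0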
def clean_doc(doc):
    """Return processed tokens for a given document."""
    # Split into "words"
    tokens = doc.split()
    # Remove punctuation
    re_punc = re.compile(f"[{re.escape(string.punctuation)}]")
    tokens = [re_punc.sub('', word) for word in tokens]
    # Remove non-alphabetic tokens
    tokens = [word for word in tokens if word.isalpha()]
    # Remove short tokens (making sure not to remove four letter words)
    tokens = [word for word in tokens if len(word) > 2]
    # Remove long tokens (possibly URLs)
    tokens = [word for word in tokens if len(word) < 20]
    # Make tokens lowercase
    tokens = [word.lower() for word in tokens]
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization to account for things like plurals
    # Takes in part of speech (POS)
    lem = WordNetLemmatizer()
    tokens = [lem.lemmatize(token, lemm_pos(token)) for token in tokens]
    # Add negations
    tokens = mark_negation(tokens)
    # Return tokens
    clean_comment = " ".join(tokens)
    return clean_comment
def runSentanal(train, test):
    sentanal = SentimentAnalyzer()
    all_words_neg = sentanal.all_words([mark_negation(doc) for doc in train])
    unigramFeats = sentanal.unigram_word_feats(all_words_neg, min_freq=4)
    sentanal.add_feat_extractor(extract_unigram_feats, unigrams=unigramFeats,
                                handle_negation=True)
    # bigramFeats = sentanal.
    # sentanal.add_feat_extractor(extract_bigram_feats, bigrams=bigramFeats)
    trainList = sentanal.apply_features(train)
    testList = sentanal.apply_features(test)
    trainer = NaiveBayesClassifier.train
    classifier = sentanal.train(trainer, trainList)
    classifier.show_most_informative_features()

    # creates array for storing values
    values = []

    # display results
    for key, value in sorted(sentanal.evaluate(testList).items()):
        print('{0}: {1}'.format(key, value))
        values.append(value)

    # write results to csv
    with open(OUTPUT_CSV, mode='a') as csvFile:
        writer = csv.writer(csvFile, delimiter=',')
        writer.writerow(values)
def swn_score(text):
    """
    Calculate score with sentiwordnet library.
    Return score for sentence.
    """
    score = 0.0
    if text is not None:
        # mark negation
        words = mark_negation(text.split())
        # remove stopwords
        words = [t for t in words if t not in stopwords.words('english')]
        # select sense for each word
        words_sense = {}
        for word in words:
            clean_word = word.replace('_NEG', '')
            if wn.synsets(clean_word):
                words_sense[word] = wn.synsets(clean_word)[0]
        # calculate score
        for word, sense in words_sense.items():
            pos_score = swn.senti_synset(sense.name()).pos_score()
            neg_score = swn.senti_synset(sense.name()).neg_score()
            if '_NEG' in word:
                pos_score, neg_score = neg_score, pos_score
            score += (pos_score - neg_score)
        if len(words_sense) != 0:
            score /= len(words_sense)
    return score
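# Hypothetical call for illustration (not from the original source); it assumes the
# NLTK wordnet/sentiwordnet/stopwords corpora are available and the same imports as
# swn_score above. A negative value is expected here if the first WordNet sense of
# "good" carries a positive SentiWordNet score, since that word sits in the negation
# scope and its polarity is swapped.
print(swn_score("not a good film"))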
def negation_tagger(sentence):
    """Tag negation for a list of tokens that make up a sentence

    :param list sentence: the premise or hypothesis
    :rtype: list
    :return: tokens with "_NEG" appended for those within a negation's scope
    """
    return mark_negation(sentence)
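# For reference, NLTK's mark_negation appends _NEG to tokens between a negation word
# and the next punctuation mark; this call mirrors the example in the NLTK documentation.
tokens = "I didn't like this movie . It was bad .".split()
print(negation_tagger(tokens))
# ['I', "didn't", 'like_NEG', 'this_NEG', 'movie_NEG', '.', 'It', 'was', 'bad', '.']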
def addfeatures(cleaned_tokens_list):
    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words(
        [mark_negation(token_list) for token_list in cleaned_tokens_list])
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
def tweet_preprocessing(self, doc):
    """ Tweet preprocessing method """
    # Handle emojis
    doc = emoji_parser(doc)

    # Handle negation: expand common contracted negative forms
    doc = re.sub(r' isnt ', r' is not ', doc).strip()
    doc = re.sub(r' arent ', r' are not ', doc).strip()
    doc = re.sub(r' aint ', r' is not ', doc).strip()
    doc = re.sub(r' ain ', r' is not ', doc).strip()
    doc = re.sub(r' wasnt ', r' was not ', doc).strip()
    doc = re.sub(r' wasn ', r' was not ', doc).strip()
    doc = re.sub(r' werent ', r' were not ', doc).strip()
    doc = re.sub(r' dont ', r' do not ', doc).strip()
    doc = re.sub(r' doesnt ', r' does not ', doc).strip()
    doc = re.sub(r' didnt ', r' did not ', doc).strip()
    doc = re.sub(r' wont ', r' will not ', doc).strip()
    doc = re.sub(r' won\'t ', r' will not ', doc).strip()
    doc = re.sub(r' havent ', r' have not ', doc).strip()
    doc = re.sub(r' hasnt ', r' has not ', doc).strip()
    doc = re.sub(r' hadnt ', r' had not ', doc).strip()
    doc = re.sub(r' wouldnt ', r' would not ', doc).strip()
    doc = re.sub(r' shouldnt ', r' should not ', doc).strip()
    doc = re.sub(r' shallnt ', r' shall not ', doc).strip()
    doc = re.sub(r' cannot ', r' can not ', doc).strip()
    doc = re.sub(r' cant ', r' can not ', doc).strip()
    doc = re.sub(r'can\'t', r' can not ', doc).strip()
    doc = re.sub(r' couldnt ', r' could not ', doc).strip()
    doc = re.sub(r'([a-zA-Z].+)n\?t', r' \1 not ', doc).strip()
    doc = re.sub(r'([a-zA-Z].+)n\'t', r' \1 not ', doc).strip()

    # capture apostrophe suffixes
    doc = re.sub(r'([a-zA-Z].+)\'ve', r' \1 have ', doc).strip()
    doc = re.sub(r'([a-zA-Z].+)\'re', r' \1 are ', doc).strip()
    doc = re.sub(r'([a-zA-Z].+)\'s', r' \1 \'s ', doc).strip()

    # capture exclamation mark (!)
    # doc = re.sub(r'(!{2,})', '<EXCLAMATION>.', doc).strip()
    # capture question mark (?)
    # doc = re.sub(r'(\?{2,})', '<QUESTION>.', doc).strip()

    # remove words starting with &
    doc = re.sub(r' &[\w;]+ ', ' ', doc).strip()
    # remove numbers
    doc = re.sub(r'[0-9]+', '', doc).strip()
    # remove links
    doc = re.sub(r'http[s]?.+\b', '', doc).strip()
    # remove underscores
    doc = re.sub(r'_+', '', doc).strip()
    # remove single letters
    doc = re.sub(r' [a-zA-Z] ', ' ', doc).strip()
    # remove periods (.)
    doc = re.sub(r'(\.)', '', doc).strip()

    # Tweet tokenization with TweetTokenizer module
    tk = TweetTokenizer(strip_handles=True, reduce_len=True)
    tokens = tk.tokenize(doc)
    if self.handle_negation:
        tokens = mark_negation(tokens)
    return tokens
def clean_and_tokenize_review(review_text: str) -> List[str]:
    """
    Uses wordpunct_tokenize to keep punctuation groups together
    # https://www.nltk.org/api/nltk.tokenize.html#nltk.tokenize.regexp.WordPunctTokenizer
    Removes nltk stopwords from token list
    """
    tokens: List[str] = wordpunct_tokenize(review_text)
    # mark_negation with shallow=True modifies the token list in place
    mark_negation(tokens, shallow=True)
    # transform to normalize words, keeping any _NEG suffix intact
    ret = []
    for token in tokens:
        split_token = token.split("_")
        split_token[0] = split_token[0].lower()
        if split_token[0] in stopwords:
            continue
        ret.append("_".join(split_token))
    return ret
def score_review(review):
    sentiment_scores = []
    sents = sent_tokenize(review)
    for sent in sents:
        wds = word_tokenize(sent)
        wds = mark_negation(wds)
        sent_scores = score_sent(wds)
        sentiment_scores.append(sent_scores)
    return sum(sentiment_scores) / len(sentiment_scores)
def tokenize_with_negation(text):
    # split text into lower-case tokens, removing all-punctuation tokens and stopwords
    tokens = []
    for sentence in sent_tokenize(text):
        pretokens = word_tokenize(sentence.lower())
        pretokens = [x for x in pretokens if any(i.isalpha() for i in x)]
        pretokens = mark_negation(pretokens)
        tokens.extend(x for x in pretokens if x not in all_stopwords)
    return tokens
def mark_neg(list_of_lists_of_tokens, double_neg_flip=False):
    """
    Mark negations, i.e., append the _NEG suffix to words that appear in the scope
    between a negation and a punctuation mark.

    Parameters
    ----------
    - list_of_lists_of_tokens : dataframe column whose cells contain lists of
      word-tokenised sentences
    - double_neg_flip : if True, a double negation is treated as an affirmation
    - OUTPUT : list of token lists with _NEG-marked words
    """
    return [mark_negation(sent, double_neg_flip=double_neg_flip)
            for sent in list_of_lists_of_tokens]
def stemming_tokenizer(sentence):
    sentence = word_tokenize(sentence)
    stop_words = set(stopwords.words('english'))
    newsentence = []
    for word in sentence:
        if word not in stop_words and len(word) >= 3:
            newsentence.append(word)
    newsentence = mark_negation(newsentence)
    return newsentence
def mark_neg(list_of_lists_of_tokens, double_neg_flip=False):
    """
    Mark negations, i.e., append _NEG suffix to words that appear in the scope
    between a negation and a punctuation mark.

    Parameters
    ----------
    - list_of_lists_of_tokens : dataframe column whose cells contain lists of
      word-tokenised sentences
    - OUTPUT : list of token lists with _NEG-marked words
    """
    return [mark_negation(sent, double_neg_flip=double_neg_flip)
            for sent in list_of_lists_of_tokens]
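# Tiny illustrative call (toy token lists, not from the original source):
print(mark_neg([["i", "do", "not", "agree", "."], ["fine", "by", "me", "."]]))
# [['i', 'do', 'not', 'agree_NEG', '.'], ['fine', 'by', 'me', '.']]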
def tokenize_with_negations(text: str) -> list:
    # Mark negated words and finally eliminate remaining punctuation.
    # Important: this method should be applied last; it returns the text tokenized.
    # Surrounding the stop mark with spaces also matters, otherwise the stop mark
    # stays attached to the following word.
    text = mark_negation([
        x for x in word_tokenize(re.sub(pattern=r"\.", repl=" . ", string=text))
    ])
    text = [x for x in text if x not in ['.', ':', ';', '!', '?', 'no', 'not']]
    return text
def negation_check(sentence, set_terms):
    words = word_tokenize(sentence)
    negations = mark_negation(words)
    # tokens that only appear in the negated version carry the _NEG suffix
    only_neg = list(set(negations).difference(words))
    if len(only_neg) == 0:
        return set_terms
    # strip the "_NEG" suffix
    only_neg = [x[:-4] for x in only_neg]
    terms = list(set_terms)
    for i, term in enumerate(terms):
        for t in term.split():
            if t in only_neg:
                terms[i] = term + '_NEG'
                break
    return set(terms)
def feature_set(post_list):
    """
    Expects a list of cleaned posts in sentence format and returns a featureset
    calculated by marking negation, then doing a count vectorization and tf-idf transform
    """
    # mark negation
    # count vectorizer
    # tf-idf
    # isn't f****d up -> isn't fucked_NEG up_NEG
    marked = [mark_negation(p) for p in post_list]
    tv = TfidfVectorizer(min_df=1)
    marked_words = flatten(as_words(marked))
    return tv.fit_transform(marked_words)
def one_vector_senti(self, sentence):
    sentence = ' '.join(mark_negation(sentence.split()))
    sentence = str(sentence).lower()
    global_vec = []
    global_vec.extend(self.Vader_API(sentence))
    global_vec.extend([self.senti_Strength.score(sentence)])
    global_vec.extend([self.afn.score(sentence)])
    global_vec.extend(self.huliu.score(sentence))
    global_vec.extend([self.senti_wordNet.score(sentence)])
    global_vec.extend(self.effect_WN.score(sentence))
    global_vec.extend(self.sentic_net.score(sentence))
    global_vec.extend(self.subj_cue_senti.score(sentence))
    global_vec.extend(self.emo_lex_senti.score(sentence))
    return global_vec
def filterData(data):
    positive = readwords('positive-words.txt')
    negative = readwords('negative-words.txt')
    negative = negative[36:]
    positive = positive[36:]
    stemmer = PorterStemmer()
    negative = [stemmer.stem(w) for w in negative]
    positive = [stemmer.stem(w) for w in positive]
    negative = list(dict.fromkeys(negative))
    positive = list(dict.fromkeys(positive))
    ps = {k: v for v, k in enumerate(positive)}
    ns = {k: v for v, k in enumerate(negative)}
    sample = []
    for i in range(len(data)):
        if '\n' in data.iloc[i]['Summary and Review']:
            temp = ''.join(data.iloc[i]['Summary and Review'].split('\n'))
        else:
            temp = data.iloc[i]['Summary and Review']
        temp = " . ".join(temp.split('.')).split()
        sample.append(temp)
    # negation
    for i in range(len(sample)):
        mark_negation(sample[i], shallow=True)
    for i in range(len(sample)):
        temp = []
        for w in sample[i]:
            s = stemmer.stem(w)
            if 'NEG' in w:
                temp.append('NEG')
            elif s in ps or s in ns:
                temp.append(w)
        sample[i] = ' '.join(temp)
    data['Summary and Review'] = sample
    return data
def process_document(cls, document):
    document = cls.html_processing(document)
    # tokenize
    words = cls.t_word_tokenizer.tokenize(document)
    # print(" \n Tokenizing: {} \n".format(words))

    # expand contractions
    words = cls.expand_contractions(words)
    # print("Expanding contractions: {} \n".format(words))

    # to lowercase
    words = list(map(str.lower, words))

    tagged_sentence = pos_tag(words)
    proper_nouns_tags = ['IN', 'NNP', 'PRP', 'PRP$', 'WP$']
    tagged_sentence = [(word, tag) for word, tag in tagged_sentence
                       if tag not in proper_nouns_tags]
    # print("Filtering tags: {} \n".format(tagged_sentence))

    words = []
    for word, tag in tagged_sentence:
        wordnet_tag = cls.find_wordnet_tag(tag)
        if wordnet_tag != '':
            word = cls.remove_apos(word)
            words.append(cls.lemmatizer.lemmatize(word.lower(), wordnet_tag))
        elif word in string.punctuation:
            words.append(word)
    # print("Lemmatize: {} \n".format(words))

    # must be reviewed
    words = [
        word for word in words
        if word not in string.punctuation and len(word) > 1
        and cls.is_english_word(word.lower())
    ]
    # print("Punctuation and english: {} \n".format(words))

    words = mark_negation(words)
    # print("Negation: {} \n".format(words))

    stop_words = set(cls.english_stopwords + cls.my_stopwords)
    words = [word for word in words if word.lower() not in stop_words]
    # print("Stop words: {} \n".format(words))
    return words
def process(tweet):
    global i
    print(i)
    i += 1
    text = tweet['text'].lower()
    lang = tweet['lang']
    try:
        mentions = tweet['mentions']
        urls = tweet['urls']
    except KeyError:
        mentions = []
        urls = []
    # remove mentions and urls from tweets
    text = filter_tweet(text, mentions, urls)
    # separate emojis and text
    [text, emoji_text] = separate_emojis(text)
    # remove English contractions
    if lang == 'en':
        text = expand_contractions(text)
    if lang == 'ar':
        text = remove_arabic_variants(text)
    # remove stop words
    text = remove_stopwords(text, lang)
    # mark negations and remove punctuation from text
    if lang == 'en':
        text = mark_negation(text)
    text = remove_punct(text)
    # normalize text by removing repetitions and stemming
    # old_text = text
    text = normalize_repititions(text, lang)
    if lang == 'ar':
        text = farasa.lemmatize(text)
    if lang == 'en':
        text = stem_words(text)
    # if(old_text != text):
    #     print('old text: %s, new text: %s' % (tweet['text'], ' '.join(text)))
    text += emoji_text
    print(text)
    return text
def __call__(self, t):
    t = self.reduce_lengthening(t)
    tokens = t.split(' ')
    cleaned_tokens = []
    for token in tokens:
        token = self.replace_username(token)
        token = self.replace_link(token)
        cleaned_tokens.append(token)
    rebuild_str = ' '.join(cleaned_tokens)
    negated_tokens = mark_negation(list(self.tknzr.tokenize(rebuild_str)))
    list_of_trigrams = [' '.join(s) for s in trigrams(negated_tokens)]
    return list_of_trigrams
def sent_tokenize(x):
    # Note: struggles with double negation; intended to be applied over a dataframe column of texts
    stopword = set(stopwords.words('english')) - {
        'he', 'him', 'his', 'himself', 'she', 'her', "she's", 'hers', 'herself',
        'they', 'them', 'their', 'theirs', 'themselves'}
    lmtzer = WordNetLemmatizer()
    x = x.lower()
    temp = re.sub(",", '.', x)
    word = re.findall(r'[a-zA-Z]+|:\)|\.\.\.+|[!]+|\!\?|\.', temp)
    word = mark_negation(word)
    word = [i for i in word if i not in stopword]
    word_tag = nltk.pos_tag(word)
    lmt_word = [lmtzer.lemmatize(i_pair[0], pos=wordnet_pos(i_pair[1]))
                for i_pair in word_tag]
    return lmt_word
def f(s):
    return mark_negation(tokenizer.tokenize(s))
def __call__(self, t):
    text = self.reduce_lengthening(t)
    tokens = list(self.tknzr.tokenize(text))
    negated_tokens = mark_negation(tokens)
    list_of_skipgrams = list(skipgrams(negated_tokens, self.n, self.k))
    return [' '.join(s) for s in list_of_skipgrams]
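# For reference, a standalone sketch (hypothetical tokens, not from the original
# source) of the same idea: nltk.util.skipgrams over negation-marked tokens produces
# n-grams that may skip up to k tokens.
from nltk.util import skipgrams
from nltk.sentiment.util import mark_negation

toks = mark_negation("not good at all .".split())
print([' '.join(s) for s in skipgrams(toks, 2, 1)])
# includes pairs such as 'not good_NEG' and 'good_NEG all_NEG' (which skips 'at_NEG')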