def _load_word_lists(dir_path, splitter, stop_words, lemmatizer):
    """Return one list of lemmatized words per file found directly in *dir_path*."""
    try:
        # Only the first os.walk entry (the directory itself) is needed.
        _, _, file_names = next(os.walk(dir_path))
    except StopIteration:
        # os.walk yields nothing for a missing directory; keep the exception
        # type the previous implementation produced in that case.
        raise IndexError("no files found under %r" % dir_path)

    word_lists = []
    for file_name in file_names:
        # Context manager so the handle is closed as soon as the file is read.
        with open(os.path.join(dir_path, file_name)) as handle:
            content = handle.read().lower()
        lemmas = (lemmatizer.lemmatize(token)
                  for token in splitter.split(content)
                  if token not in stop_words and token.isalpha())
        word_lists.append([lemma for lemma in lemmas if 2 < len(lemma) < 20])
    return word_lists


def get_words_list(dataset):
    """Load the spam and ham mails of *dataset* as lists of lemmatized words.

    Files are read from ``data/enron/pre/<dataset>/spam/`` and
    ``data/enron/pre/<dataset>/ham/``. Each file is lower-cased, split on
    runs of non-word characters, stripped of English stop words and
    non-alphabetic tokens, lemmatized, and finally restricted to lemmas
    whose length is between 3 and 19 characters.

    Returns:
        A ``(spam_wl, ham_wl)`` tuple; each element is a list with one word
        list per file.

    Raises:
        IndexError: if one of the two directories does not exist.
    """
    spam_path = 'data/enron/pre/' + dataset + '/spam/'
    ham_path = 'data/enron/pre/' + dataset + '/ham/'

    # "\W+" rather than "\W*": a pattern that can match the empty string makes
    # re.split cut between every character on Python 3.7+.
    splitter = re.compile(r"\W+")
    english_stops = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    spam_wl = _load_word_lists(spam_path, splitter, english_stops, lemmatizer)
    ham_wl = _load_word_lists(ham_path, splitter, english_stops, lemmatizer)
    return spam_wl, ham_wl
def tokenize3(text):
    """Tokenize *text* and lemmatize every token as noun, then verb, then adjective.

    Returns the list of resulting lemmas, in token order.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    lemmas = []
    for raw_token in word_tokenize(text):
        as_noun = lemmatize(raw_token, NOUN)
        as_verb = lemmatize(as_noun, VERB)
        lemmas.append(lemmatize(as_verb, ADJ))
    return lemmas
def get_clean_text(list_filenames, path_to_file):
    """Read text files and return their content as lists of lemmatized words.

    Parameters
    ----------
    list_filenames : list of str
        File names without the ``.txt`` extension.
    path_to_file : str
        Directory containing the files, such that
        ``path_to_file/filename.txt`` is the file to open.

    Returns
    -------
    list of list of str
        One list per file holding the lower-cased, lemmatized words in
        their original order. Blank lines and empty lemmas are dropped.
    """
    wnl = WordNetLemmatizer()
    list_texts_as_words = []
    for filename in list_filenames:
        path_file = path_to_file + "/" + filename + ".txt"
        string_words = []
        with open(path_file) as f:
            # Stream the file line by line instead of buffering it whole.
            for line in f:
                line = line.strip()
                if not line:
                    continue
                for word in line.split(' '):
                    # Lemmatize once and reuse the result for the emptiness
                    # check (consecutive spaces produce empty words).
                    lemma = wnl.lemmatize(word.lower())
                    if lemma:
                        string_words.append(lemma)
        list_texts_as_words.append(string_words)
    return list_texts_as_words
def bow_score(hypothesis_list, text_list):
    """Score a hypothesis against a text by comparing every pair of lemmas.

    Stop words and punctuation tokens are removed from both lists *in
    place* (callers see the filtered lists afterwards). The score is the
    sum of ``lexical_compare(text_lemma, hypothesis_lemma)`` over all
    remaining pairs.

    Returns:
        The accumulated score, or 0 when either list is empty after
        filtering.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    stop_word_list = ['a', 'an', 'the', ',', '.', ';', ':']

    # Slice assignment keeps the in-place filtering of the caller's lists
    # without the quadratic delete-while-indexing loop.
    hypothesis_list[:] = [w for w in hypothesis_list if w not in stop_word_list]
    if len(hypothesis_list) == 0:
        return 0
    text_list[:] = [w for w in text_list if w not in stop_word_list]
    if len(text_list) == 0:
        return 0

    # Hypothesis lemmas do not depend on the text word: compute them once.
    hypothesis_lemmas = [wordnet_lemmatizer.lemmatize(w) for w in hypothesis_list]

    score = 0
    for word_text in text_list:
        lemma_text = wordnet_lemmatizer.lemmatize(word_text)
        for lemma_hypothesis in hypothesis_lemmas:
            # Debug trace kept from the original; print(x) with a single
            # argument behaves the same on Python 2 and 3.
            print(lemma_hypothesis)
            print(lemma_text)
            score += lexical_compare(lemma_text, lemma_hypothesis)
    print(str(score))
    return score
def negator(self, wordVec):
    """Lemmatize POS-tagged words and prefix words in a negated scope with ``NOT_``.

    Args:
        wordVec: iterable of ``(word, pos_tag)`` pairs.

    Returns:
        A list of ``(lemma, pos_tag)`` pairs. Words whose tag does not start
        with NN/JJ/VB/RB get an empty lemma. "not"/"n't" toggle the negation
        state, a punctuation token clears it, and any other lemma produced
        while negation is active is prefixed with ``NOT_``.
    """
    pos_by_prefix = {
        "NN": wordnet.NOUN,
        "JJ": wordnet.ADJ,
        "VB": wordnet.VERB,
        "RB": wordnet.ADV,
    }
    negation = False
    negated_doc = []
    lemmatizer = WordNetLemmatizer()
    for w, p in wordVec:
        wn_pos = pos_by_prefix.get(p[:2])
        if wn_pos is None:
            w_out = ""
        else:
            w_out = lemmatizer.lemmatize(w.lower(), pos=wn_pos)

        if w_out == "not" or w_out == "n't":
            negation = not negation
        elif w != '' and all(ch in string.punctuation for ch in w):
            # Test the raw token: for punctuation tags the lemma above is
            # always "", so checking the lemma never reset the scope.
            negation = False
        elif negation:
            w_out = "NOT_" + w_out
        negated_doc.append((w_out, p))
    return negated_doc
def _parse_flag(value):
    """Return True/False for a bool or the strings "True"/"False", else None."""
    if isinstance(value, bool):
        return value
    if value == "True":
        return True
    if value == "False":
        return False
    return None


def createCorpus(data, i, binaryX="False", stopWords=None, lemmatize="False",
                 tfidf="False", useidf="True"):
    """Split *data* into train/test texts and vectorize them as bag of words.

    Args:
        data: mapping ``key -> {filename: (text, label, ...)}``.
        i: container of keys whose documents form the test set; every other
            key goes to the training set.
        binaryX: flag forwarded as ``binary`` to ``CountVectorizer``.
        stopWords: forwarded as ``stop_words`` to ``CountVectorizer``.
        lemmatize: when set, every whitespace token is lemmatized as a verb.
        tfidf: selects ``TfidfVectorizer`` instead of ``CountVectorizer``.
        useidf: flag forwarded as ``use_idf`` to ``TfidfVectorizer``.

    The flag parameters accept real booleans as well as the legacy strings
    ``"True"`` / ``"False"``.

    Returns:
        ``(X_train, Y_train, X_test, Y_test)``, or ``None`` when *tfidf* is
        not a recognised flag value.
    """
    X_train, X_test, Y_train, Y_test = [], [], [], []

    do_lemmatize = _parse_flag(lemmatize) is True
    # One lemmatizer for the whole corpus instead of one per document.
    port = WordNetLemmatizer() if do_lemmatize else None

    for key in data:
        if key in i:
            texts, labels = X_test, Y_test
        else:
            texts, labels = X_train, Y_train
        for filename in data[key]:
            entry = data[key][filename]
            text = entry[0]
            if do_lemmatize:
                text = " ".join([port.lemmatize(k, "v") for k in text.split()])
            texts.append(text)
            labels.append(entry[1])

    use_tfidf = _parse_flag(tfidf)
    if use_tfidf is None:
        return None

    if use_tfidf:
        use_idf = _parse_flag(useidf)
        if use_idf is None:
            use_idf = bool(useidf)
        vectorizer = TfidfVectorizer(min_df=1, use_idf=use_idf)
    else:
        # Convert the flag to a real bool: the string "False" is truthy and
        # used to switch binary mode on unintentionally.
        binary = _parse_flag(binaryX)
        if binary is None:
            binary = bool(binaryX)
        vectorizer = CountVectorizer(min_df=1, binary=binary, stop_words=stopWords)

    X_train_ans = vectorizer.fit_transform(X_train)
    X_test_ans = vectorizer.transform(X_test)
    return X_train_ans, Y_train, X_test_ans, Y_test
def getBoW(self, instance):
    """Build bag-of-words features from the context tokens of *instance*.

    ``instance[3]`` holds the tokens and ``instance[2]`` the index of the
    target word, which is skipped together with English stop words. Nouns,
    verbs, adverbs and adjectives (by the first letter of their POS tag)
    are lemmatized and recorded as ``'bow(<lemma>)': True``.

    Returns:
        dict mapping feature names to ``True``.
    """
    bowFeatures = {}
    tokens = instance[3]
    wordnet_lemmatizer = WordNetLemmatizer()
    tagged = nltk.pos_tag(tokens)

    # Load the stop-word list once; it was previously re-read per token.
    english_stopwords = set(stopwords.words("english"))
    # First letter of the POS tag -> WordNet part of speech.
    wordnet_pos_by_initial = {"N": "n", "V": "v", "R": "r", "J": "a"}

    for i, tag in enumerate(tagged):
        if instance[2] == i or tag[0] in english_stopwords:
            continue
        wn_pos = wordnet_pos_by_initial.get(tag[1][:1])
        if wn_pos is not None:
            bowFeatures['bow(%s)' % wordnet_lemmatizer.lemmatize(tag[0], pos=wn_pos)] = True
    return bowFeatures
class TweetsLemmatizedVectorizer(TweetsTruncatedVectorizer):
    """TF-IDF vectorizer over the lemmatized Twitter timeline of each user."""

    def __init__(self):
        self.vectorizer = TfidfVectorizer(stop_words='english', min_df=5)  # , sublinear_tf=True)
        self.wordnet = WordNetLemmatizer()

    def _lemmatized_documents(self, users):
        """Return one string per user: all cleaned tweets, lemmatized word by word."""
        # Only user.twitter is used; an Instagram variant existed as
        # commented-out code and is not part of the features.
        documents = []
        for user in users:
            lemmatized = []
            for tweet in user.twitter:
                cleaned = ''.join(remove_tweet_noise(tweet.text))
                lemmatized.append(' '.join(self.wordnet.lemmatize(word) for word in cleaned.split()))
            # Join tweets with a space: an empty separator glued the last word
            # of one tweet to the first word of the next.
            documents.append(' '.join(lemmatized))
        return documents

    def fit_transform(self, users):
        """Fit the TF-IDF vocabulary on *users* and return their feature matrix."""
        return self.vectorizer.fit_transform(self._lemmatized_documents(users))

    def transform(self, users):
        """Return the feature matrix of *users* using the fitted vocabulary."""
        return self.vectorizer.transform(self._lemmatized_documents(users))
def possibility():
    """Assemble a random phrase from the module-level word lists.

    Picks a lemmatized verb and noun, then returns one of three shapes:
    ``<verb> <quantity> of the <noun>``, ``<verb> <a|an> <noun> <location>
    the <noun>`` or ``<verb> <a|an> <noun>``, depending on random draws
    compared with ``chance_quantity`` and ``chance_location``.
    """
    lemmatizer = WordNetLemmatizer()
    action = lemmatizer.lemmatize(verbs[random.randrange(0, len(verbs))])
    thing = lemmatizer.lemmatize(nouns[random.randrange(0, len(nouns))])
    determiner = "an" if thing[0] in ["a", "e", "i", "o", "u"] else "a"

    if random.randrange(0, 100) < chance_quantity:
        amount = quantity_adverbs[random.randrange(0, len(quantity_adverbs))]
        # Pluralize unless the noun already ends in s/y or the quantity word
        # is "numerous".
        skip_plural = thing.endswith("s") or thing.endswith("y") or amount == "numerous"
        if not skip_plural:
            thing += "s"
        return action + " " + amount + " of the " + thing

    if random.randrange(0, 100) < chance_location:
        place = location_adverbs[random.randrange(0, len(location_adverbs))]
        landmark = lemmatizer.lemmatize(nouns[random.randrange(0, len(nouns))])
        return action + " " + determiner + " " + thing + " " + place + " the " + landmark

    return action + " " + determiner + " " + thing
class LemmaTokenizer(object):
    """Callable tokenizer: lower-cases, keeps letters only, lemmatizes tokens."""

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemmas of *doc* that are longer than two characters."""
        doc = doc.lower()
        doc = re.sub("[^a-z]", " ", doc)  # replace punctuation with spaces
        # doc = re.sub("thanks", "thank", doc)
        # Lemmatize each token once and filter on that result.
        lemmas = (self.wnl.lemmatize(t) for t in word_tokenize(doc))
        return [lemma for lemma in lemmas if len(lemma) > 2]
def build_analyzer(self):
    """Wrap the parent analyzer so its tokens are filtered and lemmatized.

    Tokens ending in "ly" or with four characters or fewer are dropped;
    the rest are lemmatized as a verb and then as a noun. The returned
    callable yields lemmas lazily. Returns ``None`` if a ``Warning`` is
    raised while building the analyzer.
    """
    try:
        lemmatizer = WordNetLemmatizer()
        base_analyzer = super(ProcessCountVectorizer, self).build_analyzer()

        def lemmatizing_analyzer(doc):
            return (
                lemmatizer.lemmatize(lemmatizer.lemmatize(token, "v"), "n")
                for token in base_analyzer(doc)
                if not token.endswith("ly") and len(token) > 4
            )

        return lemmatizing_analyzer
    except Warning:
        pass
def pos_analysis(tags, stoplist):
    """Display the 50 most frequent noun, adjective and verb lemmas.

    Args:
        tags: sequence of ``(word, pos_tag)`` pairs.
        stoplist: words to exclude from the verb statistics.
    """
    wordnet_lemmatizer = WordNetLemmatizer()

    nouns = [wordnet_lemmatizer.lemmatize(word) for word, tag in tags if tag == 'NN']
    display_freq(nouns, 'Nouns', top=50)

    adjectives = [wordnet_lemmatizer.lemmatize(word) for word, tag in tags if tag == 'JJ']
    display_freq(adjectives, 'Adjectives', top=50)

    # Compare with == : ('VB') is just the string 'VB', so the former
    # "in ('VB')" was a substring test that also accepted 'V', 'B' and ''.
    verbs = [wordnet_lemmatizer.lemmatize(word, pos='v')
             for word, tag in tags
             if tag[:2] == 'VB' and word not in stoplist]
    display_freq(verbs, 'Verbs', top=50)
def _read_filtered_words(path, tokenizer, stop_words):
    """Return the lower-cased tokens of the file at *path* that are not stop words."""
    with open(path) as source:
        text = source.read().lower()
    return [w for w in tokenizer.tokenize(text) if w not in stop_words]


def _write_lemmas(path, words, lemmatizer, echo=False):
    """Write the lemma of every word to *path*, one per line; optionally echo the words."""
    with open(path, 'w') as target:
        for w in words:
            if echo:
                print(w + '\n')
            target.write(lemmatizer.lemmatize(w) + '\n')


def stopWordRemoval():
    """Filter and lemmatize title/description/content of every listed repository.

    Reads every second line of the file ``repos`` (``<owner>/<name>``
    entries; the last line is never processed), loads
    ``data/<name>/{title,description,content}.txt``, removes English stop
    words and writes the lemmas to ``filteredData/<name>/*.lst``. Content
    words are also echoed to stdout.
    """
    with open('repos', 'r') as f:
        lst = f.read().split('\n')

    # Loop invariants: build them once instead of once per word/repository.
    english_stopwords = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')
    wordnet_lem = WordNetLemmatizer()

    for i in range(0, len(lst) - 1, 2):
        name = lst[i].split("/")
        dummyFile = 'filteredData/' + name[1] + '/dummy.txt'
        dr = os.path.dirname(dummyFile)
        if not os.path.exists(dr):
            os.makedirs(dr)

        # Read all three inputs before writing anything, as before.
        filteredWordsTitle = _read_filtered_words(
            'data/' + name[1] + '/title.txt', tokenizer, english_stopwords)
        filteredWordsDesc = _read_filtered_words(
            'data/' + name[1] + '/description.txt', tokenizer, english_stopwords)
        filteredWordsData = _read_filtered_words(
            'data/' + name[1] + '/content.txt', tokenizer, english_stopwords)

        _write_lemmas('filteredData/' + name[1] + '/title.lst', filteredWordsTitle, wordnet_lem)
        _write_lemmas('filteredData/' + name[1] + '/description.lst', filteredWordsDesc, wordnet_lem)
        _write_lemmas('filteredData/' + name[1] + '/content.lst', filteredWordsData, wordnet_lem,
                      echo=True)
def tokenize4(text):
    """Tokenize and lemmatize *text*, keeping only lemmas found in the ``words`` corpus.

    Each token is lemmatized as noun, then verb, then adjective; the
    result is kept when it belongs to ``words.words()``.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    raw_tokens = word_tokenize(text)
    vocabulary = set(words.words())

    kept = []
    for raw_token in raw_tokens:
        lemma = lemmatize(lemmatize(lemmatize(raw_token, NOUN), VERB), ADJ)
        if lemma in vocabulary:
            kept.append(lemma)
    return kept
def __init__(self, data, label=None, *args, **kwargs):
    """Build a tagged sentence, normalising words when only tagged pairs are given.

    When *data* is non-empty and *label* is not supplied, *data* is
    assumed to be NLTK-style ``(word, tag)`` pairs: the tags become the
    labels and the words are lower-cased and lemmatized. Runs of
    punctuation are replaced by ``PUN`` (in words and tags) and runs of
    digits by ``NUM`` (in words).
    """
    lem = WordNetLemmatizer()
    if data and not label:
        # re.escape keeps ']', '\\', '^' and '-' literal inside the class.
        punct_run = re.compile(r'[{}]+'.format(re.escape(punctuation)))
        digit_run = re.compile(r'[0-9]+')
        # If you'd like to collapse the tag set, this is the place (e.g., tag[0]).
        label = [punct_run.sub('PUN', tag) for word, tag in data]
        # Single pass: the former second lemmatize/lower pass turned the
        # 'PUN' marker into 'pun' while 'NUM' stayed upper-case.
        data = [digit_run.sub('NUM', punct_run.sub('PUN', lem.lemmatize(word.lower())))
                for word, tag in data]
    super(TaggedSentence, self).__init__(data, label, *args, **kwargs)
def preprocessing(text):
    """Return the distinct dictionary lemmas of *text* joined by spaces.

    The text is lower-cased and tokenized; every token is lemmatized as
    noun, then verb, then adjective, and only lemmas present in the NLTK
    ``words`` corpus are kept. The order of the words in the result is
    unspecified (it comes from a set).

    Args:
        text: a text string, or UTF-8 encoded bytes.
    """
    lemmatizer = WordNetLemmatizer()
    worddict = set(nltk.corpus.words.words())
    words = text.lower().strip()
    # Decode only byte strings: calling .decode on a Python 3 str raises.
    if isinstance(words, bytes):
        words = words.decode('utf-8')
    wordset_n = set(lemmatizer.lemmatize(w, NOUN) for w in word_tokenize(words))
    wordset_v = set(lemmatizer.lemmatize(w, VERB) for w in wordset_n)
    wordset = set(lemmatizer.lemmatize(w, ADJ) for w in wordset_v)
    wordset = wordset & worddict
    return ' '.join(wordset)
def lemmatize(word, mode):
    """Lemmatize *word* as a noun (``mode == 'n'``) or verb (``mode == 'v'``).

    Any other *mode* returns *word* unchanged. If the lemmatizer fails, a
    message is printed and the empty string is returned.
    """
    try:
        wnl = WordNetLemmatizer()
        if mode == 'n' or mode == 'v':
            return wnl.lemmatize(word, mode)
        return word
    except Exception:
        # Exception instead of a bare except, so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        print("WordNet Lemmatizer failed.")
        return ''
def _load_deals(path, stop_words, lemmatizer):
    """Return one cleaned, lemmatized string per line of the file at *path*."""
    # Words (with apostrophes) or one of the kept symbols % ! ? ;
    token_pattern = re.compile(r"[\w']+|[%!?;]")
    deals = []
    with open(path, 'r') as f:
        for line in f:
            tokens = [word for word in token_pattern.findall(line) if word not in stop_words]
            deals.append(" ".join(lemmatizer.lemmatize(word) for word in tokens))
    return deals


def read_data():
    """Read and pre-process the good, bad and test deal files.

    Every line of ``good_deals.txt``, ``bad_deals.txt`` and
    ``test_deals.txt`` (under ``data_dir``) is tokenized, stripped of the
    stop words listed in ``stop_words.txt`` (plus "com") and lemmatized.
    Empty lines yield empty strings.

    Returns:
        ``[good_deals, bad_deals, test_deals]``, each a list of strings.
    """
    good_deals_file = os.path.join(data_dir, "good_deals.txt")
    bad_deals_file = os.path.join(data_dir, "bad_deals.txt")
    stop_words_file = os.path.join(data_dir, "stop_words.txt")
    test_deals_file = os.path.join(data_dir, "test_deals.txt")

    with open(stop_words_file, 'r') as f_stop_words:
        stop_words = set(word.strip() for word in f_stop_words)
    stop_words.add("com")

    wnl = WordNetLemmatizer()

    # All three files go through the same tokenizer. Previously the good and
    # test files used "[!?;]%" (a symbol followed by a literal '%') while the
    # bad file used "[%!?;]", so the classes were tokenized differently.
    good_deals = _load_deals(good_deals_file, stop_words, wnl)
    bad_deals = _load_deals(bad_deals_file, stop_words, wnl)
    test_deals = _load_deals(test_deals_file, stop_words, wnl)
    return [good_deals, bad_deals, test_deals]
def stem_wordnet(self, word):
    """Lemmatize *word*, as a verb when the tagger labels it ``VB*``.

    The POS tag of the first token of *word* decides: tags starting with
    "VB" use the verb lemmatizer, everything else the default one.
    """
    wnl = WordNetLemmatizer()
    # obtain the word class
    tagged = nltk.pos_tag(nltk.word_tokenize(word))
    # Guard against an empty tag list (no tokens) before indexing it; the
    # verb tags differ in detail but all start with "VB".
    if tagged and tagged[0][1].startswith('VB'):
        return wnl.lemmatize(word, 'v')
    return wnl.lemmatize(word)
def find_replacements(sentence, lwindow, rwindow, add=False): """ This function would be used to find replacements for the word present inside the sentence. @sentence: Actual sentence in which word is present. @lwindow : Number of context words in the left of the replacement. @rwindow : Number of context words in the right of the replacement. @add : Whether we are going to add the vectors. Otherwise default to multiply. """ # Remove the START and END temporarily and tag the data. word = sentence[sentence.index('_START_') + 7 : sentence.index('_END_')] word_index = nltk.word_tokenize(sentence).index("_START_" + word + "_END_") t_sentence = sentence[:sentence.index('_START_')] + word + sentence[sentence.index('_END_') + 5:] # Tag the sentence and then bring the START and END back. tagged_sentence = nltk.pos_tag(nltk.word_tokenize(t_sentence)) #print sentence, tagged_sentence wnl = WordNetLemmatizer() word_postag = get_wordnet_pos(tagged_sentence[word_index][1]) if word_postag: word = wnl.lemmatize(word, pos=word_postag) tagged_sentence[word_index] = ["_START_" + word + "_END_", tagged_sentence[word_index][1]] # Remove all the words, whose tags are not important and also # get rid of smaller words. imp_words = filter(lambda x: len(x[0]) > 2, get_imp_words(tagged_sentence)) #print imp_words final_list = [] for i, x in enumerate(imp_words): if x[0].startswith("_START_"): index = i x[0] = x[0][7:x[0].index("_END_")] final_list.append("_START_" + x[0].lower() + "_" + x[1][0].lower() + "_END_") word = word.lower() #+ "_" + x[1][0].lower() #print word else: # Lemmatize all the words. word_postag = get_wordnet_pos(x[1]) temp = x[0] if word_postag: temp = wnl.lemmatize(x[0], pos=word_postag) final_list.append(temp.lower()) # + "_" + x[1][0].lower()) try: return find_replacements_helper(final_list, word, index, int(lwindow), int(rwindow) + 1, add) except Exception: return "NONE"
def events_filter(title, lang):
    """Clean, tokenize and lemmatize a news title into a list of keywords.

    Words are reduced to their dictionary form so that an event is
    represented in a standard way.

    Args:
        title: the news title.
        lang: ``'english'`` or ``'spanish'``; selects the stop-word list and
            the lemmatization strategy (WordNet with POS tags for English,
            the ``SP_LEMMAS`` dictionary for Spanish). Any other value only
            removes that language's stop words.

    Returns:
        List of ``clean_word(token)`` results.
    """
    # Regex adapted from nltk documentation
    pattern = (
        r"(?x)"                        # set flag to allow verbose regexps
        r"(?:[A-Z])(?:\.[A-Z])+\.?"    # abbreviations, e.g. U.S.A.
        r"|\w+(?:-\w+)*"               # words with optional internal hyphens
        r"|\$?\d+(?:\.\d+)?%?"         # currency and percentages, e.g. $12.40, 82%
    )
    # Tokenize title according to the regex pattern.
    tokens = nltk.regexp_tokenize(title, pattern)

    # Remove stopwords. The list is loaded once instead of once per token.
    stop_words = set(sw.words(lang))
    tokens = [lowered for lowered in (w.lower() for w in tokens) if lowered not in stop_words]

    if lang == "english":
        wnl = WordNetLemmatizer()
        # Tag words (noun, adjective, verb or adverb); makes lemmatization more accurate.
        pos_toks = nltk.pos_tag(tokens)
        # Penn Treebank tag -> tag understood by lemmatize().
        wordnet_tag = {
            'NN': 'n', 'NNS': 'n', 'NNP': 'n', 'NNPS': 'n',
            'JJ': 'a', 'JJR': 'a', 'JJS': 'a',
            'VB': 'v', 'VBD': 'v', 'VBG': 'v', 'VBN': 'v', 'VBP': 'v', 'VBZ': 'v',
            'RB': 'r', 'RBR': 'r', 'RBS': 'r',
        }
        lemmatized = []
        for token, pos_tok in zip(tokens, pos_toks):
            if pos_tok[1] in wordnet_tag:
                lemmatized.append(wnl.lemmatize(token, wordnet_tag[pos_tok[1]]))
            else:
                lemmatized.append(wnl.lemmatize(token))
        tokens = lemmatized
    elif lang == "spanish":
        # Dictionary lookup; words missing from the dictionary stay unchanged.
        tokens = [SP_LEMMAS[token] if token in SP_LEMMAS else token for token in tokens]

    return [clean_word(tok) for tok in tokens]
def __wn_lemmatize(self, lemma):
    """
    Lemmatize a (string, pos) pair with WordNetLemmatizer.

    Always returns a (string, pos) pair. When the tag is not one of
    'a', 'n', 'r', 'v' it is ignored for lemmatization but still returned.
    """
    text, wn_tag = lemma
    lemmatizer = WordNetLemmatizer()
    # Pass the tag along only when the lemmatizer can use it.
    tag_args = (wn_tag,) if wn_tag in ('a', 'n', 'r', 'v') else ()
    return (lemmatizer.lemmatize(text, *tag_args), wn_tag)
def __lemmatizeTuples(self, tuples):
    """Lemmatize one triple, or a list of triples, of space-separated phrases.

    In each triple the first phrase is lemmatized word by word as a verb
    and the second and third as nouns; all words are lower-cased first.
    A list input yields a list of triples, any other input a single triple.
    """
    lemmatizer = WordNetLemmatizer()

    def lemmatize_phrase(phrase, pos):
        return ' '.join([lemmatizer.lemmatize(token.lower(), pos) for token in phrase.split(' ')])

    def lemmatize_triple(triple):
        return (lemmatize_phrase(triple[0], 'v'),
                lemmatize_phrase(triple[1], 'n'),
                lemmatize_phrase(triple[2], 'n'))

    if isinstance(tuples, list):
        # Multiple tuples
        return [lemmatize_triple(triple) for triple in tuples]
    return lemmatize_triple(tuples)
def dogify(inp):
    """Turn *inp* into a "so <noun> / much <adj> / very <verb> / wow" text.

    Tokens shorter than four characters are ignored. For each of the
    categories NN, JJ and VB* one of the most frequent tokens is picked
    at random and lemmatized; empty categories contribute no line.
    """
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(inp)

    noun_tokens, adjective_tokens, verb_tokens = [], [], []
    for token, tag in nltk.pos_tag(tokens):
        if len(token) < 4:
            continue
        if tag == "NN":
            noun_tokens.append(token)
        elif tag == "JJ":
            adjective_tokens.append(token)
        elif "VB" in tag:
            verb_tokens.append(token)

    def pick_most_frequent(candidates):
        # Count occurrences, then choose randomly among the tied leaders
        # (kept in sorted order); an empty input yields "".
        occurrences = {}
        for candidate in candidates:
            occurrences[candidate] = occurrences.get(candidate, 0) + 1
        if occurrences:
            top = max(occurrences.values())
            leaders = sorted(c for c in occurrences if occurrences[c] == top)
        else:
            leaders = [""]
        return leaders[random.randint(0, len(leaders) - 1)]

    noun = lemmatizer.lemmatize(pick_most_frequent(noun_tokens))
    adj = lemmatizer.lemmatize(pick_most_frequent(adjective_tokens))
    verb = lemmatizer.lemmatize(pick_most_frequent(verb_tokens))

    parts = []
    if len(noun):
        parts.append("so " + noun + "\n")
    if len(adj):
        parts.append("much " + adj + "\n")
    if len(verb):
        parts.append("very " + verb + "\n")
    parts.append("wow")
    return "".join(parts)
def lemmatize_text_as_list(self, text_alpha_no_punct_stopword_list):
    """Lemmatize every word as verb, noun, adjective and adverb, in that order.

    Returns a new list with one fully lemmatized word per input word.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    fully_lemmatized = []
    for word in text_alpha_no_punct_stopword_list:
        # Each pass feeds the result of the previous part of speech.
        for part_of_speech in ('v', 'n', 'a', 'r'):
            word = lemmatize(word, pos=part_of_speech)
        fully_lemmatized.append(word)
    return fully_lemmatized
def tokenize5(text):
    """Strip punctuation from *text*, tokenize it and lemmatize every token.

    Byte strings (``str``) and ``unicode`` strings use their respective
    ``translate`` call to drop punctuation; tokens are then lemmatized as
    noun, verb and adjective. Any other input type returns ``None``.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    translate_table = dict((ord(char), None) for char in string.punctuation)

    if type(text) == str:
        stripped = text.translate(None, string.punctuation)  # remove punctuation
    elif type(text) == unicode:
        stripped = text.translate(translate_table)
    else:
        return None

    return [lemmatize(lemmatize(lemmatize(token, NOUN), VERB), ADJ)
            for token in word_tokenize(stripped)]
def generate_captions_and_comments():
    """Load video data and return scores with the matching lemmatized captions.

    Reads ./data/big_data_approx.json, and for every video whose
    "captions" entry is not None concatenates the lemmatized words of all
    caption texts into one space-separated string.

    Returns:
        (video_num_comments_cut, video_captions): the "score" values of the
        videos that have captions, and their combined caption strings.
    """
    with open('./data/big_data_approx.json') as json_file:
        video_data = json.load(json_file)
    # Transpose the (score, captions) rows into one array of scores and one of captions.
    video_num_comments, video_captions = np.array([
        (video_datum["score"], video_datum["captions"]) for _,video_datum in video_data.iteritems()
    ]).T
    # Define a stemmer and lemmatizer for use with our captions
    # (the stemmer is only referenced by the commented-out line below).
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    combined_video_captions = []
    video_num_comments_cut = []
    for caption_data_list,num_comments in zip(video_captions,video_num_comments):
        text = ""
        if caption_data_list is not None:
            video_num_comments_cut.append(num_comments)
            for caption_data in caption_data_list:
                if caption_data is not None and "text" in caption_data:
                    for word in caption_data["text"].split():
                        #text += (stemmer.stem(word)+" ")
                        text += (lemmatizer.lemmatize(word)+" ")
            # Drop the trailing space added after the last word.
            combined_video_captions.append(text[:-1])
    video_captions = combined_video_captions
    return (video_num_comments_cut, video_captions)
def stemming(words_l, type="PorterStemmer", lang="english", encoding="utf8"):
    """Stem or lemmatize every word of *words_l* and encode the results.

    Args:
        words_l: words to process.
        type: one of "PorterStemmer", "SnowballStemmer", "LancasterStemmer"
            or "WordNetLemmatizer". ``False`` or any unsupported value
            returns *words_l* unchanged.
        lang: language passed to ``SnowballStemmer``.
        encoding: encoding applied to every processed word.

    Returns:
        *words_l* itself when nothing is done, otherwise a new list of
        encoded words.
    """
    if type is False:
        return words_l

    if type == "PorterStemmer":
        transform = PorterStemmer().stem
    elif type == "SnowballStemmer":
        transform = SnowballStemmer(lang).stem
    elif type == "LancasterStemmer":
        transform = LancasterStemmer().stem
    elif type == "WordNetLemmatizer":
        # TODO: context
        transform = WordNetLemmatizer().lemmatize
    else:
        return words_l

    return [transform(word).encode(encoding) for word in words_l]
class Mapper(object):
    """Word-count mapper: emits ``(lemma, 1)`` for every non stop word."""

    def __init__(self):
        # self.params is not defined in this class; presumably it is provided
        # by the surrounding framework -- TODO confirm.
        if 'stopwords' in self.params:
            with open(self.params['stopwords'], 'r') as excludes:
                self._stopwords = set(line.strip() for line in excludes)
        else:
            self._stopwords = None
        self.lemmatizer = WordNetLemmatizer()

    def __call__(self, key, value):
        """Yield ``(word, 1)`` for each normalized token of *value* that is not a stop word."""
        for word in self.tokenize(value):
            if word not in self.stopwords:
                yield word, 1

    def normalize(self, word):
        """Return the lemma of the lower-cased *word*."""
        word = word.lower()
        return self.lemmatizer.lemmatize(word)

    def tokenize(self, sentence):
        """Yield the normalized form of every token of *sentence*."""
        for word in wordpunct_tokenize(sentence):
            yield self.normalize(word)

    @property
    def stopwords(self):
        """Stop words in use; falls back to the NLTK English list when none are loaded."""
        if not self._stopwords:
            # Store a set: the membership test in __call__ runs once per token.
            self._stopwords = set(nltk.corpus.stopwords.words('english'))
        return self._stopwords
def search_posts(phrase, engine):
    """Find threads whose title matches *phrase* and return their usable posts.

    Words of *phrase* that are not English stop words and have at least
    three characters are lemmatized and combined into a regular expression
    that is matched against ``threads.title_lower``.

    Args:
        phrase: free-text search phrase.
        engine: database connection/engine accepted by ``pandas.read_sql``.

    Returns:
        ``None`` when no word is usable, no thread matches, or fewer than 60
        valid posts are found; otherwise ``(valid, found)`` where *valid* is
        the frame of posts with at least 10 words and no "bot" token, and
        *found* is the matching threads indexed by ``link_id``.
    """
    lemmatizer = WordNetLemmatizer()
    english_stopwords = set(stopwords.words('english'))  # loaded once, not per word
    # re.escape: tokens such as "..." or "c++" must match literally instead of
    # being interpreted as regex syntax by the database.
    words = ["(^|[^a-z])" + re.escape(lemmatizer.lemmatize(word))
             for word in word_tokenize(phrase)
             if word not in english_stopwords and len(word) >= 3]
    if len(words) == 0:
        return None

    params = {'phrase': "|".join(words)}
    query = ["SELECT link_id, url, title FROM threads",
             "WHERE title_lower ~ %(phrase)s"]
    found = pd.read_sql(" ".join(query), engine, params=params)
    if len(found['link_id']) == 0:
        return None

    # The ids are still inlined as literals; doubling embedded quotes keeps a
    # stray "'" in an id from breaking out of its literal.
    # NOTE(review): a bound parameter would be preferable if the driver
    # supports sequence binding.
    link_ids = ', '.join(found['link_id'].apply(
        lambda lid: "'" + lid.replace("'", "''") + "'"))
    query = ["SELECT clean_body as body, affil, link_id FROM cleaned",
             "WHERE link_id IN (" + link_ids + ")"]
    data = pd.read_sql(" ".join(query), engine)

    valid = data[data['body'].apply(
        lambda text: len(text.split()) >= 10
        and not bool(re.search("[^a-z]bot[^a-z]", text)))]
    if valid.shape[0] < 60:
        return None
    return valid, found.set_index('link_id')
import nltk
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
sentence = "He was running and eating at same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations = "?:!.,;"

# Build a filtered list instead of calling remove() while iterating over the
# same list, which skips the element that follows each removed token.
sentence_words = [word for word in nltk.word_tokenize(sentence) if word not in punctuations]
sentence_words

print("{0:20}{1:20}".format("Word", "Lemma"))
for word in sentence_words:
    # print ("{0:20}{1:20}".format(word,wordnet_lemmatizer.lemmatize(word)))
    #
    # With the call above (no pos argument) most words come back unchanged,
    # because they are lemmatized without context. The context is the part
    # of speech (POS), supplied through the pos parameter of
    # wordnet_lemmatizer.lemmatize.
    print("{0:20}{1:20}".format(word, wordnet_lemmatizer.lemmatize(word, pos='v')))

print(wordnet_lemmatizer.lemmatize("was", pos='v'))
class Preprocess:
    """Text-cleaning steps applied to ``self.text``.

    Every method returns the transformed text (or data derived from it)
    and leaves ``self.text`` unchanged.
    """

    def __init__(self, text):
        self.text = text
        self.STOPWORDS = set(stopwords.words('english'))
        self.spell = SpellChecker()
        self.p = inflect.engine()
        self.nlp = en_core_web_sm.load()
        # self.nlp = spacy.load('en_core_web_md')
        self.model = api.load("glove-twitter-25")
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()

    def strip_html_tags(self):
        """remove html tags from text"""
        soup = BeautifulSoup(self.text, "html.parser")
        stripped_text = soup.get_text(separator=" ")
        return stripped_text

    def remove_accented_chars(self):
        """remove accented characters from text, e.g. café"""
        text = unidecode.unidecode(self.text)
        return text

    # Disabled in the original source:
    # def expand_contractions(self, text):
    #     """expand shortened words, e.g. don't to do not"""
    #     text = list(cont.expand_texts([text], precise=True))[0]
    #     return text

    def pos_tagging(self):
        """Return the POS-tagged tokens of the text."""
        word_tokens = word_tokenize(self.text)
        return pos_tag(word_tokens)

    def text_lowercase(self):
        """Return the text in lower case."""
        return self.text.lower()

    def text_uppercase(self):
        """Return the text in upper case."""
        return self.text.upper()

    def remove_numbers(self):
        """Return the text with every run of digits removed."""
        result = re.sub(r'\d+', '', self.text)
        return result

    def convert_number(self):
        """Return the text with whitespace-separated digit words spelled out."""
        new_string = []
        for word in self.text.split():
            if word.isdigit():
                # Use the inflect engine stored on the instance; the bare
                # name `p` is not defined in this method.
                new_string.append(self.p.number_to_words(word))
            else:
                new_string.append(word)
        return ' '.join(new_string)

    def remove_punctuation(self):
        """Return the text without any ``string.punctuation`` character."""
        translator = str.maketrans('', '', string.punctuation)
        return self.text.translate(translator)

    def remove_whitespace(self):
        """Return the text with whitespace runs collapsed to single spaces."""
        return " ".join(self.text.split())

    def remove_stopwords(self):
        """custom function to remove the stopwords"""
        return " ".join([word for word in str(self.text).split() if word not in self.STOPWORDS])

    def stem_words(self):
        """Return the text with every word stemmed."""
        return " ".join([self.stemmer.stem(word) for word in self.text.split()])

    def lemmatize_words(self):
        """Return the text with every word lemmatized."""
        return " ".join([self.lemmatizer.lemmatize(word) for word in self.text.split()])

    def remove_freqwords(self, df, column_name):
        """Remove from the text the 10 most frequent words of ``df[column_name]``."""
        cnt = Counter()
        # Local loop variable and the requested column: the previous loop
        # used self.text as loop variable (overwriting the instance text) and
        # ignored column_name in favour of a hard-coded column.
        for row_text in df[column_name].values:
            for word in row_text.split():
                cnt[word] += 1
        FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])
        return " ".join([word for word in str(self.text).split() if word not in FREQWORDS])

    def remove_emoji(self):
        """Return the text without characters in the emoji ranges below."""
        # NOTE(review): the last range (U+24C2-U+1F251) is very wide and
        # covers far more than emoji -- confirm this is intended.
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"  # emoticons
                                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                   u"\U00002702-\U000027B0"
                                   u"\U000024C2-\U0001F251"
                                   "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', self.text)

    def remove_emoticons(self):
        """Return the text without any emoticon matched by the EMOTICONS patterns."""
        emoticon_pattern = re.compile(u'(' + u'|'.join(k for k in EMOTICONS) + u')')
        return emoticon_pattern.sub(r'', self.text)

    def convert_emoticons(self):
        """Return the text with emoticons replaced by their underscore-joined description."""
        # Accumulate the substitutions: restarting from self.text on each
        # iteration kept only the last emoticon's replacement (and left the
        # result undefined when EMOTICONS was empty).
        text = self.text
        for emot in EMOTICONS:
            text = re.sub(u'(' + emot + ')',
                          "_".join(EMOTICONS[emot].replace(",", "").split()),
                          text)
        return text

    def remove_urls(self):
        """Return the text without http(s):// and www. URLs."""
        url_pattern = re.compile(r'https?://\S+|www\.\S+')
        return url_pattern.sub(r'', self.text)

    def remove_html(self):
        """Return the text without ``<...>`` tags."""
        html_pattern = re.compile('<.*?>')
        return html_pattern.sub(r'', self.text)

    def correct_spellings(self):
        """Return the text with words unknown to the spell checker replaced by its correction."""
        corrected_text = []
        misspelled_words = self.spell.unknown(self.text.split())
        for word in self.text.split():
            if word in misspelled_words:
                corrected_text.append(self.spell.correction(word))
            else:
                corrected_text.append(word)
        return " ".join(corrected_text)

    def NER(self):
        """Return a dict mapping each entity text found in the text to its label."""
        doc = self.nlp(self.text)
        entity_label_map = dict()
        for entity in doc.ents:
            # The entity text is `entity.text`; `entity.self.text` raised
            # AttributeError.
            entity_label_map[entity.text] = entity.label_
        return entity_label_map
def get_lemmatized_text(corpus):
    """Return every review of *corpus* with its whitespace-separated words lemmatized."""
    from nltk.stem import WordNetLemmatizer
    lemmatize = WordNetLemmatizer().lemmatize
    lemmatized_reviews = []
    for review in corpus:
        lemmatized_reviews.append(' '.join(map(lemmatize, review.split())))
    return lemmatized_reviews
from nltk.stem import WordNetLemmatizer

# Lemmatizing is preferred over stemming here because it returns an actual
# word with meaning; it can also group many word forms together.
lemmatizer = WordNetLemmatizer()

# (word, extra keyword arguments) pairs, printed in this order.
for _word, _options in (('cats', {}), ('better', {}), ('best', {'pos': 'a'})):
    print(lemmatizer.lemmatize(_word, **_options))
lemma = WordNetLemmatizer()
# NOTE(review): the path starts with a space (" news.csv") -- confirm this is
# the real file name.
news = pd.read_csv(r" news.csv")
data = news.drop(['Unnamed: 0'], axis=1)

# Keep letters only, then lower-case. The previous class '[^a-zAZ]' treated
# every capital except 'A' and 'Z' as a non-letter, blanking it before
# lower-casing. Whole-column assignment also avoids chained `.iloc[i] = ...`
# writes, which pandas does not guarantee to reach the frame.
non_letters = re.compile('[^a-zA-Z]')
data['text'] = [non_letters.sub(' ', raw).lower() for raw in data['text']]
data['title'] = [non_letters.sub(' ', raw).lower() for raw in data['title']]

TEXTdata = []
TITLEdata = []
for i in range(len(news)):
    textword = word_tokenize(data['text'].iloc[i])
    titleword = word_tokenize(data['title'].iloc[i])
    # Each kept lemma is preceded by a space, as before.
    text = "".join(" " + lemma.lemmatize(w) for w in textword if w not in stop_words)
    title = "".join(" " + lemma.lemmatize(k) for k in titleword if k not in stop_words)
    TEXTdata.append(text)
    TITLEdata.append(title)

# Vectorisation of data to produce training data and labels
# FAKE -> 1, REAL -> 0; rows with any other label are skipped.
label_to_target = {'FAKE': 1, 'REAL': 0}
Y = [label_to_target[label] for label in data['label'] if label in label_to_target]
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import numpy as np
import pickle
import enchant
from torch.autograd import Variable
from string import digits

#### Please download nltk resources before using this file.
# The downloads and prints below run every time this module is imported.
nltk.download("stopwords")
print(stopwords.words('english'))
nltk.download('wordnet')
wordnet_lematizer = WordNetLemmatizer()
print(wordnet_lematizer.lemmatize('good'))
nltk.download('averaged_perceptron_tagger')
print(nltk.pos_tag(['do','yes']))


class Myarticles(data.Dataset):
    # Dataset of articles listed in a CSV file, with texts in a folder.
    # NOTE(review): only __init__ is visible here; Creat_article_list is
    # called but defined elsewhere.

    def __init__(self, csvfile_path, txt_folder_path, glove_path='/Users/duanyiqun/Downloads/Textcls/glove.6B', validation=False):
        # csvfile_path: passed to Creat_article_list to build the article list.
        # txt_folder_path: stored as self.folderpath.
        # glove_path: stored as self.glove_path (the GloVe initialisation
        #   calls below are commented out).
        # validation: selects the validation slice instead of the training slice.
        self.glove_path =glove_path
        #self.glove_init()
        #self.w2v = self.init_word2vec()
        self.articleor = self.Creat_article_list(csvfile_path)
        if validation:
            # Entries 200-249 form the validation split.
            self.articles = self.articleor[200:250]
        else:
            # The first 200 entries form the training split.
            self.articles = self.articleor[0:200]
        #self.articles = self.articleor[0:10]
        self.folderpath = txt_folder_path
# Lower-case every document, then split it into tokens.
text = [i.lower() for i in text]
stop = set(stopwords.words('english'))
text = [word_tokenize(i) for i in text]

tweets = []
text1 = []
# NOTE(review): the '.' in this pattern is unescaped, so it matches any
# character between the digit groups, not only a decimal point.
number_like = re.compile(r'^-?[0-9]+(.[0-9]+)?$')
for i in text:
    # Drop stop words and number-like tokens in a single pass.
    i = [w for w in i if w not in stop and not number_like.search(w)]
    text1.append(i)

wordnet_lemmatizer = WordNetLemmatizer()
lemmatized_token = []
for sent in text1:
    # Lemmatization to convert tokens to canonical form: default pass first,
    # then a verb pass on its result.
    tweets = [wordnet_lemmatizer.lemmatize(wordnet_lemmatizer.lemmatize(token), pos='v')
              for token in sent]
    lemmatized_token.append(tweets)

##########################################################################
# COMMENTED INTENTIONALLY                                                #
# Code to find top 100 words as per frequency to check for unwanted and  #
# redundant words. Added stop words based on this analysis.              #
##########################################################################
#from collections import Counter
#count=[]
#for i in lemmatized_token:
#    for j in i:
#        count.append(j)
#count
# Normalise the raw document text: newlines become spaces, apostrophes are
# removed, and the text is split on runs of non-word characters.
outputFile = outputFile.replace('\n', ' ')
outputFile = regex.sub("'", "", outputFile)
outputFile = regex.split('\W+', outputFile)
# Register the document with its class and bump that class's counter.
# training_data_lst[len(training_data_lst) - 1] is the entry just appended.
training_data_lst.append({'doc': outputFile, 'class': class_name})
classes_quan[training_data_lst[len(training_data_lst) - 1]
             ['class']] = classes_quan[training_data_lst[
                 len(training_data_lst) - 1]['class']] + 1
for j in range(
        0, len(training_data_lst[len(training_data_lst) - 1]['doc'])):
    # Replace each word in place by the lemma of its lower-cased form.
    training_data_lst[len(training_data_lst) -
                      1]['doc'][j] = lemmatizer.lemmatize(
                          training_data_lst[len(training_data_lst) -
                                            1]['doc'][j].lower())
    if (not (training_data_lst[len(training_data_lst) - 1]['doc'][j] in
             stopWord)
        ) and (not (
            training_data_lst[len(training_data_lst) - 1]['doc'][j] in
            tokens_document)) and (
                len(training_data_lst[len(training_data_lst) -
                                      1]['doc'][j]) > 1
            ):  # keep lemmas that are not stop words, not yet recorded, and longer than one character
        tokens_document.append(
            training_data_lst[len(training_data_lst) - 1]['doc'][j])
# Distinct lemmas of the current document.
doc = list(
    set(training_data_lst[len(training_data_lst) - 1]['doc']))
from nltk.stem import WordNetLemmatizer

tester = 1
lemmatizer = WordNetLemmatizer()
documents = df_refined

# Tokenize words (the 'punkt' resource is fetched before tokenizing).
from nltk.tokenize import word_tokenize
from nltk import download

download('punkt')
documents_tokenized = list(map(word_tokenize, documents))

# Lemmatize every token of every document; `lemmatize` is called with its
# default part of speech.
documents_tokenized_lemmatized = [
    list(map(lemmatizer.lemmatize, tokens)) for tokens in documents_tokenized
]

# Tag the tokens that fall inside a negation scope.
from nltk.sentiment.util import mark_negation

documents_tokenized_lemmatized_negated = list(
    map(mark_negation, documents_tokenized_lemmatized)
)
ready_corpus = documents_tokenized_lemmatized_negated

download('opinion_lexicon')
from nltk.corpus import opinion_lexicon

# We consider only sentiment words. opinion_lexicon already includes
# misspelled sentiment words, so the enchant library is not used this time.
sentiment_words = opinion_lexicon.words()
# Negated counterpart of every sentiment word, built by appending '_NEG'.
sentiment_words_negated = [word + '_NEG' for word in sentiment_words]
# In[4]:

# Remove stop words from each of the four word lists.
no_stop_words1, no_stop_words2, no_stop_words3, no_stop_words4 = (
    [word for word in word_list if word not in stop_words]
    for word_list in (words1, words2, words3, words4)
)

# In[5]:

# Lemmatize first: lemmatizing does not only cut off the end of a word but
# reduces it to its base form.
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
lemmatized_words1, lemmatized_words2, lemmatized_words3, lemmatized_words4 = (
    [lemmatizer.lemmatize(x) for x in word_list]
    for word_list in (
        no_stop_words1, no_stop_words2, no_stop_words3, no_stop_words4)
)

# In[6]:

# Porter-stem the lemmatized words.
from nltk.stem.porter import *

stemmer = PorterStemmer()
stemmed_words1, stemmed_words2, stemmed_words3, stemmed_words4 = (
    [stemmer.stem(x) for x in word_list]
    for word_list in (
        lemmatized_words1, lemmatized_words2,
        lemmatized_words3, lemmatized_words4)
)

# In[7]:
print(POS_tag)

# nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
adjective_tags = ['JJ', 'JJR', 'JJS']

# Entries of POS_tag are indexed as (token, tag). Tokens carrying an
# adjective tag are lemmatized as adjectives; all others use the
# lemmatizer's default part of speech (noun).
lemmatized_text = [
    str(wordnet_lemmatizer.lemmatize(tagged[0], pos="a"))
    if tagged[1] in adjective_tags
    else str(wordnet_lemmatizer.lemmatize(tagged[0]))
    for tagged in POS_tag
]

print("Text tokens after lemmatization of adjectives and nouns: \n")
print(lemmatized_text)

# Re-tag the lemmatized tokens.
POS_tag = nltk.pos_tag(lemmatized_text)
print("Lemmatized text with POS tags: \n")
print(POS_tag)

stopwords = []
wanted_POS = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS', 'VBG', 'FW']
def tokenize(text):
    '''Process the text into cleaned tokens.

    URLs, e-mail addresses and IP addresses are replaced by placeholders,
    every character that is not a letter a-z/A-Z is turned into a space, the
    text is lower-cased and split into tokens, and each surviving token is
    lemmatized (first with the default part of speech, then as a verb).
    Stop words, the placeholders themselves and tokens of two characters or
    fewer are dropped.

    Args:
        text (str): a message in text form

    Returns:
        clean_tokens (list): list of words after processing
    '''
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    emails_regex = r'[a-zA-Z0-9+_\-\.]+@[0-9a-zA-Z][.-0-9a-zA-Z]*.[a-zA-Z]+'
    ips_regex = r'(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})'
    # A set gives constant-time membership tests in the token loop below.
    stopword_set = set(stopwords.words('english'))
    placeholders = {'urlplaceholder', 'emailplaceholder', 'ipplaceholder'}

    # Remove extra parentheses for better URL detection.
    text = text.replace("(", "").replace(")", "")

    # Collect all URLs / e-mails / IPs before any of them is rewritten.
    detected_urls = re.findall(url_regex, text)
    # The e-mail pattern can run past a space; keep only the first chunk.
    detected_emails = [
        email.split()[0] for email in re.findall(emails_regex, text)
    ]
    detected_ips = re.findall(ips_regex, text)

    # The detected strings are literal text, not regex patterns: use
    # str.replace so that characters such as '?', '+', '.' or '[' inside a
    # URL can neither change what is matched nor raise re.error.
    for url in detected_urls:
        text = text.replace(url, 'urlplaceholder')
    for email in detected_emails:
        text = text.replace(email, 'emailplaceholder')
    for ip in detected_ips:
        text = text.replace(ip, 'ipplaceholder')

    # Remove numbers and special characters to keep the vocabulary small.
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    tokens = word_tokenize(text.lower())
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        if tok in stopword_set or tok in placeholders or len(tok) <= 2:
            continue
        # Stemming (PorterStemmer) is deliberately not applied, for better
        # word recognition in the app.
        clean_tokens.append(
            lemmatizer.lemmatize(lemmatizer.lemmatize(tok.strip()), pos='v'))

    return clean_tokens
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Stem and lemmatize the same word to compare the two results side by side.
stemmer_output = PorterStemmer()
stemmed_word = stemmer_output.stem('happiness')
print(stemmed_word)

lemmatizer_output = WordNetLemmatizer()
lemmatized_word = lemmatizer_output.lemmatize('happiness')
print(lemmatized_word)
# Replace internal whitespace with underscores and remove all
# non-alphabet characters (numbers, punctuation).
# NOTE(review): the range `A-z` (lower-case z) also admits the characters
# [ \ ] ^ _ and the backtick, and `+` inside the class is a literal plus.
# It looks like a typo for `A-Z`, but the counts recorded below were
# produced with this exact pattern, so it is kept unchanged -- confirm
# before tightening it.
cleaned_tags = [
    re.sub(r"\s+", '_', re.sub(r"[^a-zA-z\s+]+", '', t.lower()).strip())
    for t in tags
]
tag_df['Cleaned_Tag'] = cleaned_tags
# print(...) with a single argument behaves the same on Python 2 and 3.
print(len(set(cleaned_tags)))
# 9151 post-cleaning

# # Stemming -- computationally quicker, but lemma
# # is preferable due to higher level of sophistication.
# porter_stemmer = PorterStemmer()
# stemmed_tags = [porter_stemmer.stem(t)
#                 for t in cleaned_tags]
# tag_df['Stemmed_Tag'] = stemmed_tags
# print len(set(stemmed_tags))
# # 7517 post-stemming

# Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()
lemma_tags = [wordnet_lemmatizer.lemmatize(t) for t in cleaned_tags]
tag_df['Lemmatized_Tag'] = lemma_tags
print(len(set(lemma_tags)))
# 8455 unique lemmatized_tags
print(len(set([t for t in lemma_tags if '_' in t])))
# 1181 of which are multi_word

with open('data/cleaned_tags.pickle', 'wb') as f:
    pickle.dump(tag_df, f)
def lemma(word, part):
    """Return the WordNet lemma of ``word`` for the part of speech ``part``.

    Both arguments are forwarded unchanged to a freshly created
    ``WordNetLemmatizer``.
    """
    return WordNetLemmatizer().lemmatize(word, part)
df = pd.read_csv(file) ratings = df.Rating headers = df.Header reviews = df.Review products = df.Product new_headers = list() new_reviews = list() print("Checking headers") for header in headers: if type(header) != float: new_header = list() words = tokenizer.tokenize(header) for word in words: lemma = lmtzr.lemmatize(word.lower()) new_header.append(lemma) new_headers.append(' '.join(new_header)) else: new_headers.append('') print("Checking reviews") for review in reviews: if type(review) != float: new_review = list() words = tokenizer.tokenize(review) for word in words: lemma = lmtzr.lemmatize(word.lower()) new_review.append(lemma) new_reviews.append(' '.join(new_review)) else:
# Stemming: reduce every token to its stem.
# Import the Porter and Lancaster stemmers.
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

# One instance of each stemmer.
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()

# stemmed_list holds the PorterStemmer results, lancaster_list the
# LancasterStemmer results, in token order.
stemmed_list = [porter_stemmer.stem(token) for token in tokens]
lancaster_list = [lancaster_stemmer.stem(token) for token in tokens]

print("提取词干结果:")
print("1.PorterStemmer:", stemmed_list)
print("2.LancasterStemmer:", lancaster_list)

# Lemmatization: NLTK's lemmatizer is built on WordNet.
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

# lem_list holds the lemmatized form of every token.
lem_list = [wordnet_lemmatizer.lemmatize(token) for token in tokens]

print("词形还原结果:")
print(lem_list)
def Lemmatize_words(self, words):
    """Lemmatize each item of ``words`` and join the results with spaces.

    A new ``WordNetLemmatizer`` is created on every call and applied with
    its default part of speech. Returns a single string.
    """
    wnl = WordNetLemmatizer()
    return " ".join(map(wnl.lemmatize, words))
t = line.split() for i in range(len(t)): if len(t) > 2 and (t[i].startswith("(NN") or t[i].startswith("NN") ) and not t[i].startswith("(NNP"): # get noun noun = t[i + 1].strip("))").lower() sentence_nouns.append(noun) if noun not in d: d[noun] = {} for j in range(len(t)): # verb stuff here if len(t) > 2 and (t[j].startswith("(VB")): verb = t[j + 1].strip(")").lower() verb = wnl.lemmatize(verb, 'v') for n in sentence_nouns: counts = d[n] if verb in d[n]: counts[verb] = counts[verb] + 1 else: counts[verb] = 1 d[n] = counts print("finished " + filename + "!") for noun in d: temp = [] for key, value in sorted(d[noun].items(), key=itemgetter(1), reverse=True): temp.append((key, value))
def lem_abstract_pd(df):
    """Lemmatize the token lists stored in ``df['abstract_cleaned']``.

    Each cell of the column is iterated item by item and replaced by the
    list of lemmatized items. The column is overwritten on ``df`` itself,
    and the same ``df`` is returned.
    """
    lemmatizer = WordNetLemmatizer()

    def lemmatize_cell(tokens):
        # One lemma per item, in the original order.
        return [lemmatizer.lemmatize(token) for token in tokens]

    df['abstract_cleaned'] = df['abstract_cleaned'].apply(lemmatize_cell)
    return df
try: save_json(line_words, config.PATH_WORDS) except: os.remove(config.PATH_WORDS) exit(1) # 4. Lemmatization using NLTK tool if os.path.exists(config.PATH_LEM_WORDS): lemma_line_words = load_json(config.PATH_LEM_WORDS) else: lemmatizer = WordNetLemmatizer() # first make a copy lemma_line_words = line_words.copy() for line_id, line_word in enumerate(line_words): for word_id, word in enumerate(line_word): lemma_line_words[line_id][word_id] = lemmatizer.lemmatize(word) try: save_json(lemma_line_words, config.PATH_LEM_WORDS) except: os.remove(config.PATH_LEM_WORDS) exit(1) # 5. remove stopword using spacy nlp = spacy.load('en_core_web_sm') stop_words = nlp.Defaults.stop_words if os.path.exists(config.PATH_NO_STOP): no_stop_line_words = load_json(config.PATH_NO_STOP) else: no_stop_line_words = [] for line_id, line_word in enumerate(lemma_line_words):
class LemmaTokenizer(object):
    """Callable that word-tokenizes a document and lemmatizes every token."""

    def __init__(self):
        # One lemmatizer instance is kept for the lifetime of the object.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens of ``doc``."""
        lemmatize = self.wnl.lemmatize
        return list(map(lemmatize, word_tokenize(doc)))
# Walk every pattern of every intent, collecting the raw vocabulary, the
# (tokens, tag) pairs of the corpus and the list of distinct tags.
for intent in intents['intents']:
    for pattern in intent['patterns']:
        # tokenize each word
        w = nltk.word_tokenize(pattern)
        words.extend(w)
        # add documents in the corpus
        documents.append((w, intent['tag']))
        # add to our classes list
        if intent['tag'] not in classes:
            classes.append(intent['tag'])

# Lemmatize and lower each word and remove duplicates. The ignore-list
# check is applied to the token as tokenized, before lower-casing.
words = [
    lemmatizer.lemmatize(w.lower()) for w in words if w not in ignore_words
]
words = sorted(set(words))
# sort classes
classes = sorted(set(classes))

# documents = combination between patterns and intents
print(len(documents), "documents")
# classes = intents
print(len(classes), "classes", classes)
# words = all words, vocabulary
print(len(words), "unique lemmatized words", words)

# Context managers make sure both pickle files are flushed and closed.
with open('words.pkl', 'wb') as words_file:
    pickle.dump(words, words_file)
with open('classes.pkl', 'wb') as classes_file:
    pickle.dump(classes, classes_file)

# create our training data
elif treebank_tag.startswith('N'): return wordnet.NOUN elif treebank_tag.startswith('R'): return wordnet.ADV else: return wordnet.NOUN def penn_to_wn(tag): return get_wordnet_pos(tag) for i in range(len(corpus), len(dataset)): review = dataset['text'][i] review = [ lemmatizer.lemmatize(word, pos=penn_to_wn(nltk.pos_tag([word])[0][1])) for word in word_tokenize(review) if word not in string.punctuation ] review = ' '.join(review) corpus.append(review) gc.collect() #print(sys.getsizeof(corpus)) from sklearn.feature_extraction.text import TfidfVectorizer tfidf = TfidfVectorizer(decode_error='ignore', stop_words='english', lowercase=True, binary=False, analyzer='word', token_pattern='[A-z]{3,}', ngram_range=(1, 1),
''.join(c for c in s if c not in punctuation) for s in word_tokens ] # remove empty strings word_tokens = [s for s in word_tokens if s] # Removing stop words — frequent words such as ”the”, ”is”, etc. that do not have specific semantic to further cleanup the text corpus. from nltk.corpus import stopwords stop_words = set(stopwords.words('english')) filtered_tokens = [w for w in word_tokens if not w in stop_words] # Lemmatisation unlike Stemming, reduces the inflected words properly ensuring that the root word belongs to the language. from nltk.stem import WordNetLemmatizer # init the wordnet lemmatizer lmtzr = WordNetLemmatizer() lemm_tokens = [lmtzr.lemmatize(x) for x in filtered_tokens] import nltk bigrams = nltk.collocations.BigramAssocMeasures() trigrams = nltk.collocations.TrigramAssocMeasures() bigramFinder = nltk.collocations.BigramCollocationFinder.from_words( word_tokens) trigramFinder = nltk.collocations.TrigramCollocationFinder.from_words( word_tokens) import pandas as pd #bigrams bigram_freq = bigramFinder.ngram_fd.items() bigramFreqTable = pd.DataFrame(list(bigram_freq),
import string

# Strip punctuation characters from every token.
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]
# remove remaining tokens that are not alphabetic
words = [word for word in stripped if word.isalpha()]

# filter out stop words
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
words = [w for w in words if w not in stop_words]

# lemmatization
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
words = [lemmatizer.lemmatize(word) for word in words]
CleanedText = ' '.join(words)

from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
# Score the text once and read all three values from the same result,
# instead of re-running the analyzer for every key.
scores = sid.polarity_scores(CleanedText)
possentiment = scores['pos']
negsentiment = scores['neg']
comsentiment = scores['compound']
possentiments.append(possentiment)
negsentiments.append(negsentiment)
comsentiments.append(comsentiment)

# In[9]:

details = zip(dates, media, possentiments, negsentiments, comsentiments)
# First pass: keep only the purely alphabetic words of every document.
data_cleaned = [
    ' '.join(word for word in doc.split() if word.isalpha())
    for doc in groups.data
]

# The old `sklearn.feature_extraction.stop_words` module is not available in
# the scikit-learn releases that provide `get_feature_names_out` (used
# below); the stop-word list is exposed by `sklearn.feature_extraction.text`.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

print(ENGLISH_STOP_WORDS)

from nltk.corpus import names

all_names = set(names.words())

count_vector_sw = CountVectorizer(stop_words="english", max_features=500)

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Second pass: drop non-alphabetic words and person names, then lower-case
# and lemmatize what is left.
# The name check is done on the word in its original case: the entries of
# `all_names` are capitalized, so testing an already lower-cased word
# against them would never filter anything.
data_cleaned = []
for doc in groups.data:
    doc_cleaned = ' '.join(
        lemmatizer.lemmatize(word.lower())
        for word in doc.split()
        if word.isalpha() and word not in all_names)
    data_cleaned.append(doc_cleaned)

data_cleaned_count = count_vector_sw.fit_transform(data_cleaned)
print(count_vector_sw.get_feature_names_out())

# In[ ]:
class Myarticles(data.Dataset):
    """Dataset of text articles listed in a CSV file.

    The CSV provides a file name ('Name' column) and a label ('Basics '
    column) per article. Each article is read from ``txt_folder_path``,
    cleaned, tokenized, spell-corrected, stemmed/lemmatized and finally
    mapped to integer word indices through the dictionary stored in
    'word2_idx.pkl'.

    NOTE: ``__init__`` loads 'word2_idx.pkl' from the current working
    directory, so that file has to exist before an instance can be created;
    ``save_wdx`` is the method that writes it.
    """

    def __init__(self, csvfile_path, txt_folder_path, glove_path='/Users/duanyiqun/Downloads/Textcls/glove.6B', validation=False):
        """Build the article list and the text-processing helpers.

        Args:
            csvfile_path: CSV file passed to ``Creat_article_list``.
            txt_folder_path: folder that contains the article text files.
            glove_path: stored on the instance; only the disabled GloVe
                code at the end of the class refers to it.
            validation: when true use entries 200-249 of the article list,
                otherwise entries 0-199.
        """
        self.glove_path = glove_path
        #self.glove_init()
        #self.w2v = self.init_word2vec()
        self.articleor = self.Creat_article_list(csvfile_path)
        if validation:
            self.articles = self.articleor[200:250]
        else:
            self.articles = self.articleor[0:200]
        #self.articles = self.articleor[0:10]
        self.folderpath = txt_folder_path
        self.snowball_stemmer = SnowballStemmer('english')
        self.wordnet_lematizer = WordNetLemmatizer()
        # Translation tables that delete punctuation / digits.
        self.delset = str.maketrans('', '', string.punctuation)
        self.remove_digits = str.maketrans('', '', digits)
        self.spelldict = enchant.Dict("en_US")
        self.init_word2idx()

    def est_dict(self, article_list):
        """Build the word -> index dictionary and pickle it to 'word2_idx.pkl'.

        Only the first sentence list of every processed article
        (``article[0]``) contributes to the vocabulary.

        NOTE(review): ``article_list`` only determines how many articles are
        processed; the file names are always taken from ``self.articles``.
        """
        temp = []
        for index, _ in enumerate(article_list):
            filepath = os.path.join(self.folderpath, self.articles[index][0])
            print('analyze article {}'.format(index))
            with open(filepath) as f:
                article = f.read()
            article = self.CleanLines(article)
            article = self.SenToken(article)
            article = self.tokenize_to_word(article)
            article = self.spell_check_words(article)
            article = self.steamize_words(article)
            # extend() instead of `temp = temp + ...`, which copied the
            # whole accumulated list for every article.
            temp.extend(article[0])
        vocab = set(temp)
        word_to_ix = {word: i for i, word in enumerate(vocab)}
        with open('word2_idx.pkl', 'wb') as f:
            pickle.dump(word_to_ix, f)

    def save_wdx(self):
        """Create and store the word dictionary for ``self.articles``."""
        self.est_dict(self.articles)

    def init_word2idx(self):
        """Load the word -> index dictionary from 'word2_idx.pkl'."""
        with open('word2_idx.pkl', 'rb') as f:
            self.word2idx = pickle.load(f)
        print('sucessfully load word dictionary with shape{}'.format(len(self.word2idx)))

    def __getitem__(self, index):
        """Return ``(sample, target)`` for the article at ``index``.

        ``sample`` wraps the word indices of the first sentence list of the
        processed article; ``target`` is the label stored with the article.
        """
        filepath = os.path.join(self.folderpath, self.articles[index][0])
        with open(filepath) as f:
            article = f.read()
        article = self.CleanLines(article)
        article = self.SenToken(article)
        article = self.tokenize_to_word(article)
        article = self.spell_check_words(article)
        article = self.steamize_words(article)
        article = self.vectorize(article)
        sample = Variable(torch.from_numpy(article[0]))
        target = self.articles[index][1]
        return sample, target

    def Creat_article_list(self, csvfile, label= 'Basic '):
        """Return ``[name, label]`` pairs for every labelled row of the CSV.

        Rows whose 'Basics ' value is missing are skipped. ``read_csv``
        converts the text "NA" to NaN by default, so the missing-value check
        has to test for NaN; the literal string 'NA' is still skipped too.

        NOTE(review): ``label`` is not used; the label column is always
        'Basics '.
        """
        df = pd.read_csv(csvfile)
        article_list = []
        for name, cont in zip(df['Name'], df['Basics ']):
            if pd.isna(cont) or (isinstance(cont, str) and cont == 'NA'):
                continue
            article_list.append([name, cont])
        return article_list

    def SenToken(self, raw):
        """Split ``raw`` text into a list of sentences (punkt tokenizer)."""
        sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        return sent_tokenizer.tokenize(raw)

    def CleanLines(self, line):
        """Return ``line`` without punctuation and digit characters."""
        #cleanline = re.sub('[:*~@^&()_+|\/><,.!\']', '', line)
        #cleanline = re.sub('0123456789', '', cleanline)
        return line.translate(self.delset).translate(self.remove_digits)

    def tokenize_to_word(self, article):
        """Clean every sentence and split it into a list of word tokens."""
        return [
            nltk.word_tokenize(self.CleanLines(sentence))
            for sentence in article
        ]

    def steamize_words(self, article):
        """Stem, lemmatize and stop-word-filter each sentence of ``article``.

        Every word is stemmed, the stem is lemmatized, and results that are
        English stop words are removed. The sentences of ``article`` are
        replaced in place and ``article`` is returned.
        """
        # Load the stop-word list once, not once per word.
        english_stops = set(stopwords.words('english'))
        stem = self.snowball_stemmer.stem
        lemmatize = self.wordnet_lematizer.lemmatize
        for idx, sentence in enumerate(article):
            processed = (lemmatize(stem(word)) for word in sentence)
            article[idx] = [
                word for word in processed if word not in english_stops
            ]
        return article

    def spell_check_words(self, article):
        """Replace misspelled words with the spell checker's first suggestion.

        A word without any suggestion is replaced by a single space. The
        sentences are modified in place and ``article`` is returned.
        """
        for sentence in article:
            for ind, word in enumerate(sentence):
                if self.spelldict.check(word):
                    continue
                # Ask for suggestions only once per misspelled word.
                suggestions = self.spelldict.suggest(word)
                sentence[ind] = suggestions[0] if suggestions else ' '
        return article

    def vectorize(self, article):
        """Map every word to its index and return the result as an array.

        Raises ``KeyError`` for a word that is missing from
        ``self.word2idx``.
        """
        indexed = [
            [self.word2idx[word] for word in sentence]
            for sentence in article
        ]
        return np.array(indexed)

    def tag_mask(self, article):
        """Return the part-of-speech tagging of every sentence."""
        return [nltk.pos_tag(sentence) for sentence in article]

    # The string below is disabled GloVe-embedding code kept for reference;
    # it is never executed.
    """
    def glove_init(self):
        words = []
        idx = 0
        word2idx = {}
        vectors = bcolz.carray(np.zeros(1), rootdir=f'{self.glove_path}/6B.50.dat', mode='w')

        with open(f'{self.glove_path}/glove.6B.50d.txt', 'rb') as f:
            #idx =0
            for l in f:
                line = l.decode().split()
                word = line[0]
                words.append(word)
                word2idx[word] = idx
                idx += 1
                vect = np.array(line[1:]).astype(np.float)
                vectors.append(vect)
                #print(idx+1)

        vectors = bcolz.carray(vectors[1:].reshape((400001, 50)), rootdir=f'{self.glove_path}/6B.50.dat', mode='w')
        vectors.flush()
        pickle.dump(words, open(f'{self.glove_path}/6B.50_words.pkl', 'wb'))
        pickle.dump(word2idx, open(f'{self.glove_path}/6B.50_idx.pkl', 'wb'))

    def init_word2vec(self):
        vectors = bcolz.open(f'{self.glove_path}/6B.50.dat')[:]
        words = pickle.load(open(f'{self.glove_path}/6B.50_words.pkl', 'rb'))
        word2idx = pickle.load(open(f'{self.glove_path}/6B.50_idx.pkl', 'rb'))
        glove = {w: vectors[word2idx[w]] for w in words}
        return glove

    def word2map(self, article):
        matrix_len = 0
        for idx, sentence in enumerate(article):
            if matrix_len<len(sentence):
                matrix_len = len(sentence)
        vecarticle = []
        for idx, sentence in enumerate(article):
            #words_found = 0
            weights_matrix = np.zeros((matrix_len, 50))
            for i, word in enumerate(sentence):
                try:
                    weights_matrix[i] = self.w2v[word]
                    words_found += 1
                except:
                    print('key not founded, initialized random weights')
                    weights_matrix[i] = np.random.normal(scale=0.6, size=(50, ))
            vecarticle.append(weights_matrix)
        return vecarticle
    """

    def __len__(self):
        """Return the number of articles in the active split."""
        return len(self.articles)
# Creating nice bags of words
from nltk.stem import WordNetLemmatizer

# alpha_only: the tokens that consist of alphabetic characters only.
alpha_only = [token for token in lower_tokens if token.isalpha()]

# no_stops: alpha_only without the English stop words.
no_stops = [token for token in alpha_only if not token in english_stops]

# lemmatized: every remaining token reduced to its lemma.
wordnet_lemmatizer = WordNetLemmatizer()
lemmatized = list(map(wordnet_lemmatizer.lemmatize, no_stops))

# bow: the bag-of-words (token -> count).
bow = Counter(lemmatized)

# Show the 10 most common tokens.
print(bow.most_common(10))

# Using gensim
from gensim.corpora.dictionary import Dictionary

# dictionary: a gensim Dictionary built from the articles.
dictionary = Dictionary(articles)
temp1 = [] temp2 = [] simi = [] final = [] same_sent1 = [] same_sent2 = [] lemmatizer = WordNetLemmatizer() for words1 in word_tokenize(str1): if words1 not in stop_words: if words1.isalnum(): filtered_sentence1.append(words1) for i in filtered_sentence1: lemm_sentence1.append(lemmatizer.lemmatize(i)) for words2 in word_tokenize(str2): if words2 not in stop_words: if words2.isalnum(): filtered_sentence2.append(words2) for i in filtered_sentence2: lemm_sentence2.append(lemmatizer.lemmatize(i)) for word1 in lemm_sentence1: simi =[] for word2 in lemm_sentence2: sims = [] syns1 = wordnet.synsets(word1) syns2 = wordnet.synsets(word2)