def add_sample(self, sample):
    if not isinstance(sample, str):
        raise TypeError
    # Calling add_sample should replace existing sample.
    # To avoid appending new values onto existing lists:
    self.sample = sample
    self.misspelled_words = []
    self.tokenized_sample = []
    self.tagged_sample = {}
    sample = sample.replace('\n', " ")
    sample = sample.rstrip(" ")
    for char in punctuation.replace("'", ""):
        sample = sample.replace(char, "")
    tokens = word_tokenize(sample)
    for word in tokens:
        if word.lower() in words.words():
            self.tokenized_sample.append(word)
        elif word.capitalize() in names.words():
            continue
        elif "'" in word:
            self.tokenized_sample.append(word)
        elif LEMMATIZER.lemmatize(word.lower()) not in words.words():
            if STEMMER.stem(word.lower()) not in words.words():
                self.misspelled_words.append(word)
            else:
                self.tokenized_sample.append(word)
    self.tagged_sample = pos_tag(tokens)
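# (Added sketch) The membership tests above re-scan words.words() and names.words()
# for every token, which re-reads large corpus lists each time. A minimal,
# self-contained illustration of the usual speed-up -- building the lookup sets once
# and testing against those -- assuming the same nltk corpora are installed. This is
# not part of the original class:
from nltk.corpus import names, words

ENGLISH_WORDS = set(w.lower() for w in words.words())   # ~236k entries, built once
KNOWN_NAMES = set(names.words())

def is_known(token):
    # O(1) set lookups instead of scanning the corpus list per token
    return token.lower() in ENGLISH_WORDS or token.capitalize() in KNOWN_NAMES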
def divide(s):
    first = ''
    for i in range(len(str(s))):
        first += s[i]
        print(first)
        if first in words.words() and s[i + 1:] in words.words():
            return ' '.join([first, s[i + 1:]])
    return False
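# (Added usage sketch) divide() returns the first split whose two halves are both in
# the nltk word list, or False; the exact result depends on the installed corpus:
if __name__ == "__main__":
    print(divide("catfish"))   # expected to print something like "cat fish"
    print(divide("xyzzyq"))    # no valid split -> False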
def raw_files_to_labeled_features(raw_files, label_file):
    # Initialize spark
    conf = SparkConf().setAppName("SpamFilter").setMaster("local[*]")
    sc = SparkContext(conf=conf)

    # Get the set of words that we will be accepting as valid features
    valid_words = set(w.lower() for w in words.words())

    # Load training data and convert to our desired format
    raw_files = sc.wholeTextFiles(raw_files)

    # Extract a document of filtered words from each text file
    documents = raw_files.map(lambda x: (x[0], extract_words(x[1], valid_words)))

    # Calculate TF-IDF values for each document
    tfidf = calculate_tfidf(documents)

    # Load labels
    labels = sc.parallelize(load_labels(label_file)).map(lambda x: x[0])

    # Append indexes to features and labels
    indexed_labels = labels.zipWithIndex().map(lambda x: (x[1], x[0]))
    indexed_features = tfidf.zipWithIndex().map(lambda x: (x[1], x[0]))

    # Join labels and features into tuples and return
    return indexed_labels.join(indexed_features).map(lambda x: x[1]).collect()
def _english_wordlist(self):
    try:
        wl = self._en_wordlist
    except AttributeError:
        from nltk.corpus import words
        wl = self._en_wordlist = set(words.words('en-basic'))
    return wl
def get_vocab():
    word_list = words.words()
    lowercased = [t.lower() for t in word_list]
    STEMMER = PorterStemmer()
    stemmed = [STEMMER.stem(w) for w in lowercased]
    vocab = list(set(stemmed))
    return vocab
def get_english_vocab(lemmatize=False):
    vocab = (w.lower() for w in words.words())
    if lemmatize:
        stemmer = PorterStemmer()
        vocab = (stemmer.stem(w) for w in vocab)
    return set(vocab)
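# (Added usage sketch) Assuming the nltk 'words' corpus is downloaded and
# PorterStemmer is imported as in the snippet above; note that when lemmatize=True
# the lookups must be stemmed the same way as the vocabulary:
if __name__ == "__main__":
    vocab = get_english_vocab()
    print("surface form present:", "running" in vocab)
    stemmed_vocab = get_english_vocab(lemmatize=True)
    stemmer = PorterStemmer()
    print("stemmed form present:", stemmer.stem("running") in stemmed_vocab)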
def __init__(self):
    self.stopwords = stopwords.words('english')
    self.uscities = set([w.lower() for w in gazetteers.words('uscities.txt')])
    self.usstates = set([w.lower() for w in gazetteers.words('usstates.txt')])
    self.countries = set([w.lower() for w in gazetteers.words('countries.txt')])
    self.basicwords = set(words.words('en-basic'))
    self.paragraph_tokens = []
    self.texts = []
def unknown(textString):
    unW = []
    # Keep only lower-case words, which drops punctuation and capitalized words
    k = re.findall(r'(?<= )[a-z]+\b', textString)
    print(textString)
    for w in k:  # Check every candidate word
        if w not in words.words():  # If the word is not in the NLTK word dictionary
            unW.append(w)  # add it to the unknown list
    print(unW)  # Words that are not in the NLTK word dictionary
def __init__(self, dict_path='/etc/dictionaries-common/words'):
    # We use two dictionaries for better coverage
    with open(dict_path) as f:
        d1 = set([w.lower() for w in f.read().split()])
    d2 = set([w.lower() for w in words.words()])
    self.words = set(d1.union(d2))
def unknown(url):
    # get the HTML, as a string
    html = str(bs(urllib.urlopen(url).read()))
    # find all substrings
    substrings = set(re.findall(r'[a-z]+', html))
    # specify the wordlist
    wordlist = words.words()
    # return the words not in the wordlist
    return [word for word in substrings if word not in wordlist]
def __init__(self, corpora_list=['all_plaintext.txt', 'big.txt'],
             parse_args=(True, True, True, True, True)):
    # Set the parsing arguments
    self.remove_stopwords = parse_args[0]
    self.tag_numeric = parse_args[1]
    self.correct_spelling = parse_args[2]
    self.kill_nonwords = parse_args[3]
    self.stem = parse_args[4]

    # Alphabet
    self.alphabet = 'abcdefghijklmnopqrstuvwxyz'

    # Punctuation
    self.punc_dict = {ord(c): None for c in string.punctuation}

    # Reserved tags
    self.reserved_tags = ['numeric_type_hex', 'numeric_type_binary', 'numeric_type_octal',
                          'numeric_type_float', 'numeric_type_int', 'numeric_type_complex',
                          'numeric_type_roman', 'math_type']

    # Update the set of nltk words with the additional corpora
    self.all_words = set(words.words())
    self.all_words.update('a')
    self.all_words.update('i')
    self.all_words.update(self.reserved_tags)
    self.max_word_length = 20

    # Set up the stopwords, remove 'a' due to math issues
    self.stops = set(stopwords.words("english"))
    self.stops.remove('a')
    self.stops.remove('no')

    # Set up the stemmer
    self.st = SnowballStemmer('english')

    # Train the spelling corrector using all corpora
    train_text = ''
    for cfile in corpora_list:
        with open(cfile) as f:
            words_in_file = f.read()
        self.all_words.update(self.get_all_words(words_in_file))
        train_text = train_text + words_in_file

    # Remove single character terms
    wordlist = list(self.all_words)
    wordlist = [i for i in wordlist if len(i) > 1]
    self.all_words = set(wordlist)
    self.all_words.update('a')
    self.all_words.update('i')

    self.NWORDS = self.train(self.get_all_words(train_text))
def tokenize4(text):
    wordnet_lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    wordset = set(words.words())
    tokens = [wordnet_lemmatizer.lemmatize(token, NOUN) for token in tokens]
    tokens = [wordnet_lemmatizer.lemmatize(token, VERB) for token in tokens]
    tokens = [wordnet_lemmatizer.lemmatize(token, ADJ) for token in tokens]
    tokens = [token for token in tokens if token in wordset]
    return tokens
def extractingFromFolders():
    folder2 = os.path.expanduser('~\\My Documents\\Tara\\Ongoing\\CharacterCorpus\\Reference')
    fileresult = os.path.expanduser('~\\My Documents\\Tara\\Ongoing\\CharacterCorpus\\results.txt')
    refer = PlaintextCorpusReader(folder2, 'harrygrepster.txt')
    grepster = refer.words()
    results = open(fileresult, 'a')
    completeWords = wordlist.words()
    stoppers = stopwords.words()
    return grepster, results, completeWords, stoppers
def unknown(url):
    # get the HTML, as a string
    html = str(bs(urllib.urlopen(url).read()))
    # find all substrings
    substrings = set(re.findall(r"\s+([a-zA-Z]+)\s+", html))
    substrings = [word.lower() for word in substrings]
    # specify the wordlist
    wordlist = words.words()
    # return the words not in the wordlist
    return [word for word in substrings if word not in wordlist]
def textParse(file):
    processedText = ''
    with open(file, 'r') as f:
        lines = f.read().splitlines()
    for line in lines:
        wordsInLine = line.split(' ')
        for word in wordsInLine:
            # print '*'+word+'*'
            if word.lower() in words.words():
                processedText += word + ' '
    return processedText
def getReadabilityScore(tweet):
    w1 = tweet.split(" ")
    ASL1 = len(w1)
    AOV1 = 0
    l = 0
    for w in w1:
        l += len(w)
        if w not in words.words():
            AOV1 += 1
    ASW1 = l / float(ASL1)
    S1 = 206.835 - (1.015 * ASL1) - (84.6 * ASW1) - (10.5 * AOV1)
    return S1
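# (Added sketch) words.words() returns a ~236k-item list and the loop above rescans
# it for every token. A cached-set variant computing the same score, assuming the
# nltk corpus is available; this is an illustration, not the original author's code:
from nltk.corpus import words as _words

_VOCAB = set(_words.words())

def getReadabilityScoreCached(tweet):
    tokens = tweet.split(" ")
    out_of_vocab = sum(1 for w in tokens if w not in _VOCAB)
    avg_word_len = sum(len(w) for w in tokens) / float(len(tokens))
    return 206.835 - 1.015 * len(tokens) - 84.6 * avg_word_len - 10.5 * out_of_vocab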
def exercise_unusual_words():
    text = gutenberg.words("austen-sense.txt")
    # Collect the vocabulary of the text: alphabetic tokens only, lower-cased
    text_vocab = set(w.lower() for w in text if w.isalpha())
    # Collect the vocabulary of the word list
    english_vocab = set(w.lower() for w in words.words())
    # Find the unusual (or misspelled) words: in the text but not in the word list
    unusual_vocab = text_vocab.difference(english_vocab)
    print(sorted(unusual_vocab))
def anagrams_for(word):
    # 1. generate permutations of the word's letters (note: don't return the word itself)
    # 2. check whether each candidate is a real word using the nltk words corpus
    # example using itertools: list(itertools.permutations([1,2,3,4], 2))
    wordset = set(words.words())
    for perm in itertools.permutations(word):
        candidate = ''.join(perm)
        if candidate != word and candidate in wordset:
            print(candidate)
def precomputeFromNLTK():
    """
    precompute with nltk's corpus as wordbase
    """
    language = set()
    print(len(words.words()))
    for word in words.words():
        word = word.lower()
        sortW = "".join(char for char in sorted(word))
        if sortW[0] >= "a" and sortW[0] <= "z":
            word = word + ":" + sortW
            language.add(word)
    print("Loaded %d words from NLTK wordnet" % (len(language)))
    buckets = [set() for x in range(25)]
    for word in language:
        buckets[len(word) // 2].add(word)
    count = 0
    for word in language:
        if count % 1000 == 0:
            print("Done for %d words" % count)
        count += 1
        sortedW = word.split(":")[1]
        if sortedW not in nltkHashMap:
            nltkHashMap[sortedW] = set()
            for word2 in buckets[len(sortedW)]:
                sortedW2 = word2.split(":")[1]
                if sortedW == sortedW2:
                    nltkHashMap[sortedW].add(word2.split(":")[0])
    file = open(nltkAnagramsFile, "w")
    file.truncate()
    count = 0
    for anagrams, listOfAnagrams in nltkHashMap.items():
        if count % 1000 == 0:
            print("%d anagram lists written" % count)
            file.flush()
        count += 1
        file.write("%s:%s\n" % (anagrams, listOfAnagrams))
    file.close()
    print("Precomputation with NLTK done")
def unknown(url): """Takes a URL as its argument and returns a list of unknown words that occur on that webpage.""" # gets the text of the page html = request.urlopen(url).read().decode('utf8') raw = BeautifulSoup(html).get_text() junk = set(words.words()) # finds the lower case words by searching for a word boundary plus one or more lower case letters lower_case_words = re.findall(r'\b[a-z]+', raw) # searches through the list of lower case words and gets rid of those not in the words corpus. unknowns = [word for word in lower_case_words if word not in junk] print(unknowns)
def __init__(self, conf):
    self.translation = conf.translation.translate
    self.translation_store = conf.translation.store
    self.translate_threshold = conf.translation.threshold
    self.translate_failed = False
    self.translate_status = False
    self.text = ""
    self.source = ""
    self.tokens = []
    self.tweet_id = ""
    if self.translation:
        self.english_vocab = set(w.lower() for w in words.words())
    self.stopfile = conf.stopwords.filename
    self.stopset = set(open(self.stopfile, 'r').read().split())
def __init__(self):
    self.dict_anagrams = {}
    for t in words.words():
        word = str.lower(str(t))
        word = word.replace('-', ' ')
        alpha_count = get_alphaCount(word)
        if alpha_count in self.dict_anagrams:
            self.dict_anagrams[alpha_count].add(word)
        else:
            self.dict_anagrams[alpha_count] = {word}
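# (Added sketch) get_alphaCount is not shown in this snippet; a common choice is a
# letter-frequency signature so that all anagrams share one key. A minimal,
# hypothetical version plus a lookup helper for the dict built above:
from collections import Counter

def get_alphaCount(word):
    # "listen" and "silent" both map to (('e',1),('i',1),('l',1),('n',1),('s',1),('t',1))
    return tuple(sorted(Counter(c for c in word if c.isalpha()).items()))

def find_anagrams(index, word):
    # Return every dictionary word sharing the letter signature of `word`
    return index.dict_anagrams.get(get_alphaCount(word.lower()), set()) - {word.lower()}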
def main(): """ Let's go through the words and return a list of nick names for Diane Stitt .... yes! """ stitt_dict = {} for word in words.words('en'): # find a word that includes two t's if find_pair(word): data_structure(add_stitt(find_pair(word)), # Add Stitt to word word, # include original word for comparison stitt_dict) # use a dict to group nickname dups # return a dictionary where each key is a program generated nickname # mapped to a list of all words that generate that nickname return stitt_dict
def baselineCorrector(word):
    arr = sorted(words.words())
    if word in arr:
        return word
    # Binary search for the closest dictionary word
    low = 0
    high = len(arr) - 1
    while True:
        mid = (low + high) // 2
        midval = arr[mid]
        if high <= low:
            return arr[mid + 1] if midval < word else midval
        if midval < word:
            low = mid + 1
        elif midval > word:
            high = mid - 1
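# (Added sketch) The standard library's bisect module gives the same "nearest
# neighbour in a sorted wordlist" behaviour without a hand-rolled loop; this is an
# alternative illustration, not the original author's implementation:
import bisect
from nltk.corpus import words

SORTED_WORDS = sorted(words.words())   # sort once instead of on every call

def baseline_corrector_bisect(word):
    if word in SORTED_WORDS:           # note: linear scan; use a set for speed
        return word
    i = bisect.bisect_left(SORTED_WORDS, word)
    return SORTED_WORDS[min(i, len(SORTED_WORDS) - 1)]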
def build_dictionary_index():
    global dict_index
    s = set(words.words('en'))
    for word in s:
        if len(word) > 2:  # Ignore smallest words
            word = str.lower(word)
            try:
                dict_index[word[0]]
                try:
                    dict_index[word[0]][len(word)] += [word]
                except KeyError:
                    dict_index[word[0]][len(word)] = [word]
            except TypeError:
                print(dict_index[word[0]][len(word)])
            except KeyError:
                dict_index[word[0]] = {len(word): [word]}
def DeleteDictWords(data, dictEN, dictFR, french_dict_path):
    if dictEN == 1:
        english_dict = words.words('en')
    else:
        english_dict = []
    if dictFR == 1:
        french_dict = [line.rstrip('\n') for line in open(french_dict_path, 'r', encoding='utf-8')]
    else:
        french_dict = []
    dico = english_dict + french_dict
    col_dico = [i for i in data.columns.values if i in dico]
    return pd.DataFrame(data, columns=col_dico)
def word_finder():
    from nltk.corpus import words
    wordlist = words.words()
    random.shuffle(wordlist)
    wordlist = wordlist[:200]
    wordlist = [w for w in wordlist if 3 <= len(w) <= 12]
    grid, used = wordfinder(wordlist)
    print("Word Finder\n")
    for i in range(len(grid)):
        for j in range(len(grid[i])):
            print(grid[i][j], end=' ')
        print()
    print()
    for i in range(len(used)):
        print("%d:" % (i + 1), used[i])
def make_index():
    s = set(words.words('en'))
    index = {}
    for word in s:
        if len(word) > 2:
            word = str.lower(word)
            try:
                index[word[0]]
                try:
                    index[word[0]][len(word)] += [word]
                except KeyError:
                    index[word[0]][len(word)] = [word]
            except TypeError:
                print(index[word[0]][len(word)])
            except KeyError:
                index[word[0]] = {len(word): [word]}
    return index
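# (Added sketch) The nested try/except bookkeeping above can be expressed with
# collections.defaultdict; an equivalent index keyed by first letter and word length,
# offered as an illustration rather than the original author's code:
from collections import defaultdict
from nltk.corpus import words

def make_index_defaultdict():
    index = defaultdict(lambda: defaultdict(list))
    for word in set(words.words('en')):
        if len(word) > 2:
            word = word.lower()
            index[word[0]][len(word)].append(word)
    return index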
def clean_pdf(search_file):
    search_file = 'scraped_' + search_file + '.txt'
    with open(search_file, 'r') as file:
        text = file.read()
    text_only = BeautifulSoup(text).get_text()
    rm_symbol = re.sub(r'[^\w]', ' ', text_only)
    letters_only = re.sub("[^a-zA-Z]", " ", rm_symbol)
    lower_case = letters_only.lower()
    words_text = set(lower_case.split())
    stops = set(stopwords.words("english"))
    english_words = words.words()
    rm_stopwords = [w for w in words_text if not w in stops]
    # meaningful_words = [w for w in rm_stopwords if w in english_words]
    # To select the meaningful words we can either keep only those that appear in the
    # English dictionary, or instead suppress the dictionary words so that only the
    # terms specific to the subject remain.
    # This also suggests a simple way of fixing typos: look for the words that do NOT
    # belong to the dictionary!
    # meaningful_words = words - stops
    meaningful_text = ' '.join(rm_stopwords)  # meaningful_words
    return meaningful_text
def demo():
    from nltk.corpus import words
    wordlist = words.words()
    random.shuffle(wordlist)
    wordlist = wordlist[:200]
    wordlist = [w for w in wordlist if 3 <= len(w) <= 12]
    grid, used = wordfinder(wordlist)
    print("Word Finder\n")
    for i in range(len(grid)):
        for j in range(len(grid[i])):
            print(grid[i][j], end=' ')
        print()
    print()
    for i in range(len(used)):
        print("%d:" % (i + 1), used[i])
#!/usr/bin/python
# -*- coding: utf-8 -*-
from nltk.corpus import names, stopwords, words

print(words.fileids())
print(words.words('en'))  # doctest: +ELLIPSIS
print(stopwords.fileids())  # doctest: +ELLIPSIS
print(stopwords.words('portuguese'))  # doctest: +ELLIPSIS
print(names.fileids())
print(names.words('male.txt'))  # doctest: +ELLIPSIS
print(names.words('female.txt'))  # doctest: +ELLIPSIS

from nltk.corpus import cmudict

print(cmudict.entries()[653:659])  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE

# Load the entire cmudict corpus into a Python dictionary:
transcr = cmudict.dict()
print([transcr[w][0] for w in 'Natural Language Tool Kit'.lower().split()])
elif b[len(b)-2]==("/news"): def NewsFromBBC(): main_url = " https://newsapi.org/v1/articles?source=bbc-news&sortBy=top&apiKey=28761f98c19940c78675c38d6babb8b7" open_bbc_page = requests.get(main_url).json() article = open_bbc_page["articles"] results = [] for ar in article: results.append(ar["title"]) for i in range(len(results)): message.send_keys(i + 1, results[i]) message.send_keys(Keys.RETURN) if __name__ == '__main__': NewsFromBBC() elif b[len(b)-2]==("/hangman"): word_list = words.words() vowels = ["a", "e", "i", "o", "u"] message.send_keys("welcome to hangman ") time.sleep(1) message.send_keys(Keys.RETURN) chosenword = random.choice(word_list) message.send_keys("the length of the chosen word is: ", len(chosenword)) time.sleep(1) message.send_keys(Keys.RETURN) lettersleft = len(chosenword) print(chosenword) for windex in range(len(chosenword)): for windexer in range(5): if chosenword[windex] == vowels[windexer]: message.send_keys("Letter number ", windex + 1, " is the letter ", vowels[windexer])
        soundexWord = name[0].upper() + addList
    else:
        appendList = finalVal
        while len(appendList) < 3:
            appendList.append('0')
        addList = ''.join(appendList)
        soundexWord = name[0].upper() + addList
    return soundexWord

if __name__ == '__main__':
    # maxMatch Algorithm
    wordlist = words.words()
    sentence = open('barack.txt', 'r').read()
    appStart = 'that you work'
    appEnd = 'respect'
    barK = re.search(r'(?<= that you work)(.*)(?=respect)', sentence).group(1)
    sentence = appStart + barK + appEnd
    sentence = re.sub(r'[^\w]', ' ', sentence)
    sentence = sentence.replace(" ", "")
    result = []
    maxMatch(sentence, wordlist, result)
    strResult = ''.join(str(w + ' ') for w in result)
    print("maxMatch Output=")
    print(strResult)
    # Soundex Algorithm
    amePresident = ['donald', 'kamala', 'hillary']
import random
from nltk.corpus import words

word_list = words.words()  # Making a list of all the words using nltk package

# Method to generate a word for the game
def getWord():
    word = random.choice(word_list)
    return word.upper()

# Method to start the game
def play(word):
    wordCompletion = "_" * len(word)
    guessed = False
    guessedLetters = []
    guessedWords = []
    tries = 6
    print("Let's play Hangman!")
    print(displayHangman(tries))
    print(wordCompletion)
    print("\n")
    while not guessed and tries > 0:
        guess = input("Please guess a letter or a word").upper()
        if len(guess) == 1 and guess.isalpha():
            if guess in guessedLetters:
                # The letter is already guessed
                print("You have already guessed this letter: ", guess)
            elif guess not in word:
                # The letter is not present in the answer
                print(guess, " is not in word.")
                tries -= 1  # Reducing the number of tries
def home():
    # General formatting and layout
    # Copywriting, storytelling and formatting
    st.markdown(
        """
        # The #1 Spam Detector Online
        Email Spam Detection Using Python & Machine Learning
        """
    )
    st.markdown(
        """
        ## Zero Risk does not exist
        """)
    text = """
    Spear-phishing attacks playing on people's fears related to Coronavirus increased
    by **667%** between February and June 2020
    """
    image = Image.open('covid.jpg')
    st.image(image, use_column_width=True)
    st.markdown("""
    There are many reasons in today's environment to be wary of email that seems in any
    way suspicious. Some email messages might be phishing scams; some might contain
    viruses and other malicious software.
    """)
    st.markdown("""
    According to ANSSI (the French National Information Systems Security Agency),
    phishing aims at making the recipient of an apparently legitimate email send his
    bank data or login credentials to financial services, in order to steal money from
    him. In addition to phishing, spam emails can also take the form of advertising or
    a scam, also known as the "Nigerian scam".
    """)
    st.markdown("""
    Coronavirus spear-phishing email attacks detected by Barracuda Networks increased
    from 137 in January 2020 to more than 9,116 in the first 23 days of March 2020.
    """)
    st.markdown("""
    Source: [blog.barracuda](https://blog.barracuda.com/2020/03/26/threat-spotlight-coronavirus-related-phishing/)
    """)
    st.markdown(
        """
        ## Unfortunately, current spam filters are not 100% efficient
        Spam detectors included in free email services (Gmail, Outlook, AOL, GMX, etc.)
        are not 100% efficient. Why? Because they only focus on the header analysis.
        That's a mistake. For a decade, experts have agreed that NLP in machine learning
        is one of the most powerful tools to detect suspicious emails.
        """
    )
    st.markdown("""
    That's why we focus here on both body and subject content. We also include the
    issuer address and the content type in our analysis, since they provide significant
    information in email analysis.
    """)
    st.markdown("""
    ### Almost there!
    You just need to follow the following steps and you will get the results within a
    few seconds.
    Let's get started!""")
    st.info("Since we trained our machine learning model on English emails, the spam detector only works with English content.")

    # Beginning of interactions with the user
    X_content = st.text_area("1) Paste body content:", "Enter a message...")
    X_content = str(X_content)
    X_subject = st.text_area("2) Paste subject content:", "Enter a message...")
    X_subject = str(X_subject)
    X_from = st.text_input("3) Paste issuer address (FROM):", "Enter an address...")
    X_from = str(X_from)
    if X_content == 'Enter a message...':
        X_content = None
    if X_subject == 'Enter a message...':
        X_subject = None
    if X_from == 'Enter an address...':
        X_from = None
    options = st.selectbox('4) Select Content-Type:', ('plaintext', 'html'))
    if options == 'plaintext':
        X_type_html = 0
        X_type_plain = 1
    else:
        X_type_html = 1
        X_type_plain = 0

    # Start analysis only if the user clicks the 'Analyze' button and filled in all text areas
    if st.button('Analyze'):
        try:
            #### Creation of numeric features ####
            # nb of words
            count_body = len(re.findall(r'\w+', X_content))
            count_subject = len(re.findall(r'\w+', X_subject))
            # list of words
            words_body = re.findall(r'\w+', X_content)
            words_subject = re.findall(r'\w+', X_subject)
            # length of message and subject
            body_len = len(X_content)
            subject_len = len(X_subject)
            # nb of upper-case words
            for word in words_body:
                if len(word) > 1:
                    upper_words_body_nb = sum(1 for c in words_body if c.isupper())
            for word in words_subject:
                if len(word) > 1:
                    upper_words_subject_nb = sum(1 for c in words_subject if c.isupper())
            # nb of special characters
            liste_ponct = ['€', '!', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', '-', '.', '/',
                           ':', ';', '<', '=', '>', '?', '@', '[', ']', '^', '_', '`', '{', '|', '}', '~']
            temp = sum(1 for c in X_content if c in liste_ponct)
            if temp > 0:
                symb_body_nb = temp
            else:
                symb_body_nb = 0
            temp = sum(1 for c in X_subject if c in liste_ponct)
            if temp > 0:
                symb_subject_nb = temp
            else:
                symb_subject_nb = 0
            # nb of http urls
            r = re.compile(r"https?://[a-zA-Z0-9./-]+")
            http_body_nb = len(re.findall(r, X_content))
            # space nb + ratio (nb_space/message_len)
            space_body_nb = 0
            space_body = sum(1 for space in X_content if space in ' ')
            if space_body > 0:
                space_body_nb = space_body
            for word in words_body:
                if len(word) > 1:
                    space_body_ratio = space_body_nb / body_len
            # presence of a figure in issuer address (i.e. the From field)
            if '0' in X_from:
                n = 1
            elif '1' in X_from:
                n = 1
            elif '2' in X_from:
                n = 1
            elif '3' in X_from:
                n = 1
            elif '4' in X_from:
                n = 1
            elif '5' in X_from:
                n = 1
            elif '6' in X_from:
                n = 1
            elif '7' in X_from:
                n = 1
            elif '8' in X_from:
                n = 1
            elif '9' in X_from:
                n = 1
            else:
                n = 0
            figure_issuer_address = n
            # nb of words not in the English dictionary (a dictionary of 236,736 words)
            from nltk.corpus import words
            word_list_en = set(words.words())
            count_body_dico = 0
            for word in words_body:
                if word not in word_list_en:
                    count_body_dico += 1
            not_dico_body = count_body_dico
            count_subject_dico = 0
            for word in words_subject:
                if word not in word_list_en:
                    count_subject_dico += 1
            not_dico_subject = count_subject_dico
            # nb of sexual/adult words
            adult_words = ['sex', 'sexual', 'erotic', 'p**n', 'xxx', 'adult', 'sexy']
            count_adult_body = 0
            for word in words_body:
                if word in adult_words:
                    count_adult_body += 1
            adult_words_body = count_adult_body
            count_adult_subject = 0
            for word in words_subject:
                if word in adult_words:
                    count_adult_subject += 1
            adult_words_subject = count_adult_subject
            # creation of the dataframe
            s1 = pd.Series({'words_body_nb': count_body, 'body_len': body_len,
                            'upper_words_body_nb': upper_words_body_nb,
                            'body_content': X_content, 'http_body_nb': http_body_nb,
                            'symb_body_nb': symb_body_nb,
                            'text/html': X_type_html, 'text/plain': X_type_plain,
                            'upper_words_subject_nb': upper_words_subject_nb,
                            'symb_subject_nb': symb_subject_nb,
                            'words_subject_nb': count_subject, 'subject_len': subject_len,
                            'adult_words_body': adult_words_body,
                            'adult_words_subject': adult_words_subject,
                            'not_dico_body': not_dico_body,
                            'not_dico_subject': not_dico_subject,
                            'figure_issuer_address': figure_issuer_address,
                            'space_body_nb': space_body_nb,
                            'space_body_ratio': space_body_ratio,
                            'subject_content': X_subject})
            df1 = pd.DataFrame([s1])
            df_body = df1.copy()
            df_subject = df1.copy()

            # ***** Prediction of spam/ham *****
            # Loading of joblib objects already fitted with thousands of data points
            model = load("rf.joblib")
            vectorizer_body = load("vectorizer_body.joblib")
            vectorizer_subject = load("vectorizer_subject.joblib")
            # Tokenization of text contents
            X_content_nlp = vectorizer_body.transform([X_content])
            X_subject_nlp = vectorizer_subject.transform([X_subject])
            X_nlp_test3 = hstack((X_content_nlp, X_subject_nlp,
                                  df1.drop(columns=['body_content', 'subject_content']).values))
            # Start prediction
            pred = model.predict(X_nlp_test3)
            if pred == 1:
                st.markdown("""## **It might be considered as spam** """)
                style_spam = '''
                <style>
                h2>strong {
                  color : #b74355 ;
                }
                </style>
                '''
                st.markdown(style_spam, unsafe_allow_html=True)
            else:
                st.markdown("""## **It might be considered as ham** """)
                style_spam = '''
                <style>
                h2>strong {
                  color : #99d594 ;
                }
                </style>
                '''
                st.markdown(style_spam, unsafe_allow_html=True)
            tabs = bokeh.models.Tabs(
                tabs=[body_analysis_panel(df_body), subject_analysis_panel(df_subject)]
            )
            st.markdown("""
            ### Want To Learn More About The Results?
            """)
            st.bokeh_chart(tabs)
        # Run the analysis only if the user filled in all text areas
        except TypeError:
            st.write("First you need to fill in the text areas")
def random_text_corpus():
    corpus = pd.DataFrame([(i, ' '.join(random.sample(words.words(), 20))) for i in range(100)],
                          columns=["@id", "desc"])
    return corpus
def checkWords(string):
    return string in words.words() or len(string) > 5
def get_word():
    word_list = words.words()
    number = random.randint(0, len(word_list) - 1)
    return word_list[number]
def filter_data(data, removeEng=False):
    # Given a data object remove any transcriptions with undesirable features
    to_remove = string.punctuation + "…’“–”‘°"
    # Any words you want to ignore
    special_cases = ["<silence>"]
    translation_tags = set(['@eng@', '<ind:', '<eng:'])
    cleaned_data = []

    if removeEng:
        use_langid = False
        if use_langid:
            from langid.langid import LanguageIdentifier, model
            identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
        from nltk.corpus import words
        eng_words = set(words.words())
        # Using both: 2.16% english in wordlist, 14.6k words (slow)
        # Using nltk dictionary: 2.49% english in wordlist, 15.5k words (fast)
        # Using neither: 11.1% english in wordlist, 34.8k words (fast)
        # Only words > 3 chars are counted, for audio-segment sample
        # Using remove english and ignore after '<': 1.8%, 20.4K

    for utt in data:
        # print(utt, file=sys.stderr)
        trans = utt.get('transcript').lower()
        words = trans.split()
        # Note this is an assumption only translations come after '<'
        # if "<" in trans:
        #     r = re.search(r'[<]@?(eng|indo|ind|mala)', trans)
        #     if bool(r):
        #         words = trans[:r.span()[0]].split()

        clean_words = []
        valid_utterance = True
        eng_count = 0
        for word in words:
            # If a special case ignore
            if word in special_cases:
                continue
            # If utterance contains a translation
            if word in translation_tags:  # Translations / ignore
                break
            # If partial digit, throw out whole utterance
            if bool(re.search(r'\d', word)) and not word.isdigit():
                valid_utterance = False
                break
            # Remove punctuation and bad chars
            for char in to_remove:
                word = word.replace(char, '')
            # If word is in english dictionary count it
            if removeEng and len(word) > 3 and word in eng_words:
                # print(word, file=sys.stderr)
                eng_count += 1
            clean_words.append(word)

        # Exclude utterance if empty after cleaning
        cleaned_trans = ' '.join(clean_words).strip()
        if cleaned_trans == "":
            valid_utterance = False

        # Exclude utterance if > 10% english
        if removeEng and len(clean_words) > 0 and eng_count / len(clean_words) > 0.1:
            # print(round(eng_count / len(clean_words)), trans, file=sys.stderr)
            valid_utterance = False

        # Exclude utterance if langid thinks it is english
        if removeEng and use_langid and valid_utterance:
            lang, prob = identifier.classify(cleaned_trans)
            if lang == 'en' and prob > 0.5:
                valid_utterance = False

        # Something was bad in utterance
        if not valid_utterance:
            continue

        # Should be a clean valid utterance
        utt['transcript'] = cleaned_trans
        cleaned_data.append(utt)

    return cleaned_data
global letters_dict
for i in alphabet:
    letters_dict[i] = 0
letters_dict['`'] = -1
alpha_dict()
number_of_letters = int(input("How many letters are in your word?: "))
for i in range(number_of_letters):
    show_list.append("_")
number_of_attempts = int(input("How many attempts do you give the computer?: "))
word_list = word_list + w.words()
for i in word_list:
    if len(i) == number_of_letters:
        candidates.append(i)

def show_user():
    global show_list
    update_show_list()
    print(" ".join(show_list))

def check_for_best_word():
    global candidates
    global common_word_list
    for i in range(len(common_word_list)):
        for j in range(len(candidates)):
            if common_word_list[i] == candidates[j]:
from textblob import TextBlob
from nltk.corpus import stopwords, words

stops = set(stopwords.words())
english_words = set(w.lower() for w in words.words())

def preprocess(corpus):
    blob = TextBlob(corpus)
    ret = []
    for sent in blob.sentences:
        sent_list = []
        for word in sent.words.lower().lemmatize():
            if word in english_words and word not in stops:
                sent_list.append(word)
        ret.append(sent_list)
    return ret

def tfidf_preprocess(corpus):
    blob = TextBlob(corpus)
    ret = []
    for word in blob.words.lower():
        if word in english_words and word not in stops:
            ret.append(word)
    return ' '.join(ret)
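# (Added usage sketch) A quick illustration of the two helpers above, assuming
# textblob and the nltk corpora are installed; the exact tokens returned depend on
# the corpora, so no specific output is guaranteed:
if __name__ == "__main__":
    sample = "The cats were chasing mice in the garden. It was a sunny day."
    print(preprocess(sample))        # list of per-sentence token lists
    print(tfidf_preprocess(sample))  # single whitespace-joined string for a vectorizer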
from functions import get_filename_list
import json
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords, words
import pickle

def stemm_paragraph(paragraph):
    return [ps.stem(word) for word in word_tokenize(paragraph)]

ps = PorterStemmer()
stop_words = stopwords.words('english')
valid_words = set([ps.stem(x) for x in words.words()])
index = {}

for k, filename in enumerate(get_filename_list()):
    print(k, filename)
    json_file = json.loads(open(filename).read())
    paragraphs = [x['text'] for x in json_file['body_text']]
    for i, paragraph in enumerate(paragraphs):
        stemmed_paragraph_tokens = stemm_paragraph(paragraph)
        for token in stemmed_paragraph_tokens:
import argparse
import os
import json
from spacy.lang.en import stop_words
import twikenizer as twk
import re
import string
from nltk.corpus import words

spacy_stopwords = stop_words.STOP_WORDS
tokenizer = twk.Twikenizer()
word_dictionary = list(set(words.words()))
for alphabet in "bcdefghjklmnopqrstuvwxyz":
    word_dictionary.remove(alphabet)

def append_json(json_data, file_path):
    """
    Write json to file
    :param json_data: Json to write
    :param file_path: Path of the file
    """
    with open(file_path, 'a+') as f:
        json.dump(json_data, f)
        f.write("\n")

def split_hashtag_to_words_all_possibilities(hashtag):
    all_possibilities = []
    split_posibility = [hashtag[:i] in word_dictionary for i in reversed(range(len(hashtag) + 1))]
def random_words():
    corpus = list(random.sample(words.words(), 700))
    return corpus
import time
import pandas as pd
import re
import argparse
from json import JSONDecoder, JSONDecodeError
import io
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('words')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import words

stop_words = set(stopwords.words('english'))
stopword_set = set(stopwords.words())
word_set = set(words.words())

from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
plt.switch_backend('Agg')

### set up plotting parameters
from matplotlib import rcParams
plt.style.use('seaborn-poster')
plt.rcParams['font.family'] = 'serif'
rcParams['font.sans-serif'] = ['Palatino']
rcParams['figure.max_open_warning'] = 30

### This saves the credentials
KIDS = config.KIDS
CUR_FILE_DIR = os.path.dirname(os.path.realpath(__file__))
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word):
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

#%%
# Custom tagging each word
english_vocab = set(w.lower() for w in wd.words())
stop_words = stopwords.words('english')
# wn.synsets('motorcar')

def CustomTag(word):
    if word.isnumeric():
        return "Num"
    elif word.isalnum() and not word.isnumeric() and not word.isalpha():
        return "Mixed"
    elif word in punctuation:
        return "Pun"
    elif word in english_vocab:  # wn.synsets(word):
        return "Known"
    elif word in stop_words:
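# (Added sketch) The fragment above is the tail of a Norvig-style edits1(); for
# context, the usual companion helpers look roughly like the following. WORD_COUNTS
# is a hypothetical frequency Counter -- how it is built is not shown in the
# original, so treat this as an illustration only:
from collections import Counter

WORD_COUNTS = Counter()   # e.g. Counter over a tokenized training corpus

def known(candidates):
    # Keep only candidates that appear in the frequency table
    return set(w for w in candidates if w in WORD_COUNTS)

def correction(word):
    # Most frequent known candidate at edit distance 0, 1, then 2
    choices = known([word]) or known(edits1(word)) or known(edits2(word)) or [word]
    return max(choices, key=lambda w: WORD_COUNTS[w])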
nlp = spacy.load('en_core_web_lg')

def get_lemma(word):
    if len(word.split()) != 1:
        raise ValueError('Only input >>>single<<< words')
    doc = nlp(word)
    lemma = doc[0].lemma_
    return (lemma)

recommended = load_lexicon('recommended')

# Get NLTK corpus
print('Lemmatizing NLTK')
words = words.words()
lemmas = [get_lemma(word) for word in tqdm(words, desc='NLTK lemmas')]
lemmas = set(lemmas)

print('Lemmatizing SNOMED')
recommended_lemmas = []
for syn in tqdm(recommended, desc='SNOMED CT lemmas'):
    doc = nlp(syn.term)
    for token in doc:
        recommended_lemmas.append(token.lemma_)
recommended_lemmas = set(recommended_lemmas)

remove_list = list(lemmas - recommended_lemmas)

with open("dict_strings.csv", 'w', newline='') as resultFile:
if "http" not in line: current_category = line continue else: url = line con = urllib2.urlopen(url, timeout=60) html = con.read() soup = BeautifulSoup(html) page = soup.findAll('p') #.getText() word_data = {} for i in page: word_list = i.getText().split(" ") for word in word_list: real_word = re.sub('[^a-zA-Z]', '', word) print current_category + "..checking.." + real_word if real_word not in words.words() or real_word == "": continue if real_word not in word_data: word_data[real_word] = {} if current_category not in word_data[real_word]: word_data[real_word][current_category] = 0 else: word_data[real_word][current_category] += 1 # write to csv file with open("result.csv", 'wb') as file: a = csv.writer(file) for word in word_data: entry = [] entry.append(word)
def _get_words(self):
    from nltk.corpus import words
    return words.words()
from nltk.tokenize import wordpunct_tokenize
from nltk.probability import FreqDist
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth
from sklearn.manifold import MDS
from scipy.cluster.hierarchy import ward, dendrogram
from gensim import corpora, models

# filter stopwords
stopwords = set(stopwords.words('english'))
en_words = set(words.words())
stemmer = SnowballStemmer('english')

book_id = [
    '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12',
    '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24'
]
titles = [
    'THE HISTORY OF THE DECLINE AND FALL OF THE ROMAN EMPIRE. VOL. VI',
    'THE HISTORIES CAIUS COBNELIUS TACITUS',
    'THE WORK OF JOSEPH US, THE JEWISH WAR. VOL. IV',
    'THE HISTORY OF THE DECLINE AND FALL OF THE ROMAN EMPIRE. VOL, I',
    'THE HISTORY OF TACITUS. BOOK I. VOL. V',
    'THE FIRST AND THIRTY-THIRD BOOKS OF PLINY\'S NATURAL HISTORY',
    'THE HISTORY OF THE ROMAN EMPIRE. VOL. V',
    'THE HISTORY OF THE DECLINE AND FALL OF THE ROMAN EMPIRE. VOL. II',
    'THE HISTORY OF THE PELOPONNESIAN WAR. VOL. II',
def extract_runtime(runtime_str):
    runtime = 0
    elems = runtime_str.split(' ')
    if len(elems) > 1:
        runtime += int(elems[0]) * 24 * 60 * 60
        t = elems[1]
    else:
        t = elems[0]
    telems = t.split(':')
    runtime += int(telems[0]) * 60 * 60 + int(telems[1]) * 60 + int(telems[2])
    return runtime

extract_runtime_udf = udf(extract_runtime, StringType())

nltk.download('words')
english_words = words.words()
salt = str(random.SystemRandom().random())

def anonymize(s):
    random.seed(str(s) + salt)
    return random.choice(english_words)

anonymize_udf = udf(anonymize, StringType())
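# (Added usage sketch) Because anonymize() seeds the RNG with the value plus a
# per-run salt, equal inputs map to the same pseudonym within a run but differ
# across runs; a small local illustration (no Spark session needed):
if __name__ == "__main__":
    print(anonymize("alice@example.com") == anonymize("alice@example.com"))  # True
    print(anonymize("alice@example.com") == anonymize("bob@example.com"))    # almost surely False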
from nltk.corpus import words

setofwords = set(words.words())

def dict_check(list):
    valid_words = []
    for l in list:
        if l in setofwords:
            valid_words.append(l)
    return valid_words
def create_base_network(input_shape):
    input = Input(shape=input_shape)
    x = ZeroPadding2D(padding=(0, 1), data_format='channels_last')(input)
    x = Conv2D(64, (27, 3), activation='relu')(x)
    x = Flatten()(x)
    x = Dense(1024, activation='relu')(x)
    return Model(input, x)

def binary_accuracy(output_true, output_pred):
    return K.mean(K.equal(output_true, K.round(output_pred)), axis=-1)

words_list = words.words()
book = nltk.corpus.gutenberg.words(u'austen-persuasion.txt')
book_text = nltk.Text(book)
words_list2 = book_text.tokens

alphabet = []
for letter in range(97, 123):
    alphabet.append(chr(letter))

words_to_train1 = w.create_wordlist(words_list, words_number)
words_to_train2 = w.create_wordlist(words_list, words_number)
words_to_val1 = w.create_wordlist(words_list2, words_number)
words_to_val2 = w.create_wordlist(words_list2, words_number)
labels_train = w.create_labels(words_to_train1, words_to_train2, words_number)
labels_val = w.create_labels(words_to_val1, words_to_val2, words_number)
def is_word(s: str) -> bool:
    return s.lower() in words.words()
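# (Added sketch) Each call above re-reads the ~236k-entry corpus list; if is_word()
# is called in a loop, a module-level frozenset makes the check O(1). A minimal
# variant, assuming the nltk 'words' corpus is installed:
from nltk.corpus import words as _nltk_words

_WORD_SET = frozenset(w.lower() for w in _nltk_words.words())

def is_word_cached(s: str) -> bool:
    return s.lower() in _WORD_SET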
# 11-June-2020 | Tapas Mohanty | Jahar Sil and Abhisek Kumar | Initial Release
# =========================================================================================================================
# =========================================================================================================================
# Import required Module / Packages
# -------------------------------------------------------------------------------------------------------------------------
import nltk
import re
from bs4 import BeautifulSoup
import unicodedata
from contractions import CONTRACTION_MAP
from nltk.corpus import wordnet
from nltk.tokenize.toktok import ToktokTokenizer
import en_core_web_sm
from nltk.corpus import words
engwords = words.words()
import traceback

###########################################################################################################################
# Author        : Tapas Mohanty
# Co-Author     : Jahar Sil and Tapas Mohanty
# Modified      :
# Reviewer      :
# Functionality : Tokenizing the keywords
###########################################################################################################################
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
# nlp = spacy.load('en', parse=True, tag=True, entity=True)
nlp = en_core_web_sm.load()
# nlp_vec = spacy.load('en_vectors_web_lg', parse=True, tag=True, entity=True)
from siteswapClass import siteswap
import nltk
from nltk.corpus import words
# nltk.download()

word_list = words.words()  # fetch the corpus once instead of on every loop iteration
for i in range(0, 10000):
    if siteswap(word_list[i]).isValid():
        print(word_list[i])
def wordUB(self, word):
    if word not in words.words() and word in self.ud:
        return(True)
    else:
        return(False)
import re
from nltk.corpus import stopwords, words
from tools.utils import save_and_reload_df
from tools.data_handling import enrich_emails, unique_recipients, address_book
from tools.features import stem

stopwords = set(stopwords.words("english"))
english_words = set(words.words())

def remove_after_indicator(text, indicator):
    '''Removes everything in text after indicator if found. If not found, leaves
    text as is.
    Arguments:
        - text (str): the text you want to shorten.
        - indicator (str): the indicator after which you want to cut the text.
    Output:
        - str: the shortened text.
    '''
    indic_match = re.search(indicator, text)
    if indic_match:
        simple_text = text[:indic_match.span(0)[0]]
    else:
        simple_text = text
    return simple_text

def remove_punctuation(text):
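# (Added usage sketch) remove_after_indicator treats `indicator` as a regular
# expression and truncates the text at its first match; a quick illustration:
if __name__ == "__main__":
    email = "Thanks for the update.\n-----Original Message-----\nFrom: someone"
    print(remove_after_indicator(email, "-----Original Message-----"))
    # -> "Thanks for the update.\n"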
from gensim.corpora import Dictionary
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_distances

tqdm.pandas()

##########################################################################
# Word Resources
##########################################################################
nltk.download('punkt')
nltk.download('words')
nltk.download('stopwords')

some_other_words = ['your exclude words']
Englishtext = set(W.words())
my_stopwords = {'able'}
stop_words = set([word for word in stopwords.words('english')])
stop_words = stop_words.union(STOPWORDS).union(STOP_WORDS).union(my_stopwords)

##########################################################################
# Data Cleaning functions for projects SOW
##########################################################################
'''
This function expects no NaN in the input dataframe, so replace NaN with an empty string.
projectSowData is a pandas dataframe whose columns need to include projectID and StatementOfWork;
it will return a pandas df and a dictionary, the df is the filtered dataframe
import numpy as np
import linecache
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
import io
from rake_nltk import Rake
from nltk.corpus import words
import string
from IPython.display import display_html
from itertools import chain, cycle

my_exclusions = ['--', '–', 'i', 'ii', 'iii', 'iv', 'v', 'vi', 'okay', 'et', 'cetera']
exclusion_list = words.words() + my_exclusions

def display_side_by_side(*args, titles=cycle([''])):
    '''
    Pulled from stackoverflow by @ntg
    Displays dataframes side by side and allows setting titles for each dataframe
    '''
    html_str = ''
    for df, title in zip(args, chain(titles, cycle(['</br>']))):
        html_str += '<th style="text-align:center"><td style="vertical-align:top">'
        html_str += f'<h2>{title}</h2>'
        html_str += df.to_html().replace('table', 'table style="display:inline"')
        html_str += '</td></th>'
    display_html(html_str, raw=True)
    return None

def pdfparser(data):