Example #1
File: pyesol.py Project: brythonick/pyesol
 def add_sample(self, sample):
     if not isinstance(sample, str):
         raise TypeError
     # Calling add_sample should replace existing sample.
     # To avoid appending new values onto existing lists:
     self.sample = sample
     self.misspelled_words = []
     self.tokenized_sample = []
     self.tagged_sample = {}
     sample = sample.replace('\n', " ")
     sample = sample.rstrip(" ")
     for char in punctuation.replace("'", ""):
         sample = sample.replace(char, "")
     tokens = word_tokenize(sample)
     for word in tokens:
         if word.lower() in words.words():
             self.tokenized_sample.append(word)
         elif word.capitalize() in names.words():
             continue
         elif "'" in word:
             self.tokenized_sample.append(word)
         elif LEMMATIZER.lemmatize(word.lower()) not in words.words():
             if STEMMER.stem(word.lower()) not in words.words():
                 self.misspelled_words.append(word)
         else:
             self.tokenized_sample.append(word)
     self.tagged_sample = pos_tag(tokens)
Example #2
def divide(s):
    first = ''
    for i in range(len(s)):
        first += s[i]
        print(first)
        if first in words.words() and s[i + 1:] in words.words():
            return ' '.join([first, s[i + 1:]])
    return False
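A hedged usage sketch of divide (assumes the NLTK words corpus has been downloaded with nltk.download('words'); the exact output depends on the corpus contents):

from nltk.corpus import words

print(divide("butterfly"))  # may print "butter fly" once both halves are found in words.words()
print(divide("xqzt"))       # returns False when no split into two corpus words exists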
Example #3
def raw_files_to_labeled_features(raw_files, label_file):
    # Initialize spark
    conf = SparkConf().setAppName("SpamFilter").setMaster("local[*]")
    sc = SparkContext(conf=conf)

    # Get the set of words that we will be accepting as valid features
    valid_words = set(w.lower() for w in words.words())

    # Load training data and convert to our desired format
    raw_files = sc.wholeTextFiles(raw_files)

    # Extract a document of filtered words from each text file
    documents = raw_files.map(lambda x: (x[0], extract_words(x[1], valid_words)))

    # Calculate TF-IDF values for each document
    tfidf = calculate_tfidf(documents)

    # Load labels
    labels = sc.parallelize(load_labels(label_file)).map(lambda x: x[0])

    # Append indexes to features and labels
    indexed_labels = labels.zipWithIndex().map(lambda x: (x[1],x[0]))
    indexed_features = tfidf.zipWithIndex().map(lambda x: (x[1],x[0]))

    # Join labels and features into tuples and return
    return indexed_labels.join(indexed_features).map(lambda x: x[1]).collect()
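The helpers extract_words and calculate_tfidf are used above but not shown. A minimal sketch of what extract_words might look like (hypothetical; the real project may differ):

import re

def extract_words(text, valid_words):
    # Hypothetical helper: lowercase the text, split it into alphabetic tokens,
    # and keep only the tokens present in the accepted vocabulary.
    tokens = re.findall(r"[a-z]+", text.lower())
    return [t for t in tokens if t in valid_words]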
Example #4
 def _english_wordlist(self):
     try:
         wl = self._en_wordlist
     except AttributeError:
         from nltk.corpus import words
         wl = self._en_wordlist = set(words.words('en-basic'))
     return wl
Example #5
def get_vocab():
    word_list = words.words()
    lowercased = [t.lower() for t in word_list]
    STEMMER = PorterStemmer()
    stemmed = [STEMMER.stem(w) for w in lowercased]
    vocab = list(set(stemmed))
    return vocab
Example #6
File: okreader.py Project: ned2/okdata
def get_english_vocab(lemmatize=False):
    vocab = (w.lower() for w in words.words())

    if lemmatize:
        stemmer = PorterStemmer()
        vocab = (stemmer.stem(w) for w in vocab)
    return set(vocab)
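Illustrative usage (assumes nltk.download('words') has been run); with lemmatize=True the returned set holds Porter stems, so look-ups should be stemmed too:

from nltk.stem.porter import PorterStemmer

vocab = get_english_vocab(lemmatize=True)
print(PorterStemmer().stem("running") in vocab)  # True if the stem "run" is in the corpus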
Example #7
 def __init__(self):
     self.stopwords = stopwords.words('english')
     self.uscities = set([w.lower() for w in gazetteers.words('uscities.txt')])
     self.usstates = set([w.lower() for w in gazetteers.words('usstates.txt')])
     self.countries = set([w.lower() for w in gazetteers.words('countries.txt')])
     self.basicwords = set(words.words('en-basic'))
     self.paragraph_tokens = []
     self.texts = []
Example #8
def unknown(textString):

    unW = []
    k = re.findall(r'(?<= )[a-z]+\b', textString)        # Keeps lower-case words preceded by a space (skips capitalized words)
    print(textString)
    for w in k:                                          # Check each extracted word
        if w not in words.words():                       # If a website word is not in the NLTK word dictionary:
            unW.append(w)                                # add it to the unknown list
    print(unW)                                           # Print the words that are not in the NLTK word dictionary
    return unW
Example #9
 def __init__(self, dict_path = '/etc/dictionaries-common/words'):
      f = open(dict_path)
      
      # We use two dictionaries for better coverage
      d1 = set([w.lower() for w in f.read().split()])
      d2 = set([w.lower() for w in words.words()])
      
      self.words = set(d1.union(d2))
Example #10
def unknown(url):
    # get the HTML, as a string
    html = str(bs(urllib.urlopen(url).read()))
    # find all substrings
    substrings = set(re.findall(r'[a-z]+', html))
    # specify the wordlist
    wordlist = words.words()
    # return the words not in the wordlist
    return [word for word in substrings if word not in wordlist]
Example #11
    def __init__(self,
                 corpora_list=['all_plaintext.txt', 'big.txt'],
                 parse_args=(True, True, True, True, True)):

        #Set the parsing arguments
        self.remove_stopwords = parse_args[0]
        self.tag_numeric = parse_args[1]
        self.correct_spelling = parse_args[2]
        self.kill_nonwords = parse_args[3]
        self.stem = parse_args[4]

        #Alphabet
        self.alphabet = 'abcdefghijklmnopqrstuvwxyz'

        #Punctuation
        self.punc_dict = {ord(c): None for c in string.punctuation}

        #Reserved tags
        self.reserved_tags = ['numeric_type_hex',
                              'numeric_type_binary',
                              'numeric_type_octal',
                              'numeric_type_float',
                              'numeric_type_int',
                              'numeric_type_complex',
                              'numeric_type_roman',
                              'math_type']

        #Update the set of nltk words with the additional corpora
        self.all_words = set(words.words())
        self.all_words.update('a')
        self.all_words.update('i')
        self.all_words.update(self.reserved_tags)
        self.max_word_length = 20

        #Set up the stopwords, remove 'a' due to math issues
        self.stops = set(stopwords.words("english"))
        self.stops.remove('a')
        self.stops.remove('no')

        #Set up the stemmer
        self.st = SnowballStemmer('english')

        #Train the spelling corrector using all corpora
        train_text = ''
        for cfile in corpora_list:
            words_in_file = open(cfile).read()
            self.all_words.update(self.get_all_words(words_in_file))
            train_text = train_text + words_in_file

        #Remove single character terms
        wordlist = list(self.all_words)
        wordlist = [i for i in wordlist if len(i) > 1]
        self.all_words = set(wordlist)
        self.all_words.update('a')
        self.all_words.update('i')

        self.NWORDS = self.train(self.get_all_words(train_text))
Example #12
def tokenize4(text):
	wordnet_lemmatizer = WordNetLemmatizer()
	tokens             = word_tokenize(text)
	wordset            = set(words.words())
	tokens             = [wordnet_lemmatizer.lemmatize(token, NOUN) for token in tokens]
	tokens             = [wordnet_lemmatizer.lemmatize(token, VERB) for token in tokens]
	tokens             = [wordnet_lemmatizer.lemmatize(token, ADJ) for token in tokens]
	tokens             = [token for token in tokens if token in wordset]
	return tokens
Example #13
def extractingFromFolders():
    folder2 = os.path.expanduser('~\\My Documents\\Tara\\Ongoing\\CharacterCorpus\\Reference')
    fileresult = os.path.expanduser('~\\My Documents\\Tara\\Ongoing\\CharacterCorpus\\results.txt')
    refer = PlaintextCorpusReader(folder2, 'harrygrepster.txt')
    grepster = refer.words()
    results = open(fileresult, 'a')
    completeWords = wordlist.words()
    stoppers = stopwords.words()
    return grepster, results, completeWords, stoppers
Example #14
def unknown(url):
    # get the HTML, as a string
    html = str(bs(urllib.urlopen(url).read()))
    # find all substrings
    substrings = set(re.findall(r"\s+([a-zA-Z]+)\s+", html))
    substrings = [word.lower() for word in substrings]
    # specify the wordlist
    wordlist = words.words()
    # return the words not in the wordlist
    return [word for word in substrings if word not in wordlist]
Example #15
def textParse(file):
    processedText = ''
    with open(file, 'r') as f:
        lines = f.read().splitlines()
        for line in lines:
            wordsInLine = line.split(' ')
            for word in wordsInLine:
                # print '*'+word+'*'
                if word.lower() in words.words():
                    processedText += word + ' '
    return processedText
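Because words.words() returns a plain list, the membership check above rescans the whole corpus for every token. A sketch of a faster variant that builds a lowercase set once (same filtering, assuming the words corpus is available):

from nltk.corpus import words

ENGLISH = set(w.lower() for w in words.words())

def textParseFast(file):
    # Same behaviour as textParse above, but with O(1) membership tests.
    processedText = ''
    with open(file, 'r') as f:
        for line in f.read().splitlines():
            for word in line.split(' '):
                if word.lower() in ENGLISH:
                    processedText += word + ' '
    return processedText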
Example #16
def getReadabilityScore(tweet):
    w1 = tweet.split(" ")
    ASL1 = len(w1)                  # number of words (the whole tweet is treated as one sentence)
    AOV1 = 0                        # count of out-of-vocabulary words
    l = 0                           # running total of characters
    for w in w1:
        l += len(w)
        if w not in words.words():
            AOV1 += 1
    ASW1 = l/float(ASL1)            # average word length in characters
    S1 = 206.835 - (1.015*ASL1) - (84.6*ASW1) - (10.5*AOV1)   # Flesch-style score with an out-of-vocabulary penalty
    return S1
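A rough worked example of the formula on a hypothetical three-word tweet, assuming only "foo" is missing from the corpus:

ASL1, letters, AOV1 = 3, 5 + 5 + 3, 1              # "hello world foo"
ASW1 = letters / float(ASL1)                       # about 4.33 letters per word
S1 = 206.835 - (1.015 * ASL1) - (84.6 * ASW1) - (10.5 * AOV1)
print(round(S1, 1))                                # about -173.3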
Example #17
def exercise_unusual_words():
    text = gutenberg.words("austen-sense.txt")

    # Extract the vocabulary from the text: drop non-alphabetic tokens, convert to lowercase
    text_vocab = set(w.lower() for w in text if w.isalpha())

    # Extract the vocabulary from the dictionary
    english_vocab = set(w.lower() for w in words.words())

    # Find the unusual (possibly misspelled) words in the text
    unusual_vocab = text_vocab.difference(english_vocab)

    print(sorted(unusual_vocab))
Example #18
def anagrams_for(word):
  # Generate permutations of the word's letters (excluding the word itself)
  # and print the ones that are real words according to words.words().
  # example using itertools: list(itertools.permutations([1,2,3,4], 2))
  real_words = set(words.words())
  for letters in itertools.permutations(word, len(word)):
    new_word = ''.join(letters)
    if new_word != word and new_word in real_words:
      print(new_word)
Example #19
def precomputeFromNLTK():
    """
    precompute with nltk's corpus as wordbase
    """
    language = set()
    print(len(words.words()))
    for word in words.words():
        word = word.lower()
        sortW = "".join(char for char in sorted(word))
        if sortW[0] >= "a" and sortW[0] <= "z":
            word = word + ":" + sortW
            language.add(word)
    print("Loaded %d words from NLTK wordnet" % (len(language)))
    buckets = [set() for x in xrange(25)]
    for word in language:
        buckets[len(word) / 2].add(word)
    count = 0
    for word in language:
        if count % 1000 == 0:
            print("Done for %d words" % count)
        count += 1
        sortedW = word.split(":")[1]
        if sortedW not in nltkHashMap:
            nltkHashMap[sortedW] = set()
            for word2 in buckets[len(sortedW)]:
                sortedW2 = word2.split(":")[1]
                if sortedW == sortedW2:
                    nltkHashMap[sortedW].add(word2.split(":")[0])
    file = open(nltkAnagramsFile, "w")
    file.truncate()
    count = 0
    for anagrams, listOfAnagrams in nltkHashMap.items():
        if count % 1000 == 0:
            print("%d anagram lists written" % count)
            file.flush()
        count += 1
        file.write("%s:%s\n" % (anagrams, listOfAnagrams))
    file.close()
    print("Precomputation with NLTK done")
Example #20
def unknown(url):
	"""Takes a URL as its argument and returns a list of unknown words that occur on that webpage."""
	
	# gets the text of the page
	html = request.urlopen(url).read().decode('utf8')
	raw = BeautifulSoup(html).get_text()
	junk = set(words.words())
	# finds the lower case words by searching for a word boundary plus one or more lower case letters
	lower_case_words = re.findall(r'\b[a-z]+', raw)

	# searches through the list of lower case words and gets rid of those not in the words corpus.
	unknowns = [word for word in lower_case_words if word not in junk]
	print(unknowns)
Example #21
 def __init__(self,conf):
     self.translation = conf.translation.translate
     self.translation_store = conf.translation.store
     self.translate_threshold = conf.translation.threshold
     self.translate_failed = False
     self.translate_status = False
     self.text = ""
     self.source = ""
     self.tokens= []
     self.tweet_id = ""
     if self.translation:
         self.english_vocab = set(w.lower() for w in words.words())
     self.stopfile = conf.stopwords.filename
     self.stopset = set(open(self.stopfile, 'r').read().split())
Example #22
    def __init__(self):

        self.dict_anagrams = {}

        for t in words.words():
            word = str.lower(str(t))
            word = word.replace('-',' ')

            alpha_count = get_alphaCount(word)

            if alpha_count in self.dict_anagrams:
                self.dict_anagrams[alpha_count].add(word)
            else:
                self.dict_anagrams[alpha_count] = {word}
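The get_alphaCount helper is not shown above. One plausible sketch (an assumption, not the original implementation) that makes anagrams share a key:

def get_alphaCount(word):
    # Hypothetical helper: a tuple of per-letter counts, so any two anagrams
    # (spaces ignored) map to the same dictionary key.
    return tuple(word.count(c) for c in 'abcdefghijklmnopqrstuvwxyz')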
Example #23
def main():
    """ Let's go through the words and return a list of nick names for
    Diane Stitt .... yes! """

    stitt_dict = {}
    for word in words.words('en'):
        # find a word that includes two t's 
        if find_pair(word):
            data_structure(add_stitt(find_pair(word)), # Add Stitt to word
                           word, # include original word for comparison
                           stitt_dict) # use a dict to group nickname dups
            
    # return a dictionary where each key is a program generated nickname
    # mapped to a list of all words that generate that nickname
    return stitt_dict
Example #24
def baselineCorrector(word):
    arr = sorted(words.words())
    if word in arr:
        return word
    low = 0
    high = len(arr) - 1
    while True:
        mid = (low + high) // 2
        midval = arr[mid]
        if high <= low:
            return arr[mid+1] if midval < word else midval
        if midval < word:
            low = mid + 1
        elif midval > word:
            high = mid - 1
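Illustrative call (assumes the corpus is downloaded); for a token that is not in the corpus, the binary search falls back to an alphabetically adjacent entry:

print(baselineCorrector("aple"))  # prints a neighbour of "aple" in sorted(words.words())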
Example #25
def build_dictionary_index():
    global dict_index
    s = set(words.words('en'))
    for word in s:
        if len(word) > 2:  # Ignore smallest words
            word = str.lower(word)
            try:
                dict_index[word[0]]
                try:
                    dict_index[word[0]][len(word)] += [word]
                except KeyError:
                    dict_index[word[0]][len(word)] = [word]
                except TypeError:
                    print(dict_index[word[0]][len(word)])
            except KeyError:
                dict_index[word[0]] = {len(word): [word]}
Example #26
def DeleteDictWords(data, dictEN, dictFR, french_dict_path):
    
    if dictEN == 1:
        english_dict = words.words('en')
    else:
        english_dict=[]
        
    if dictFR == 1:
        french_dict = [line.rstrip('\n') for line in open(french_dict_path, 'r'
                                                            ,encoding='utf-8')]   
    else : 
        french_dict = []
    
    dico = english_dict + french_dict
    col_dico = [i for i in data.columns.values if i in dico]
      
    return pd.DataFrame(data, columns = col_dico)            
Example #27
def word_finder():
    from nltk.corpus import words
    wordlist = words.words()
    random.shuffle(wordlist)
    wordlist = wordlist[:200]
    wordlist = [w for w in wordlist if 3 <= len(w) <= 12]
    grid, used = wordfinder(wordlist)

    print("Word Finder\n")
    for i in range(len(grid)):
        for j in range(len(grid[i])):
            print(grid[i][j], end=' ')
        print()
    print()

    for i in range(len(used)):
        print("%d:" % (i+1), used[i])
Example #28
def make_index():
    s = set(words.words('en'))
    index = {}
    for word in s:
        if len(word) > 2:
            word = str.lower(word)
            try:
                index[word[0]]
                try:
                    index[word[0]][len(word)] += [word]
                except KeyError:
                    index[word[0]][len(word)] = [word]
                except TypeError:
                    print(index[word[0]][len(word)])
            except KeyError:
                index[word[0]] = {len(word): [word]}
    return index
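Illustrative lookup in the index built above, which maps first letter, then word length, to a list of words (assumes the 'en' wordlist is available):

index = make_index()
print(index.get('q', {}).get(5, [])[:10])  # up to ten 5-letter words starting with 'q'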
Example #29
def clean_pdf( search_file ):
	search_file = 'scraped_'+search_file+'.txt'
	file = open(search_file, 'r')
	text = file.read()
	text_only = BeautifulSoup(text).get_text()
	rm_symbol = re.sub(r'[^\w]', ' ', text_only)
	letters_only = re.sub("[^a-zA-Z]", " ", rm_symbol )
	lower_case = letters_only.lower()
	words_text = set(lower_case.split())
	stops = set(stopwords.words("english"))
	english_words = words.words()
	rm_stopwords = [w for w in words_text if not w in stops]
# In order to select the meaningful words, we can either keep those that belong to the English dictionary,
# or instead drop those that do, so that only terms specific to the subject remain:
# meaningful_words = [w for w in rm_stopwords if w in english_words]
# This also suggests a simple way of catching typos: look for the words that do NOT belong to the dictionary.
	meaningful_text = ' '.join(rm_stopwords)#meaningful_words
	return meaningful_text
Example #30
def demo():
    from nltk.corpus import words

    wordlist = words.words()
    random.shuffle(wordlist)
    wordlist = wordlist[:200]
    wordlist = [w for w in wordlist if 3 <= len(w) <= 12]
    grid, used = wordfinder(wordlist)

    print "Word Finder\n"
    for i in range(len(grid)):
        for j in range(len(grid[i])):
            print grid[i][j],
        print
    print

    for i in range(len(used)):
        print "%d:" % (i + 1), used[i]
Example #31
#!/usr/bin/python
# -*- coding: utf-8 -*-

from nltk.corpus import names, stopwords, words
print(words.fileids())
print(words.words('en'))  # doctest: +ELLIPSIS
print(stopwords.fileids())  # doctest: +ELLIPSIS
print(stopwords.words('portuguese'))  # doctest: +ELLIPSIS
print(names.fileids())
print(names.words('male.txt'))  # doctest: +ELLIPSIS
print(names.words('female.txt'))  # doctest: +ELLIPSIS

from nltk.corpus import cmudict
print(cmudict.entries()[653:659])  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# Load the entire cmudict corpus into a Python dictionary:
transcr = cmudict.dict()
print([transcr[w][0] for w in 'Natural Language Tool Kit'.lower().split()])
Example #32
    elif b[len(b)-2]==("/news"):
        def NewsFromBBC():
            main_url = " https://newsapi.org/v1/articles?source=bbc-news&sortBy=top&apiKey=28761f98c19940c78675c38d6babb8b7"
            open_bbc_page = requests.get(main_url).json()
            article = open_bbc_page["articles"]
            results = []
            for ar in article:
                results.append(ar["title"])
            for i in range(len(results)):
                message.send_keys(i + 1, results[i])
                message.send_keys(Keys.RETURN)
        if __name__ == '__main__':
            NewsFromBBC()

    elif b[len(b)-2]==("/hangman"):
        word_list = words.words()
        vowels = ["a", "e", "i", "o", "u"]
        message.send_keys("welcome to hangman ")
        time.sleep(1)
        message.send_keys(Keys.RETURN)
        chosenword = random.choice(word_list)
        message.send_keys("the length of the chosen word is: ", len(chosenword))
        time.sleep(1)
        message.send_keys(Keys.RETURN)
        lettersleft = len(chosenword)

        print(chosenword)
        for windex in range(len(chosenword)):
            for windexer in range(5):
                if chosenword[windex] == vowels[windexer]:
                    message.send_keys("Letter number ", windex + 1, " is the letter ", vowels[windexer])
Example #33
            soundexWord = name[0].upper() + addList
        else:
            appendList = finalVal
            while len(appendList) < 3:
                appendList.append('0')
            addList = ''.join(appendList)

            soundexWord = name[0].upper() + addList

        return soundexWord


if __name__ == '__main__':

    #maxMatch Algorithm
    wordlist = words.words()
    sentence = open('barack.txt', 'r').read()
    appStart = 'that you work'
    appEnd = 'respect'
    barK = re.search(r'(?<= that you work)(.*)(?=respect)', sentence).group(1)
    sentence = appStart + barK + appEnd
    sentence = re.sub(r'[^\w]', ' ', sentence)
    sentence = sentence.replace(" ", "")
    result = []
    maxMatch(sentence, wordlist, result)
    strResult = ''.join(str(w + ' ') for w in result)
    print("maxMatch Output=")
    print(strResult)

    #Soundex Algorithm
    amePresident = ['donald', 'kamala', 'hillary']
Example #34
import random
from nltk.corpus import words
word_list = words.words()  # Making a list of all the words using nltk package


# Method to generate a word for the game
def getWord():
    word = random.choice(word_list)
    return word.upper()


# Method to start the game
def play(word):
    wordCompletion = "_" * len(word)
    guessed = False
    guessedLetters = []
    guessedWords = []
    tries = 6
    print("Let's play Hangman!")
    print(displayHangman(tries))
    print(wordCompletion)
    print("\n")
    while not guessed and tries > 0:
        guess = input("Please guess a letter or a word").upper()
        if (len(guess) == 1 and guess.isalpha()):
            if (guess in guessedLetters):  # The letter is already guessed
                print("You have already guessed this letter: ", guess)
            elif (guess
                  not in word):  # The letter is not present in the answer
                print(guess, " is not in word.")
                tries -= 1  # Reducing the number of tries
Example #35
def home():
    # General formatting and layout

    # Copywriting, storytelling and formatting
    st.markdown(
            """
    # The #1 Spam Detector Online
    Email Spam Detection Using Python & Machine Learning
    
    """
        )
     
    st.markdown(
            """
    ## Zero Risk does not exist
    """)
    
    text="""
    Spear-phishing attacks playing on people's fears related to Coronavirus 
    increased by **667%** between February and June 2020
    """
    
    image = Image.open('covid.jpg')
    st.image(image, use_column_width=True)
    
    st.markdown(""" 
                There are many reasons in today's environment to be wary of email that seems in any way suspicious.
                Some email messages might be phishing scams; some might contain viruses and other malicious software.
                """)
                
    st.markdown("""
               According to ANSSI (the French National Information Systems Security Agency), phishing aims at making the recipient of an apparently legitimate email send his bank data or login credentials to financial services, 
                in order to steal money from him. In addition to phishing, spam emails can also take the form of advertising or a scam also known as the "Nigerian scam".
                """)
                
    st.markdown("""  
                Coronavirus spear phishing email attacks detected by Barracuda Networks increased from 137 in January 2020 to more than 9,116 in the first 23 days of March 2020.
                """)
    st.markdown("""
                Source: [blog.barracuda](https://blog.barracuda.com/2020/03/26/threat-spotlight-coronavirus-related-phishing/)
                """)
    
    st.markdown(
            """
    ## Unfortunately, current spam filters are not 100% efficient
    Spam detectors included in free email services (Gmail, Outlook, AOL, GMX, etc.) are not 100% efficient.
    Why? Because they focus only on header analysis. That's a mistake. For a decade, experts have agreed that 
    NLP in machine learning is one of the most powerful tools for detecting suspicious emails. 
    
    """
        )
    
    st.markdown("""
                That's why we focus here on both the body and the subject content. We also include the issuer address and the content type in our analysis, since they 
    provide significant information in email analysis.
    """)
    

    
    st.markdown("""
            ### Almost there!
            You just need to follow the steps below and you 
            will get the results within a few seconds. Let's get started!"""
    )
    
    st.info("Since we trained our machine learning model on english emails, the spam detector only works with english content.")
    
    
    # Beginning of interactions with the user
    
    X_content=st.text_area("1) Paste body content:","Enter a message...")
    X_content=str(X_content)
    X_subject=st.text_area("2) Paste subject content:","Enter a message...")
    X_subject=str(X_subject)
    X_from=st.text_input("3) Paste issuer address (FROM):","Enter an address...")
    X_from=str(X_from)
    
    if X_content=='Enter a message...':
        X_content=None
    if X_subject=='Enter a message...':
        X_subject=None
    if X_from=='Enter an address...':
        X_from=None
    
    
    options = st.selectbox('4) Select Content-Type:',
                                  ('plaintext', 'html'))
    
    if options=='plaintext':
        X_type_html=0
        X_type_plain=1
    else:
        X_type_html=1
        X_type_plain=0
                
    
    # Start the analysis only if the user clicks the 'Analyze' button and has filled in all text areas
    
    if  st.button('Analyze'):
        try:
            #### Creation of numeric features ####
            #nb of words
            count_body = len(re.findall(r'\w+', X_content))
            count_subject= len(re.findall(r'\w+', X_subject))
            
            #list of words
            words_body=re.findall(r'\w+', X_content)
            words_subject=re.findall(r'\w+', X_subject)
            
            #length of message and subject
            body_len=len(X_content)
            subject_len=len(X_subject)
            
            #nb of upper-case words
            upper_words_body_nb = sum(1 for c in words_body if c.isupper())
            upper_words_subject_nb = sum(1 for c in words_subject if c.isupper())
            
            #nb of special characters
            liste_ponct = ['€','!','"','#','$','%','&','(',')','*','+',',','-','.','/',':',';','<','=','>','?','@','[',']','^','_','`','{','|','}','~']
            
            temp = sum(1 for c in X_content if c in liste_ponct)
            if temp > 0:
                symb_body_nb=temp
            else:
                symb_body_nb=0
                 
            temp = sum(1 for c in X_subject if c in liste_ponct)
            if temp > 0:
                symb_subject_nb=temp
            else:
                symb_subject_nb=0
            
            #nb of http url
            r = re.compile(r"https?://[a-zA-Z0-9./-]+")
            
            http_body_nb = len(re.findall(r, X_content))
            
            # space nb + ratio (nb_space/message_len)
            space_body_nb = sum(1 for c in X_content if c == ' ')
            space_body_ratio = space_body_nb / body_len if body_len > 0 else 0
            
            #presence of a figure in issuer address (i.e. From field)
            figure_issuer_address = 1 if any(ch.isdigit() for ch in X_from) else 0
            
            #nb of words not in the English dictionary (we refer to a dictionary composed of 236736 words)
            
            from nltk.corpus import words
            word_list_en = set(words.words())
            
            count_body_dico=0
            for word in words_body:
                if word not in word_list_en:
                    count_body_dico+=1
            not_dico_body=count_body_dico
            
            count_subject_dico=0
            for word in words_subject:
                if word not in word_list_en:
                    count_subject_dico+=1
            not_dico_subject=count_subject_dico
            
            #nb of sexual adults words
            adult_words=['sex','sexual','erotic','p**n','xxx','adult','sexy']
            
            count_adult_body=0
            for word in words_body:
                if word in adult_words:
                    count_adult_body+=1
            adult_words_body=count_adult_body
            
            count_adult_subject=0
            for word in words_subject:
                if word in adult_words:
                    count_adult_subject+=1
            adult_words_subject=count_adult_subject
            
            # creation of dataframe
             
            s1=pd.Series({'words_body_nb':count_body, 'body_len':body_len,'upper_words_body_nb':upper_words_body_nb,
                   'body_content':X_content,'http_body_nb':http_body_nb, 'symb_body_nb':symb_body_nb,
                     'text/html':X_type_html,'text/plain':X_type_plain,
                   'upper_words_subject_nb':upper_words_subject_nb,'symb_subject_nb':symb_subject_nb,
                   'words_subject_nb':count_subject,'subject_len':subject_len,
                   'adult_words_body':adult_words_body,
                   'adult_words_subject':adult_words_subject,'not_dico_body':not_dico_body,
                   'not_dico_subject':not_dico_subject,'figure_issuer_address':figure_issuer_address,
                    'space_body_nb':space_body_nb,'space_body_ratio':space_body_ratio,
                   'subject_content':X_subject
                    })
        
            df1=pd.DataFrame([s1])
            df_body=df1.copy()
            df_subject=df1.copy()
       
            # ***** Prediction of spam/ham *****
            
            # Loading of joblib objects already fitted with thousands of data
            model =load("rf.joblib")
            vectorizer_body = load("vectorizer_body.joblib")
            vectorizer_subject = load("vectorizer_subject.joblib")
            
            # Tokenization of text contents
            X_content_nlp = vectorizer_body.transform([X_content])
            X_subject_nlp = vectorizer_subject.transform([X_subject])
            X_nlp_test3 = hstack((X_content_nlp,X_subject_nlp, df1.drop(columns=['body_content','subject_content']).values))
            
            # Start prediction 
            pred=model.predict(X_nlp_test3)
            
            if pred==1:
                st.markdown("""## **It might be considered as spam**
                            """)
                style_spam = '''
                <style>
                    
                    h2>strong {
                        
                        color : #b74355
                    	;
                    }
                    
                    </style>
                    '''
                st.markdown(style_spam, unsafe_allow_html=True)
    
            else:
                st.markdown("""## **It might be considered as ham**
                            """)
                style_spam = '''
                <style>
                    
                    h2>strong {
                        
                        color : #99d594
                    	;
                    }
                    
                    </style>
                    '''
                st.markdown(style_spam, unsafe_allow_html=True)
            
            tabs = bokeh.models.Tabs(
        tabs=[
            body_analysis_panel(df_body),subject_analysis_panel(df_subject)
        ]
    )
            st.markdown("""
            ### Want To Learn More About The Results?
            """)
            st.bokeh_chart(tabs)
    
        # Start the analysis only if the user filled in all text areas
        except TypeError:
            st.write("First you need to fill in the text areas")
Example #36
def random_text_corpus():
    corpus = pd.DataFrame([(i, ' '.join(random.sample(words.words(), 20)))
                           for i in range(100)],
                          columns=["@id", "desc"])
    return corpus
Example #37
def checkWords(string):
    return string in words.words() or len(string) > 5
Example #38
def get_word():

    word_list = words.words()
    number = random.randint(0, len(word_list) - 1)
    return word_list[number]
Example #39
def filter_data(data, removeEng=False):

    # Given a data object, remove any transcriptions with undesirable features
    to_remove = string.punctuation + "…’“–”‘°"
    # Any words you want to ignore
    special_cases = ["<silence>"]
    translation_tags = set(['@eng@', '<ind:', '<eng:'])
    cleaned_data = []

    if removeEng:
        use_langid = False
        if use_langid:
            from langid.langid import LanguageIdentifier, model
            identifier = LanguageIdentifier.from_modelstring(model,
                                                             norm_probs=True)
        from nltk.corpus import words
        eng_words = set(words.words())
        # Using both 2.16% english in wordlist 14.6k words(slow)
        # Using nltk dictionary 2.49% english in wordlist 15.5k words (fast)
        # Using neither 11.1% english in wordlist 34.8k words (fast)
        # Only words > 3 chars are counted, for audio-segment sample
        # Using remove english and ignore after '<' 1.8% 20.4K

    for utt in data:
        # print(utt, file=sys.stderr)

        trans = utt.get('transcript').lower()
        words = trans.split()

        # Note this is an assumption only translations come after '<'
        # if "<" in trans:
        # r = re.search(r'[<]@?(eng|indo|ind|mala)', trans)
        # if bool(r):
        #     words = trans[:r.span()[0]].split()

        clean_words = []
        valid_utterance = True
        eng_count = 0
        for word in words:
            # If a special case ignore
            if word in special_cases:
                continue

            # If utterance contains a translation
            if word in translation_tags:  # Translations / ignore
                break

            # If partial digit, throw out whole utterance
            if bool(re.search(r'\d', word)) and not word.isdigit():
                valid_utterance = False
                break

            # Remove punctuation and bad chars
            for char in to_remove:
                word = word.replace(char, '')

            # If word is in english dictionary count it
            if removeEng and len(word) > 3 and word in eng_words:
                # print(word, file=sys.stderr)
                eng_count += 1

            clean_words.append(word)

        # Exclude utterance if empty after cleaning
        cleaned_trans = ' '.join(clean_words).strip()
        if cleaned_trans == "":
            valid_utterance = False

        # Exclude utterance if > 10% english
        if removeEng and len(
                clean_words) > 0 and eng_count / len(clean_words) > 0.1:
            # print(round(eng_count / len(clean_words)), trans, file=sys.stderr)
            valid_utterance = False

        # Exclude utterance if langid thinks its english
        if removeEng and use_langid and valid_utterance:
            lang, prob = identifier.classify(cleaned_trans)
            if lang == 'en' and prob > 0.5:
                valid_utterance = False

        # Something was bad in utterance
        if not valid_utterance:
            continue

        # Should be a clean valid utterance
        utt['transcript'] = cleaned_trans
        cleaned_data.append(utt)
    return cleaned_data
Example #40
    global letters_dict
    for i in alphabet:
        letters_dict[i] = 0
    letters_dict['`'] = -1

alpha_dict()


number_of_letters = int(input("How many letters are in your word?: "))


for i in range(number_of_letters):
    show_list.append("_")

number_of_attempts = int(input("How many attempts do you give the computer?: "))
word_list = word_list + w.words()
for i in word_list:
    if len(i) == number_of_letters:
        candidates.append(i)

def show_user():
    global show_list
    update_show_list()
    print("  ".join(show_list))

def check_for_best_word():
    global candidates
    global common_word_list
    for i in range(len(common_word_list)):
        for j in range(len(candidates)):
            if common_word_list[i] == candidates[j]:
Example #41
from textblob import TextBlob
from nltk.corpus import stopwords, words

stops = set(stopwords.words())
english_words = set(w.lower() for w in words.words())


def preprocess(corpus):
    blob = TextBlob(corpus)
    ret = []
    for sent in blob.sentences:
        sent_list = []
        for word in sent.words.lower().lemmatize():
            if word in english_words and word not in stops:
                sent_list.append(word)
        ret.append(sent_list)
    return ret


def tfidf_preprocess(corpus):
    blob = TextBlob(corpus)
    ret = []
    for word in blob.words.lower():
        if word in english_words and word not in stops:
            ret.append(word)
    return ' '.join(ret)
Example #42
from functions import get_filename_list
import json
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords, words
import pickle


def stemm_paragraph(paragraph):
    return [ps.stem(word) for word in word_tokenize(paragraph)]


ps = PorterStemmer()
stop_words = stopwords.words('english')
valid_words = set([ps.stem(x) for x in words.words()])

index = {}

for k, filename in enumerate(get_filename_list()):

    print(k, filename)

    json_file = json.loads(open(filename).read())

    paragraphs = [x['text'] for x in json_file['body_text']]

    for i, paragraph in enumerate(paragraphs):

        stemmed_paragraph_tokens = stemm_paragraph(paragraph)

        for token in stemmed_paragraph_tokens:
Example #43
import argparse
import os
import json
from spacy.lang.en import stop_words
import twikenizer as twk
import re
import string
from nltk.corpus import words


spacy_stopwords = stop_words.STOP_WORDS
tokenizer = twk.Twikenizer()

word_dictionary = list(set(words.words()))
for alphabet in "bcdefghjklmnopqrstuvwxyz":
    word_dictionary.remove(alphabet)

def append_json(json_data, file_path):
    """
    Write json to file
    :param json_data: Json to write
    :param file_path: Path of the file
    """
    with open(file_path, 'a+') as f:
        json.dump(json_data, f)
        f.write("\n")

def split_hashtag_to_words_all_possibilities(hashtag):
    all_possibilities = []

    split_posibility = [hashtag[:i] in word_dictionary for i in reversed(range(len(hashtag)+1))]
Example #44
def random_words():
    corpus = list(random.sample(words.words(), 700))
    return corpus
Example #45
import time
import pandas as pd
import re
import argparse
from json import JSONDecoder, JSONDecodeError
import io
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('words')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import words
stop_words = set(stopwords.words('english'))
stopword_set = set(stopwords.words())
word_set = set(words.words())

from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
plt.switch_backend('Agg')
### set up plotting parameters
from matplotlib import rcParams
plt.style.use('seaborn-poster')
plt.rcParams['font.family'] = 'serif'
rcParams['font.sans-serif'] = ['Palatino']
rcParams['figure.max_open_warning'] = 30

### This saves the credentials
KIDS = config.KIDS
CUR_FILE_DIR = os.path.dirname(os.path.realpath(__file__))
Example #46
letters = 'abcdefghijklmnopqrstuvwxyz'


def edits1(word):
    "All edits that are one edit away from `word`."
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)


def edits2(word):
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))


#%%
#Custom tagging each word
english_vocab = set(w.lower() for w in wd.words())
stop_words = stopwords.words('english')


#wn.synsets('motorcar')
def CustomTag(word):

    if word.isnumeric():
        return "Num"
    elif word.isalnum() and not word.isnumeric() and not word.isalpha():
        return "Mixed"
    elif word in punctuation:
        return "Pun"
    elif word in english_vocab:  #wn.synsets(word):
        return "Known"
    elif word in stop_words:
Example #47
nlp = spacy.load('en_core_web_lg')


def get_lemma(word):
    if len(word.split()) != 1:
        raise ValueError('Only input >>>single<<< words')
    doc = nlp(word)
    lemma = doc[0].lemma_
    return (lemma)


recommended = load_lexicon('recommended')

# Get NLTK corpus
print('Lemmatizing NLTK')
words = words.words()
lemmas = [get_lemma(word) for word in tqdm(words, desc='NLTK lemmas')]
lemmas = set(lemmas)

print('Lemmatizing SNOMED')
recommended_lemmas = []
for syn in tqdm(recommended, desc='SNOMED CT lemmas'):
    doc = nlp(syn.term)
    for token in doc:
        recommended_lemmas.append(token.lemma_)

recommended_lemmas = set(recommended_lemmas)

remove_list = list(lemmas - recommended_lemmas)

with open("dict_strings.csv", 'w', newline='') as resultFile:
Example #48
        if "http" not in line:
            current_category = line
            continue
        else:
            url = line
            con = urllib2.urlopen(url, timeout=60)
            html = con.read()
            soup = BeautifulSoup(html)
            page = soup.findAll('p')  #.getText()
            word_data = {}
            for i in page:
                word_list = i.getText().split(" ")
                for word in word_list:
                    real_word = re.sub('[^a-zA-Z]', '', word)
                    print(current_category + "..checking.." + real_word)
                    if real_word not in words.words() or real_word == "":
                        continue
                    if real_word not in word_data:
                        word_data[real_word] = {}
                    if current_category not in word_data[real_word]:
                        word_data[real_word][current_category] = 0
                    else:
                        word_data[real_word][current_category] += 1

# write to csv file

with open("result.csv", 'wb') as file:
    a = csv.writer(file)
    for word in word_data:
        entry = []
        entry.append(word)
Example #49
 def _get_words(self):
     from nltk.corpus import words
     return words.words()
Example #50
from nltk.tokenize import wordpunct_tokenize
from nltk.probability import FreqDist
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth
from sklearn.manifold import MDS

from scipy.cluster.hierarchy import ward, dendrogram

from gensim import corpora, models

# filter stopwords
from nltk.corpus import stopwords, words
stopwords = set(stopwords.words('english'))
en_words = set(words.words())
stemmer = SnowballStemmer('english')
book_id = [
    '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12',
    '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24'
]
titles = [
    'THE HISTORY OF THE DECLINE AND FALL OF THE ROMAN EMPIRE. VOL. VI',
    'THE HISTORIES CAIUS COBNELIUS TACITUS',
    'THE WORK OF JOSEPH US, THE JEWISH WAR. VOL. IV',
    'THE HISTORY OF THE DECLINE AND FALL OF THE ROMAN EMPIRE. VOL, I',
    'THE HISTORY OF TACITUS. BOOK I. VOL. V',
    'THE FIRST AND THIRTY-THIRD BOOKS OF PLINY\'S NATURAL HISTORY',
    'THE HISTORY OF THE ROMAN EMPIRE. VOL. V',
    'THE HISTORY OF THE DECLINE AND FALL OF THE ROMAN EMPIRE. VOL. II',
    'THE HISTORY OF THE PELOPONNESIAN WAR. VOL. II',
Example #51

import random

import nltk
from nltk.corpus import words
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType


def extract_runtime(runtime_str):
    runtime = 0
    elems = runtime_str.split(' ')
    if len(elems) > 1:
        runtime += int(elems[0]) * 24 * 60 * 60
        t = elems[1]
    else:
        t = elems[0]
    telems = t.split(':')
    runtime += int(telems[0]) * 60 * 60 + int(telems[1]) * 60 + int(telems[2])

    return runtime


extract_runtime_udf = udf(extract_runtime, StringType())


nltk.download('words')
english_words = words.words()
salt = str(random.SystemRandom().random())


def anonymize(s):
    random.seed(str(s) + salt)
    return random.choice(english_words)


anonymize_udf = udf(anonymize, StringType())
Example #52
from nltk.corpus import words
setofwords = set(words.words())


def dict_check(list):
    valid_words = []
    for l in list:
        if l in setofwords:
            valid_words.append(l)
    return valid_words
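Illustrative usage (assumes nltk.download('words') has been run); tokens absent from the corpus are silently dropped:

print(dict_check(['hello', 'wrld', 'apple']))  # likely ['hello', 'apple']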
Example #53

def create_base_network(input_shape):
    input = Input(shape=input_shape)
    x = ZeroPadding2D(padding=(0, 1), data_format='channels_last')(input)
    x = Conv2D(64, (27, 3), activation='relu')(x)
    x = Flatten()(x)
    x = Dense(1024, activation='relu')(x)
    return Model(input, x)


def binary_accuracy(output_true, output_pred):
    return K.mean(K.equal(output_true, K.round(output_pred)), axis=-1)


words_list = words.words()
book = nltk.corpus.gutenberg.words(u'austen-persuasion.txt')
book_text = nltk.Text(book)
words_list2 = book_text.tokens

alphabet = []
for letter in range(97, 123):
    alphabet.append(chr(letter))

words_to_train1 = w.create_wordlist(words_list, words_number)
words_to_train2 = w.create_wordlist(words_list, words_number)
words_to_val1 = w.create_wordlist(words_list2, words_number)
words_to_val2 = w.create_wordlist(words_list2, words_number)
labels_train = w.create_labels(words_to_train1, words_to_train2, words_number)
labels_val = w.create_labels(words_to_val1, words_to_val2, words_number)
Example #54
def is_word(s: str) -> bool:
    return s.lower() in words.words()
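Membership tests against the list returned by words.words() are linear, so repeated calls are slow. A sketch of a cached variant (assumes the corpus is downloaded):

from nltk.corpus import words

_WORD_SET = set(w.lower() for w in words.words())

def is_word_cached(s: str) -> bool:
    # Same check as is_word above, but against a set built once.
    return s.lower() in _WORD_SET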
Example #55
#   11-June-2020    | Tapas Mohanty                 | Jahar Sil and Abhisek Kumar                        | Initial Release
# =========================================================================================================================
# =========================================================================================================================
# Import required Module / Packages
# -------------------------------------------------------------------------------------------------------------------------

import nltk
import re
from bs4 import BeautifulSoup
import unicodedata
from contractions import CONTRACTION_MAP
from nltk.corpus import wordnet
from nltk.tokenize.toktok import ToktokTokenizer
import en_core_web_sm
from nltk.corpus import words
engwords = words.words()
import traceback

###########################################################################################################################
# Author        : Tapas  Mohanty
# Co-Author     : Jahar Sil and Tapas Mohanty
# Modified      :
# Reviewer      :
# Functionality : Tokenizing the keywords
###########################################################################################################################

tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
# nlp = spacy.load('en', parse=True, tag=True, entity=True)
nlp = en_core_web_sm.load()
# nlp_vec = spacy.load('en_vectors_web_lg', parse=True, tag=True, entity=True)
Example #56
from siteswapClass import siteswap
import nltk
from nltk.corpus import words
##nltk.download()
word_list = words.words()
for i in range(0, 10000):
    if siteswap(word_list[i]).isValid():
        print(word_list[i])
Example #57
 def wordUB(self,word):
     if word not in words.words() and word in self.ud:
         return(True)
     else:
         return(False)
Example #58
import re

from nltk.corpus import stopwords, words

from tools.utils import save_and_reload_df
from tools.data_handling import enrich_emails, unique_recipients, address_book
from tools.features import stem

stopwords = set(stopwords.words("english"))
english_words = set(words.words())


def remove_after_indicator(text, indicator):
    '''Removes everything in text after indicator if found. If not found, leaves
    text as is.
        Arguments:
            - text (str): the text you want to shorten.
            - indicator (str): the indicator after which you want to cut the
            text.
        Output:
            - str: the shortened text.
    '''
    indic_match = re.search(indicator, text)
    if indic_match:
        simple_text = text[:indic_match.span(0)[0]]
    else:
        simple_text = text
    return simple_text


def remove_punctuation(text):
Example #59
from gensim.corpora import Dictionary
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_distances

tqdm.pandas()

##########################################################################
# Word Resources
##########################################################################
nltk.download('punkt')
nltk.download('words')
nltk.download('stopwords')

some_other_words = ['your exclude words']

Englishtext = set(W.words())

my_stopwords = {'able'}
stop_words = set([word for word in stopwords.words('english')])
stop_words = stop_words.union(STOPWORDS).union(STOP_WORDS).union(my_stopwords)

##########################################################################
# Data Cleaning functions for projects SOW
##########################################################################
'''
This function expects no NaN in the input dataframe, so replace NaN with an empty string

projectSowData is a pandas dataframe where the columns need to
 include projectID and StatementOfWork

it will return a pandas df and a dictionary, the df is the filtered dataframe
Example #60
import numpy as np
import linecache
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
import io
from rake_nltk import Rake
from nltk.corpus import words
import string
from IPython.display import display_html
from itertools import chain,cycle

my_exclusions = ['--', '–', 'i', 'ii', 'iii', 'iv', 'v', 'vi', 'okay', 'et', 'cetera']
exclusion_list = words.words() + my_exclusions

def display_side_by_side(*args,titles=cycle([''])):
    '''
    Pulled from stackoverflow by @ntg
    Displays dataframes side by side and allows setting titles for each dataframe
    '''
    html_str=''
    for df,title in zip(args, chain(titles,cycle(['</br>'])) ):
        html_str+='<th style="text-align:center"><td style="vertical-align:top">'
        html_str+=f'<h2>{title}</h2>'
        html_str+=df.to_html().replace('table','table style="display:inline"')
        html_str+='</td></th>'
    display_html(html_str,raw=True)
    return None

def pdfparser(data):