def tense_of_word(word):
    tense = None
    tag = word.tag
    word = word.string
    # Use the top spelling suggestion if the word is misspelled.
    word = word if suggest(word)[0][1] == 1.0 else suggest(word)[0][0]
    # Check the defined indicator words first.
    tense = FUTURE if word in rules.FUTURE_INDICATORS else tense
    tense = PAST if word in rules.PAST_INDICATORS else tense
    if tense is not None:
        return tense
    # Check the pre-defined POS-tag rules.
    tense = PAST if tag in rules.PAST else tense
    tense = PRESENT if tag in rules.PRESENT else tense
    tense = FUTURE if tag in rules.FUTURE else tense
    if tense is not None:
        return tense
    # Now check the tense according to pattern.en.
    tense = PAST if tense_of_word_h(word, PAST) and not tense_of_word_h(word, PRESENT) and not tense_of_word_h(word, FUTURE) else tense
    tense = PRESENT if not tense_of_word_h(word, PAST) and tense_of_word_h(word, PRESENT) and not tense_of_word_h(word, FUTURE) else tense
    tense = FUTURE if not tense_of_word_h(word, PAST) and not tense_of_word_h(word, PRESENT) and tense_of_word_h(word, FUTURE) else tense
    return tense
def test_spelling(self):
    # Assert case-sensitivity + numbers.
    for a, b in (
            (".", "."),
            ("?", "?"),
            ("!", "!"),
            ("I", "I"),
            ("a", "a"),
            ("42", "42"),
            ("3.14", "3.14"),
            ("The", "The"),
            ("the", "the")):
        self.assertEqual(en.suggest(a)[0][0], b)
    # Assert spelling suggestion accuracy.
    # Note: simply training on more text will not improve accuracy.
    i = j = 0.0
    from pattern.db import Datasheet
    for correct, wrong in Datasheet.load(os.path.join(PATH, "corpora", "spelling-birkbeck.csv")):
        for w in wrong.split(" "):
            if en.suggest(w)[0][0] == correct:
                i += 1
            else:
                j += 1
    self.assertTrue(i / (i + j) > 0.70)
    print("pattern.en.suggest()")
def spelling_correct(token_list):
    res_correct = []
    for word in token_list:
        if not check_english(word):
            # Pick the suggestion with the highest confidence.
            suggestions = dict(suggest(word))
            word_correct = max(suggestions, key=suggestions.get)
            res_correct.append(word_correct)
        else:
            res_correct.append(word)
    return res_correct
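# check_english() is external to the snippet above; the stand-in below is a minimal,
# hypothetical sketch (an assumption, not the original helper) that treats a token as
# correctly spelled when pattern's top suggestion is the token itself with full confidence.
def check_english(word):
    # suggest() returns (candidate, confidence) pairs; a known word comes back unchanged with 1.0.
    candidate, confidence = suggest(word)[0]
    return candidate == word and confidence == 1.0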
def spell():
    if request.method == 'POST':
        # This block is only entered when the form is submitted.
        word = request.form.get('word')
        return jsonify({'data': json.dumps(suggest(word))})
    return '''<form method="POST">
def data_preprocessing(X):
    ''' text pre-processing '''
    # convert all characters to lower case
    X = [x.lower() for x in X]
    # remove characters outside ASCII from the text
    X = [x.encode("ascii", "ignore") for x in X]
    X = [x.decode() for x in X]
    # remove the meaningless "_U" in the text
    X = [re.sub('_u', ' ', x) for x in X]
    # replace @username mentions with "username"
    X = [re.sub('@\w+', 'username', x) for x in X]
    # remove website links
    X = [re.sub(r'(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?', '', x + ' ') for x in X]
    # remove symbols
    X = [re.sub('[/(){}\[\]\|@,;]', ' ', x) for x in X]
    X = [re.sub('[^0-9a-z ]', ' ', x) for x in X]
    # consolidate multiple spaces
    X = [re.sub(' +', ' ', x) for x in X]
    # spell correction
    for i, x in enumerate(X):
        print("[INFO] this is {}/{} tweet!".format(i, len(X)))
        words = x.split()
        for j, word in enumerate(words):
            if word not in DRUGLIST:
                word = reduce_lengthening(word)
                try:
                    suggestion = suggest(word)[0]
                except:
                    suggestion = suggest(word)[0]
                if suggestion[1] > 0.8:
                    words[j] = suggestion[0]
                else:
                    # do not change words with low confidence
                    pass
                # print(word, suggestion)
            else:
                # replace drug names with the placeholder "drugname"
                words[j] = 'drugname'
        X[i] = ' '.join(words)
    # remove stop words
    STOPWORDS = set(stopwords.words('english'))
    for i, x in enumerate(X):
        X[i] = ' '.join([word for word in x.split() if word not in STOPWORDS])
    return X
def get_english_suggestions(word):
    suggestions = []
    for cur in suggest(word):
        cur_word = cur[0]
        if cur[1] != 0:
            suggestions.append(cur_word)
    if len(suggestions) < 5:
        return suggestions
    else:
        return suggestions[0:5]
def test_spelling(self):
    # Assert spelling suggestion accuracy.
    # Note: simply training on more text will not improve accuracy.
    i = j = 0.0
    from pattern.db import Datasheet
    for correct, wrong in Datasheet.load(os.path.join(PATH, "corpora", "spelling-birkbeck.csv")):
        for w in wrong.split(" "):
            if en.suggest(w)[0][0] == correct:
                i += 1
            else:
                j += 1
    self.assertTrue(i / (i + j) > 0.70)
    print "pattern.en.suggest()"
def apply_spell_correction(self, x):
    """
    Collapse characters repeated three or more times to two, then replace each
    token with its top spelling suggestion.
    :param x: input string
    :return: spell-corrected string
    """
    try:
        pattern = re.compile(r"(.)\1{2,}")
        list_of_elem = x.split(" ")
        clean_x = [pattern.sub(r"\1\1", i) for i in list_of_elem]
        suggest_val = [suggest(i)[0][0] for i in clean_x]
        return " ".join(suggest_val)
    except Exception as e:
        logger.error(e)
def checkSpellingError(text, nlp, correctFlag=False):
    '''
    Purpose: To check if text has errors due to wrong spellings.
             Additionally, it returns the corrected sentence.
    Parameters:
        text: string
            A string of text - a single sentence or a paragraph.
        correctFlag: boolean
            True or False
    Returns:
        count: integer
        text: Corrected sentence. (If correctFlag is True)
    '''
    doc = nlp(text)
    count = 0
    text = ""
    for sen in doc.sentences:
        for word in sen.words:
            # print(word.text.lower())
            l = ["'s", "n't", "'ll"]
            try:
                sugList = suggest(word.text.lower())
            except:
                sugList = []
                l.append(word.text.lower())
            for k in sugList:
                l.append(k[0])
            if (word.text.lower() in l) or (word.lemma in l):
                text += word.text
                text += " "
                continue
            else:
                count += 1
                text += sugList[0][0]
                text += " "
    if correctFlag == True:
        return count, text
    else:
        return count
def spellCheck(uIn):
    uIn = shortenWords(uIn)
    unchecked = uIn.split(' ')
    checked = ""
    end = ""
    # preserves punctuation at end of user's input
    if any(p in uIn[-1:] for p in punctuation):
        end = uIn[-1:]
    first = True  # check for first run to prevent leading space
    for w in unchecked:
        suggestion = suggest(w)
        word = suggestion[0][0]
        for i in suggestion:
            if (w == i[0] or w[0].isupper()):
                word = w
        if not first:
            checked += ' '
        else:
            first = False
        checked += word
    return checked + end
    for word in wrong:
        print bcolors.Red + " " + word + " :: " + subprocess.check_output(["espeak", "-q", "--ipa", '-v', 'en-us', word]).decode('utf-8') + bcolors.White
    print bcolors.White + "Completed spell learning"
    sys.exit()

if (sup_spellbee):
    for word in word_list.split():
        if len(wn.synsets(word)) != 0:
            tcount = 0
            learnt = 0
            entity = (word, tcount)
            wrong.append(entity)
        else:
            print suggest(word)
    rspellbee()
    correct = sorted(correct, key=lambda x: x[1], reverse=True)
    for entity in correct:
        if (entity[1] != 0):
            print "%20s : %d" % (entity[0], entity[1])
    print bcolors.White + "Completed spell bee"
    sys.exit()

for word in word_list.split():
    if len(wn.synsets(word)) != 0:
        # rlemma = l.lemmatize(word)
        iterator = iterator + 1
        if (len(sys.argv) == 5):
            if (iterator < int(sys.argv[3])):
                continue
def sp(text):
    print("sp")
    sp_chk.append(suggest(text))
    return ()
print 'p' in tenses('purred')  # By alias.
print PAST in tenses('purred')
print (PAST, 1, PL) in tenses('purred')

# rule-based conjugation
print 'google' in verbs.infinitives
print 'googled' in verbs.inflections
print conjugate('googled', tense=PARTICIPLE, parse=False)
print conjugate('googled', tense=PARTICIPLE, parse=True)

# quantification
print number("seventy-five point two")  # "seventy-five point two" => 75.2
print numerals(2.245, round=2)          # 2.245 => "two point twenty-five"
print quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken'])
print quantify({'carrot': 100, 'parrot': 20})
print quantify('carrot', amount=1000)

# spelling
print suggest("parot")

# n-grams
print ngrams("I am eating pizza.", n=2)  # bigrams
print ngrams("I am eating pizza.", n=3, punctuation=".,;:!?()[]{}`''\"@#$^&*+-|=~_", continuous=False)

# parser
print parse('I eat pizza with a fork.',
    tokenize=True,     # Split punctuation marks from words?
    tags=True,         # Parse part-of-speech tags? (NN, JJ, ...)
    chunks=True,       # Parse chunks? (NP, VP, PNP, ...)
    relations=False,   # Parse chunk relations? (-SBJ, -OBJ, ...)
    lemmata=False,     # Parse lemmata? (ate => eat)
    encoding='utf-8',  # Input string encoding.
class SpellingReplacer(object):

    def __init__(self, dict_name='en', max_dist=2):
        self.spell_dict = enchant.Dict(dict_name)
        self.max_dist = max_dist

    def replace(self, word):
        if self.spell_dict.check(word):
            return word
        suggestions = self.spell_dict.suggest(word)
        if not suggestions:
            return word
        return min(suggestions, key=lambda sugg: edit_distance(word, sugg))
        # for sugg in suggestions:
        #     print sugg, edit_distance(word, sugg)


if __name__ == "__main__":
    SENTENCE = 'Yesteday I wrnt to the pqrk!'.split(' ')
    print " ".join([suggest(word)[0][0] for word in SENTENCE])
    sr = SpellingReplacer()
    print " ".join([sr.replace(word) for word in SENTENCE])
    # print " ".join([d.suggest(word) for word in SENTENCE])
def test_spelling_numbers(self):
    self.assertEqual(en.suggest("42"), [("42", 1.0)])
    self.assertEqual(en.suggest("3.1415"), [("3.1415", 1.0)])
from pattern.en import suggest
from aspell import Speller

# The algorithm manually implemented at spelling_correction_manual.py is available to be
# used out of the box in the pattern library: https://www.clips.uantwerpen.be/pattern.
# (Compatible with Python 2.7 only.)
#
# "Pattern is a web mining module for the Python programming language.
# It has tools for data mining (Google, Twitter and Wikipedia API, a web crawler, a HTML DOM parser),
# natural language processing (part-of-speech taggers, n-gram search, sentiment analysis, WordNet),
# machine learning (vector space model, clustering, SVM), network analysis and <canvas> visualization."
#
# Other libraries are:
# PyEnchant: http://pythonhosted.org/pyenchant/
# AspellPython, wrapper around GNU Aspell: https://github.com/WojciechMula/aspell-python (requires libaspell-dev)
#   sudo apt install libaspell-dev
#   pip install aspell-python-py3

print(suggest("fianlly"))
print(suggest("flaot"))

sp = Speller()
print(sp.suggest("fianlly"))
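# Hedged sketch (not part of the original snippet): the PyEnchant alternative listed above
# can be exercised the same way, assuming an installed "en_US" dictionary; enchant.Dict
# exposes check() for validation and suggest() for ranked replacement candidates.
import enchant

d = enchant.Dict("en_US")
print(d.check("fianlly"))        # False: the word is misspelled
print(d.suggest("fianlly")[:3])  # ranked candidates, e.g. ['finally', ...]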
    word = match.group()

    def case_of(text):
        """
        Return the case-function appropriate for text:
        upper, lower, title, or just str.
        """
        return (str.upper if text.isupper() else
                str.lower if text.islower() else
                str.title if text.istitle() else
                str)

    return case_of(word)(correct(word.lower()))


def correct_text_generic(text):
    """
    Correct all the words within a text, returning the corrected text.
    """
    return re.sub('[a-zA-Z]+', correct_match, text)


print(correct_text_generic('fianlly'))

from pattern.en import suggest
print(suggest('fianlly'))
print(suggest('flaot'))
    word = match.group()

    def case_of(text):
        """
        Return the case-function appropriate for text:
        upper, lower, title, or just str.
        """
        return (str.upper if text.isupper() else
                str.lower if text.islower() else
                str.title if text.istitle() else
                str)

    return case_of(word)(correct(word.lower()))


def correct_text_generic(text):
    """
    Correct all the words within a text, returning the corrected text.
    """
    return re.sub('[a-zA-Z]+', correct_match, text)


print correct_text_generic('fianlly')

from pattern.en import suggest
print suggest('fianlly')
print suggest('flaot')
def spell_checker():
    print(suggest('poblem'))
def clean_text(self, text, short=True, length=True, contra=True,
               remove_stopwords=True, lemmatize=True, english=False,
               ngrams=False, spelling=True, spCy=False):
    """
    Remove unwanted characters and stopwords, and format the text to create
    fewer nulls in the word embeddings; check spelling, lemmatize and compare
    with the WordNet corpus to keep only English words.

    Params:
        text: text data to clean
        remove_stopwords: if true, remove stop words from text to reduce noise
        lemmatize: if true, lemmatize words
        english: if true, compare with the WordNet corpus to keep only English words
        ngrams: if true, create ngrams
    Returns:
        text: cleaned text data
    """
    print("cleantoo")
    # print("T:", text)
    if contra:
        print("CLEAN")
        text = [self.remove_contractions(word) for word in sent_tokenize(text.lower())]
        text = " ".join(text)
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    text = re.sub(r'[^a-zA-Z]', " ", text)
    if length:
        print("LENGTH")
        text = self.reduce_lengthening(text)
    if spelling:
        print("SPELLING")
        # Replace each token with its top-ranked spelling suggestion.
        text = " ".join(suggest(w)[0][0] for w in text.split())
    if remove_stopwords:
        print("STOP")
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)
    if short:
        print("SHORT")
        text = ' '.join([w for w in text.split() if len(w) > 3])
        print("S: ", text)
    if lemmatize:
        print("LEM")
        text_sent = nltk.word_tokenize(text)
        text = [WordNetLemmatizer().lemmatize(w, self.getWordnetPos(w)) for w in text_sent]
        text = " ".join(text)
    # spaCy lemmatization
    if spCy:
        text = " ".join(self.spCy(text))
    if english:
        print("ENGLISH")
        text = ' '.join([w for w in text.split() if wordnet.synsets(w)])
        print("P: ", text)
    if ngrams:
        print("NGRAM")
        text = text.split()
        text = [self.nGrams(text)]
    return text
def noise_generator(original_sentence_list, i, process_dict):
    noised_sentence_list = []
    for target_sentence in original_sentence_list:
        generated_source = []
        # x = x.lower()  # this causes some errors to be ignored (Mec)
        x_split = nltk.word_tokenize(target_sentence)
        x_pos = nltk.pos_tag(x_split)
        # avoid too much error creation
        similar_flag = False
        replace_flag = False
        proposition_flag = False
        plural_flag = False
        tense_flag = False
        modal_flag = False
        incorrect_comparative_flag = False
        for token, pos in x_pos:
            similar_token = (pos in NOUN_TAGS and random.random() < 0.3 and not similar_flag)
            dropout_token = (token in DROPOUT_TOKENS and random.random() < 0.3)
            incorrect_comparative_token = (pos in ADJECTIVE_TAGS and random.random() < 0.3 and not incorrect_comparative_flag)
            preposition_token = (pos in PREPOSITION_TAGS and random.random() < 0.3 and not proposition_flag)
            replace_token = (token in REPLACEMENTS and random.random() < 0.3 and not replace_flag)
            pos_plural_token = (pos in NOUN_TAGS and random.random() < 0.3 and not plural_flag)
            pos_tense_token = (pos in VERBS_TAGS and random.random() < 0.3 and not tense_flag)
            pos_modal_token = (token in MODAL and random.random() < 0.3 and not modal_flag)
            if replace_token:
                generated_source.append(REPLACEMENTS[token])
                replace_flag = True
            elif similar_token:
                temp = token[:-1] + "_"
                cand_list = suggest(temp)
                cand = random.choice(cand_list)[0]
                generated_source.append(cand)
                similar_flag = True
            elif preposition_token:
                generated_source.append(random.choice(PREPOSITIONS))
                proposition_flag = True
            elif incorrect_comparative_token:
                generated_source.append(random.choice(COMPARATIVES) + " " + token)
                incorrect_comparative_flag = True
            elif pos_plural_token:
                token = change_pluralization(token)
                generated_source.append(token)
                plural_flag = True
            elif pos_tense_token:
                token = change_tense(token)
                generated_source.append(token)
                tense_flag = True
            elif not dropout_token:
                generated_source.append(token)
            elif pos_modal_token:
                generated_source.append(MODAL[token])
                modal_flag = True
        noised_sentence_list.append(" ".join(generated_source))
    process_dict[i] = noised_sentence_list
sent = parse(text, lemmata=True)
sent = Sentence(sent)
print(modality(sent))

text = "I think we can complete this task"
sent = parse(text, lemmata=True)
sent = Sentence(sent)
print(modality(sent))

# ### Spelling Corrections

from pattern.en import suggest
print(suggest("Whitle"))

from pattern.en import suggest
print(suggest("Fracture"))

# ### Working with Numbers

from pattern.en import number, numerals
print(number("one hundred and twenty two"))
print(numerals(256.390, round=2))

from pattern.en import quantify
print(quantify([
def _suggest_aux(text, threshold=0.9):
    suggestions = suggest(text)
    if suggestions[0][1] > threshold:
        return suggestions[0][0]
    return text
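# Usage sketch (illustrative, not part of the original snippet): suggest() yields
# (candidate, confidence) pairs, and _suggest_aux only accepts the top candidate
# when its confidence clears the 0.9 threshold; otherwise the input is returned unchanged.
from pattern.en import suggest

print(suggest("amazng"))       # e.g. [("amazing", 1.0)]
print(_suggest_aux("amazng"))  # "amazing" if the top confidence exceeds 0.9, else "amazng"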
def test_spelling_punctuation(self): self.assertEqual(en.suggest("!"), [("!", 1.0)]) self.assertEqual(en.suggest("?"), [("?", 1.0)]) self.assertEqual(en.suggest("."), [(".", 1.0)])
def test_spelling_oneletter_words(self): self.assertEqual(en.suggest("I"), [("I", 1.0)]) self.assertEqual(en.suggest("a"), [("a", 1.0)])
def correctSpelling(s):
    # Take the top-ranked suggestion for each word.
    words = [en.suggest(w)[0][0] for w in s.split()]
    return ' '.join(words)
def clean_text(self, text, short=True, length=True, contra=True,
               remove_stopwords=True, lemmatize=True, english=True,
               ngrams=False, spelling=False, spCy=False, stem=False):
    """
    Remove unwanted characters and stopwords, and format the text to create
    fewer nulls in the word embeddings; check spelling, lemmatize and compare
    with the WordNet corpus to keep only English words.

    Params:
        text: text data to clean
        remove_stopwords: if true, remove stop words from text to reduce noise
        lemmatize: if true, lemmatize words
        english: if true, compare with the WordNet corpus to keep only English words
        ngrams: if true, create ngrams
    Returns:
        text: cleaned text data
    """
    if contra:
        text = [self.remove_contractions(word) for word in sent_tokenize(text.lower())]
        text = " ".join(text)
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    text = re.sub(r'[^a-zA-Z]', " ", text)
    if length:
        text = self.reduce_lengthening(text)
    if spelling:
        # Replace each token with its top-ranked spelling suggestion.
        text = " ".join(suggest(w)[0][0] for w in text.split())
    if remove_stopwords:
        text = text.split()
        stops = stopwords.words("english")
        newStopWords = ['please', "name", "thank"]
        stops.extend(newStopWords)
        text = [w for w in text if not w in stops]
        text = " ".join(text)
    if short:
        text = ' '.join([w for w in text.split() if len(w) > 3])
    if lemmatize:
        text_sent = nltk.word_tokenize(text)
        text = [WordNetLemmatizer().lemmatize(w, self.getWordnetPos(w)) for w in text_sent]
        text = " ".join(text)
    if stem:
        text = self.portStem(text.split())
    ##### Attempt at using spaCy lemmatization #####
    # if spCy:
    #     text = " ".join(self.spCy(text))
    ##### Use of WordNet #####
    if english:
        text = ' '.join([w for w in text.split() if wordnet.synsets(w)])
    #### Creation of n-grams ####
    if ngrams:
        text = text.split()
        text = self.nGrams(text)
    return text
# a/an via pattern.en doesn't work well...
# only switches from "an" to "a", not vice versa...?
from pattern.en import article, suggest
import helpers as h

stories = []
with open('compareresults', 'r') as f:
    for line in f:
        line = line.strip()
        if line and line[-1] in "1234567890":
            stories.append(h.strip(line[:line.rfind(' ')]).split(' '))

sps = []
for s in stories:
    for w in s:
        sp = suggest(w)
        if len(sp) > 1:
            sps.append([w] + sp)


def fixaan(l):
    for i in range(len(l)):
        if l[i] == 'a' or l[i] == 'an':
            newa = article(l[i + 1])
            if newa == 'an':
                print l, i
            elif newa == 'a' and newa != l[i]:
                print l, i
                l[i] = newa
def corr(self, text):
    # Return the suggestion with the highest confidence.
    word, confidence = max(suggest(text), key=lambda s: s[1])
    return word