def __init__(self):
    lock.acquire()
    self.lemmatizer = wn_stem.WordNetLemmatizer()
    lock.release()
    self.lemmas_dict = {}
    self.synsets_dict = {}
    self.similarity_dict = {}
def text_normalization(text):
    # Convert the text to lower case
    text = str(text).lower()
    # Remove unwanted characters
    spl_char_text = re.sub(r'[^ a-z]', '', text)
    # Create word tokens
    tokens = nltk.word_tokenize(spl_char_text)
    # Initialise the lemmatizer
    lema = wordnet.WordNetLemmatizer()
    # Determine parts of speech
    tags_list = pos_tag(tokens, None)
    lema_words = []
    for token, pos_token in tags_list:
        # Verb
        if pos_token.startswith('V'):
            pos_val = 'v'
        # Adjective
        elif pos_token.startswith('J'):
            pos_val = 'a'
        # Adverb
        elif pos_token.startswith('R'):
            pos_val = 'r'
        # Noun
        else:
            pos_val = 'n'
        lema_token = lema.lemmatize(token, pos_val)
        # Append the lemmatized word to the list
        lema_words.append(lema_token)
    return " ".join(lema_words)
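# A minimal usage sketch for the function above, assuming the imports used
# elsewhere in these snippets (re, nltk, pos_tag, wordnet from nltk.stem) and
# the 'punkt', 'averaged_perceptron_tagger' and 'wordnet' NLTK resources.
print(text_normalization("The cats were chasing mice!"))
# likely output: "the cat be chase mouse" (the '!' is stripped by the regex)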
def lemmatization(texts, allowed_postags, top_tags, stop_words=stop_words):
    '''
    Keeps the lemma of each word (the lemma is the uninflected form of a word)
    and drops tokens whose POS tag is not in the allowed list.

    Parameters:
        texts (str): text to lemmatize
        allowed_postags (list): allowed POS tags, e.g. NOUN, ADJ, VERB, ADV
        top_tags (list): tokens that are always kept as-is
    '''
    # Note: the lemmas come from the spaCy `nlp` pipeline below;
    # the WordNet lemmatizer is created here but not used.
    lemma = wordnet.WordNetLemmatizer()
    doc = nlp(texts)
    texts_out = []
    for token in doc:
        if str(token) in top_tags:
            texts_out.append(str(token))
        elif token.pos_ in allowed_postags:
            if token.lemma_ not in ['-PRON-']:
                texts_out.append(token.lemma_)
            else:
                texts_out.append('')
    texts_out = ' '.join(texts_out)
    return texts_out
def text_normalize(text):
    global train_counter
    if train_counter % 10000 == 0:
        print(str(train_counter) + " sets lemmatized..., " +
              "Time now: " + str(datetime.now()))
    train_counter += 1
    text = str(text).lower()
    spl_char_text = re.sub(r'[^ a-z]', '', text)
    tokens = nltk.word_tokenize(spl_char_text)
    lema = wordnet.WordNetLemmatizer()
    tags_list = pos_tag(tokens, tagset=None)
    lema_words = []
    for token, pos_token in tags_list:
        if pos_token.startswith('V'):
            pos_value = 'v'
        elif pos_token.startswith('J'):
            pos_value = 'a'
        elif pos_token.startswith('R'):
            pos_value = 'r'
        else:
            pos_value = 'n'
        lema_token = lema.lemmatize(token, pos_value)
        lema_words.append(lema_token)
    return " ".join(lema_words)
def __init__(self):
    # Ensure that the wordnet corpus is loaded, so we can support multithreading
    wn.ensure_loaded()
    self.lemmatizer = wn_stem.WordNetLemmatizer()
    self.lemmas_dict = {}
    self.synsets_dict = {}
    self.similarity_dict = {}
def registry(key):
    """Retrieves objects given keys from config."""
    if key is None:
        return None
    elif key == 'wordnet':
        return wordnet.WordNetLemmatizer()
    elif key == 'porter':
        return PorterStemmer()
def clean_text(text: str, stopwords: List[str]) -> List[str]:
    text = re.sub(r"[\"\(\)]", " ", text).lower()
    text = re.sub(r"[\-\_]", "", text)
    lem = wordnet.WordNetLemmatizer()
    if not isinstance(stopwords, set):
        stopwords = set(stopwords)
    return [
        lem.lemmatize(w)
        for w in nltk.word_tokenize(text)
        if (w not in stopwords and not re.match(r"^.*[^a-zA-Z].*$", w))
    ]
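# A small illustrative call, assuming `from typing import List`, `import re`,
# `import nltk`, `from nltk.stem import wordnet` and NLTK's English stopword
# list; tokens containing non-letters and stopwords are discarded, and the
# rest are lemmatized with the default noun POS.
from nltk.corpus import stopwords as sw

print(clean_text("The engines (two of them) were over-heating badly", sw.words("english")))
# something like: ['engine', 'two', 'overheating', 'badly']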
def create_lemma_line(self, input_line):
    '''We create the lemmatizer object.'''
    lemma = wordnet.WordNetLemmatizer()
    # This is an array for the current line that we will append values to
    line = []
    for token, ttype in input_line:
        checks = ["a", "v", "r", "n"]
        if ttype[0].lower() not in checks:
            ttype = "n"
        line.append(lemma.lemmatize(token, ttype[0].lower()))
    return {"Lemmas": " ".join(line)}
def __init__(
        self,
        lang=lf.LangFeatures.LANG_EN,
        # Choice of stemmer type only applies to English
        stemmer_type=TYPE_PORTER_STEMMER):
    self.lang = lf.LangFeatures.map_to_lang_code_iso639_1(lang_code=lang)
    self.stemmer_type = stemmer_type
    # Stupid nltk is broken, https://stackoverflow.com/questions/38916452/nltk-download-ssl-certificate-verify-failed
    # TODO Write our own Lemmatizer
    Ssl.disable_ssl_check()
    if lang not in Lemmatizer.SUPPORTED_LANGUAGES:
        errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
            + ': Stemmer for language "' + str(lang) + '" not supported.'
        lg.Log.warning(errmsg)
        raise Exception(errmsg)
    else:
        lg.Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Stemmer for lang "' + str(lang) + '" ok'
        )
    self.stemmer = None
    if self.lang == lf.LangFeatures.LANG_EN:
        if self.stemmer_type == Lemmatizer.TYPE_WORDNET_LEMMATIZER:
            nltk.download('wordnet')
            self.stemmer = wordnet.WordNetLemmatizer()
        elif self.stemmer_type == Lemmatizer.TYPE_PORTER_STEMMER:
            self.stemmer = porter.PorterStemmer()
        elif self.stemmer_type == Lemmatizer.TYPE_SNOWBALL_STEMMER:
            self.stemmer = snowball.SnowballStemmer(language='english')
        else:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Unrecognized stemmer type "' + str(self.stemmer_type) + '".')
        # Call once, because only the first call is slow
        self.stem(word='initialize')
    elif self.lang == lf.LangFeatures.LANG_KO:
        self.stemmer = LemmatizerKorean()
    elif self.lang == lf.LangFeatures.LANG_RU:
        self.stemmer = snowball.SnowballStemmer(language='russian')
    else:
        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Unsupported language "' + str(self.lang) + '"')
    return
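# Hypothetical instantiation sketch; the lf/lg helper modules, the Lemmatizer
# class constants and its stem() method are assumed from the surrounding code
# and are not defined here.
lemmatizer = Lemmatizer(lang=lf.LangFeatures.LANG_EN,
                        stemmer_type=Lemmatizer.TYPE_WORDNET_LEMMATIZER)
print(lemmatizer.stem(word='running'))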
def getNouns(tagged, lemma):
    tokenized = tagged.split()
    nouns = []
    for i in range(len(tokenized)):
        noun = re.findall(r'(\S*)/N', tokenized[i])
        if len(noun) == 1:
            try:
                lmtz = wn.WordNetLemmatizer().lemmatize(noun[0], 'n')
                if lmtz == lemma:
                    tag = re.findall(r'%s\/(\w*)' % noun[0], tokenized[i])
                    nouns.append((noun[0], i + 1, tag[0]))
            except UnicodeDecodeError:
                print('LEMMATIZER ERROR: ' + noun[0])
    return nouns
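# Hedged usage sketch: `tagged` is assumed to be a slash-tagged string such as
# "The/DT dogs/NNS barked/VBD"; the function returns every noun token whose
# WordNet lemma equals `lemma`, together with its 1-based position and tag.
print(getNouns("The/DT dogs/NNS barked/VBD ./.", "dog"))
# expected: [('dogs', 2, 'NNS')]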
def lemmatization(texts, allowed_postags, stop_words=stop_words):
    lemma = wordnet.WordNetLemmatizer()
    doc = nlp(texts)
    texts_out = []
    for token in doc:
        if str(token) in top_tags.values:
            texts_out.append(str(token))
        elif token.pos_ in allowed_postags:
            if token.lemma_ not in ['-PRON-']:
                texts_out.append(token.lemma_)
            else:
                texts_out.append('')
    texts_out = ' '.join(texts_out)
    return texts_out
def activate(self, *args, **kwargs):
    self._stopwords = stopwords.words('english')
    self._wnlemma = wordnet.WordNetLemmatizer()
    self._syntactics = {'N': 'n', 'V': 'v', 'J': 'a', 'S': 's', 'R': 'r'}
    local_path = os.environ.get("SENPY_DATA")
    self._categories = {
        'anger': ['general-dislike'],
        'fear': ['negative-fear'],
        'disgust': ['shame'],
        'joy': ['gratitude', 'affective', 'enthusiasm', 'love', 'joy', 'liking'],
        'sadness': ['ingrattitude', 'daze', 'humility', 'compassion', 'despair',
                    'anxiety', 'sadness']
    }
    self._wnaffect_mappings = {
        'anger': 'anger',
        'fear': 'negative-fear',
        'disgust': 'disgust',
        'joy': 'joy',
        'sadness': 'sadness'
    }
    self._load_emotions(self.find_file(self.hierarchy_path))
    if 'total_synsets' not in self.sh:
        total_synsets = self._load_synsets(self.find_file(self.synsets_path))
        self.sh['total_synsets'] = total_synsets
    self._total_synsets = self.sh['total_synsets']
    self._wn16_path = self.wn16_path
    self._wn16 = WordNetCorpusReader(
        self.find_file(self._wn16_path),
        nltk.data.find(self.find_file(self._wn16_path)))
def activate(self, *args, **kwargs):
    nltk.download(['stopwords', 'averaged_perceptron_tagger', 'wordnet'])
    self._stopwords = stopwords.words('english')
    self._wnlemma = wordnet.WordNetLemmatizer()
    self._syntactics = {'N': 'n', 'V': 'v', 'J': 'a', 'S': 's', 'R': 'r'}
    local_path = os.path.dirname(os.path.abspath(__file__))
    self._categories = {
        'anger': ['general-dislike'],
        'fear': ['negative-fear'],
        'disgust': ['shame'],
        'joy': ['gratitude', 'affective', 'enthusiasm', 'love', 'joy', 'liking'],
        'sadness': ['ingrattitude', 'daze', 'humility', 'compassion', 'despair',
                    'anxiety', 'sadness']
    }
    self._wnaffect_mappings = {
        'anger': 'anger',
        'fear': 'negative-fear',
        'disgust': 'disgust',
        'joy': 'joy',
        'sadness': 'sadness'
    }
    self._load_emotions(local_path + self.hierarchy_path)
    if 'total_synsets' not in self.sh:
        total_synsets = self._load_synsets(local_path + self.synsets_path)
        self.sh['total_synsets'] = total_synsets
    self._total_synsets = self.sh['total_synsets']
    self._wn16_path = self.wn16_path
    self._wn16 = WordNetCorpusReader(
        os.path.abspath("{0}".format(local_path + self._wn16_path)),
        nltk.data.find(local_path + self._wn16_path))
def text_normalize(text):
    text = str(text).lower()
    spl_char_text = re.sub(r'[^ a-z]', '', text)
    tokens = nltk.word_tokenize(spl_char_text)
    lema = wordnet.WordNetLemmatizer()
    tags_list = pos_tag(tokens, tagset=None)
    lema_words = []
    for token, pos_token in tags_list:
        if pos_token.startswith('V'):
            pos_value = 'v'
        elif pos_token.startswith('J'):
            pos_value = 'a'
        elif pos_token.startswith('R'):
            pos_value = 'r'
        else:
            pos_value = 'n'
        lema_token = lema.lemmatize(token, pos_value)
        lema_words.append(lema_token)
    return " ".join(lema_words)
def stopword_(text):
    tag_list = pos_tag(nltk.word_tokenize(text), tagset=None)
    stop = stopwords.words('english')
    lema = wordnet.WordNetLemmatizer()
    lema_word = []
    for token, pos_token in tag_list:
        if token in stop:
            continue
        if pos_token.startswith('V'):
            pos_val = 'v'
        elif pos_token.startswith('J'):
            pos_val = 'a'
        elif pos_token.startswith('R'):
            pos_val = 'r'
        else:
            pos_val = 'n'
        lema_token = lema.lemmatize(token, pos_val)
        lema_word.append(lema_token)
    return " ".join(lema_word)
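# Brief usage sketch, assuming nltk, pos_tag, stopwords and wordnet are
# imported as in the surrounding snippets.
print(stopword_("I was walking to the nearest station"))
# likely "I walk near station" -- note the case-sensitive stopword check,
# which is why the capital "I" survives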
def text_normalization(text: str) -> str:
    text = str(text).lower()
    char_text = re.sub(r'[^ a-z]', '', text)
    tokens = word_tokenize(char_text)
    lemma = wordnet.WordNetLemmatizer()
    tags_list = pos_tag(tokens)
    lemma_words = []
    for token, pos_token in tags_list:
        if pos_token.startswith('V'):
            pos_val = 'v'
        elif pos_token.startswith('J'):
            pos_val = 'a'
        elif pos_token.startswith('R'):
            pos_val = 'r'
        else:
            pos_val = 'n'
        lemma_token = lemma.lemmatize(token, pos_val)
        lemma_words.append(lemma_token)
    return ' '.join(lemma_words)
def text_normalization(text):
    text = str(text).lower()  # text to lower case
    spl_char_text = re.sub(r'[^ a-z]', '', text)  # removing special characters
    tokens = nltk.word_tokenize(spl_char_text)  # word tokenizing
    lema = wordnet.WordNetLemmatizer()  # initializing lemmatization
    tags_list = pos_tag(tokens, tagset=None)  # parts of speech
    lema_words = []  # empty list
    for token, pos_token in tags_list:
        if pos_token.startswith('V'):  # verb
            pos_val = 'v'
        elif pos_token.startswith('J'):  # adjective
            pos_val = 'a'
        elif pos_token.startswith('R'):  # adverb
            pos_val = 'r'
        else:  # noun
            pos_val = 'n'
        lema_token = lema.lemmatize(token, pos_val)  # performing lemmatization
        lema_words.append(lema_token)  # appending the lemmatized token into a list
    return " ".join(lema_words)  # return the lemmatized tokens as a sentence
def nltk_cleaning(text):
    token_text = word_tokenize(text)
    clean_text = ["UNK"]
    lemma = wordnet.WordNetLemmatizer()
    tag_list = pos_tag(token_text, tagset=None)
    for token, pos_token in tag_list:
        if token not in '\n\n \n\n\n!"-#$%&()--.*''+,-/:;``<=>[``?@[\\]^_`''{|}~\t\n`\'\'' and (token not in stopwords):
            if pos_token.startswith('V'):  # Verb
                pos_val = 'v'
            elif pos_token.startswith('J'):  # Adjective
                pos_val = 'a'
            elif pos_token.startswith('R'):  # Adverb
                pos_val = 'r'
            else:  # Noun
                pos_val = 'n'
            lemma_token = lemma.lemmatize(token, pos_val)
            clean_text.append(lemma_token.lower())
        else:
            continue
    return " ".join(clean_text)
def text_normalization(dataset):
    text = str(dataset).lower()  # convert input to lowercase
    spl_char_text = re.sub(r'[^a-z0-9]', ' ', text)  # exclude special characters, etc.
    tokens = nltk.word_tokenize(spl_char_text)  # word tokenizing
    lemma = wordnet.WordNetLemmatizer()  # initialize Lemmatizer
    tags_list = pos_tag(tokens, tagset=None)  # the parts of speech of every word
    lemma_words = []
    for token, pos_token in tags_list:
        if pos_token.startswith('V'):  # verb
            pos_val = 'v'
        elif pos_token.startswith('J'):  # adjective
            pos_val = 'a'
        elif pos_token.startswith('R'):  # adverb
            pos_val = 'r'
        else:  # noun
            pos_val = 'n'
        lemma_token = lemma.lemmatize(token, pos_val)  # perform lemmatization
        lemma_words.append(lemma_token)  # append lemmatized token into a list
    return " ".join(lemma_words)  # return lemmatized tokens as a sentence
def fillWordBags():
    stopWords = set(corpus.stopwords.words('english'))
    lmtzr = wordnet.WordNetLemmatizer()
    db = connectDB()
    rows = query(db, 'select id, content from article where wordbag is null')
    sql = ''
    for i, row in enumerate(rows):
        wordbag = collections.Counter(
            lmtzr.lemmatize(word).lower()
            for word in tkn.word_tokenize(row['content'])
            if word.isalnum() and word.lower() not in stopWords
        )
        sql += "update article set wordbag = '%s' where id = %s;\n" \
            % (json.dumps(wordbag), row['id'])
        # Flush the accumulated updates in batches of 100 rows
        if i % 100 == 0:
            print(i)
            execute(db, sql)
            sql = ''
    execute(db, sql)
def __init__(self):
    self.punct = list(punctuation) + ['``', '\'\'', '...']
    self.remove_list = [
        ['could', 'said', 'would', 'told', 'say', 'tell', 'use', 'used', 'mr', 'mrs'],
        ['POS', 'PRP', 'PRP$', 'IN', 'TO', 'CC', 'DT', 'EX', 'LS', 'PDT', 'RP', 'UH', 'CD']
    ]
    self.replace_list = {
        '\'s': 'is',
        '\'re': 'are',
        '\'m': 'am',
        '\'ll': 'will',
        '\'ve': 'have',
        'n\'t': 'not',
        '\'d': 'had'
    }
    self.topmod_list = [
        'NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBN', 'VBP', 'VBZ'
    ]
    self.lemmatizer = wordnet.WordNetLemmatizer()
def text_normalization(txt):
    txt = str(txt).lower()
    # tokenizer = RegexpTokenizer(r'\w+')
    clean_txt = re.sub(r'[^a-z]', ' ', txt)  # remove special char
    tokens = word_tokenize(clean_txt)
    # print(tokens)
    lema = wordnet.WordNetLemmatizer()
    tags_list = pos_tag(tokens, tagset=None)
    lema_words = []
    # pprint(tags_list)
    for token, pos_t in tags_list:
        pos_val = ''
        if pos_t.startswith('V'):
            pos_val = 'v'
        elif pos_t.startswith('J'):
            pos_val = 'a'
        elif pos_t.startswith('R'):
            pos_val = 'r'
        else:
            pos_val = 'n'
        lema_token = lema.lemmatize(token, pos_val)
        lema_words.append(lema_token)
    scenten_with_stopword = " ".join(lema_words)
    return stopword_removing(scenten_with_stopword)
def __init__(self):
    super(EnglishWordNetLemmatizer, self).__init__()
    self._lemmatizer = wordnet.WordNetLemmatizer()
faq.isnull().sum()
faq.shape[0]
faq = faq.rename(columns={'Question': 'Context', 'Answer': 'Text Response'})
df = pd.concat([df, faq], ignore_index=True)
"""

# word tokenizing
s = 'tell me about your personality'
words = word_tokenize(s)

lemma = wordnet.WordNetLemmatizer()  # initializing lemmatizer
lemma.lemmatize('absorbed', pos='v')

pos_tag(nltk.word_tokenize(s), tagset=None)  # returns the parts of speech of every word


# function that performs text normalization steps
def text_normalization(text):
    text = str(text).lower()  # text to lower case
    spl_char_text = re.sub(r'[^ a-z]', '', text)  # removing special characters
    tokens = nltk.word_tokenize(spl_char_text)  # word tokenizing
    lema = wordnet.WordNetLemmatizer()  # initializing lemmatization
    tags_list = pos_tag(tokens, tagset=None)  # parts of speech
    lema_words = []  # empty list
                    type=float, help='Number of hypothesis pairs to evaluate')
parser.add_argument('-b', '--beta', default=3.0,
                    type=float, help='Number of hypothesis pairs to evaluate')
parser.add_argument('-g', '--gamma', default=0.5,
                    type=float, help='Number of hypothesis pairs to evaluate')
opts = parser.parse_args()

cachedStopWords = stopwords.words("english")
wnlemma = wn.WordNetLemmatizer()
ngram_dict = {}


def wn_contains(word, ref):
    synonyms = wdn.synsets(''.join(word))
    synset = set(chain.from_iterable([word.lemma_names() for word in synonyms]))
    refset = set([''.join(r) for r in ref])
    result = bool(synset & refset)
    return result  # check intersection of sets


def levenshtein(s1, s2):
    if len(s1) < len(s2):
        return levenshtein(s2, s1)
from __future__ import print_function
from nltk.stem import PorterStemmer, LancasterStemmer, wordnet

word_list = {
    'runner': 'n',
    'running': 'v',
    'ran': 'v',
    'scientist': 'n',
    'science': 'n',
    'Maltese': 'a',
}

porter = PorterStemmer()
lancaster = LancasterStemmer()
lemmatiser = wordnet.WordNetLemmatizer()

for word, pos in word_list.items():
    print(word, end=' ')
    print(porter.stem(word), end=' ')
    print(lancaster.stem(word), end=' ')
    print(lemmatiser.lemmatize(word, pos=pos), end=' ')
    print()
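# For orientation (exact output can vary across NLTK versions): the two
# stemmers clip suffixes without any dictionary lookup, so Porter turns
# 'science' into the non-word 'scienc', while the WordNet lemmatizer keeps
# 'science' and maps the irregular 'ran' to 'run' because pos='v' is supplied.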
import pandas as pd
import nltk
import numpy as np
import re
import random
from nltk.stem import wordnet  # lemmatization
from nltk import pos_tag
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer  # bow
from sklearn.feature_extraction.text import TfidfVectorizer  # tfidf
from sklearn.metrics import pairwise_distances  # cosine sim

lema = wordnet.WordNetLemmatizer()


def text_lemmatize(text):
    text_lower = str(text).lower()  # to lower
    text_clean = re.sub(r'[^ a-z0-9]', '', text_lower)  # cleaning
    replacement(text_clean, dict_replacement)  # simplification
    tokens = nltk.wordpunct_tokenize(text_clean)  # tokenizing
    tokens_and_tags = pos_tag(tokens, tagset=None)  # pairs word-pos
    lemas_of_words = []
    for token, tag in tokens_and_tags:
        if tag.startswith('V'):  # verb
            new_tag = 'v'
        elif tag.startswith('J'):  # adjective
            new_tag = 'a'
        elif tag.startswith('R'):  # adverb
            new_tag = 'r'
def getVerb(tagged, dep, noun, index):
    nsubj = re.findall(r'nsubj\((\w*)-[0-9]*, %s-%d\)' % (noun, index), dep)
    nobj = re.findall(r'nsubj\(%s-%d, (\w*)-[0-9]*\)' % (noun, index), dep)
    nsubjpass = re.findall(r'nsubjpass\((\w*)-[0-9]*, %s-%d\)' % (noun, index), dep)
    dobj = re.findall(r'dobj\((\w*)-[0-9]*, %s-%d\)' % (noun, index), dep)
    iobj = re.findall(r'iobj\((\w*)-[0-9]*, %s-%d\)' % (noun, index), dep)
    comp = re.findall(r'compound\((\w*)-([0-9]*), %s-%d\)' % (noun, index), dep)
    xcomp = re.findall(r'xcomp\((\w*)-[0-9]*, %s-%d\)' % (noun, index), dep)
    ccomp = re.findall(r'ccomp\((\w*)-[0-9]*, %s-%d\)' % (noun, index), dep)
    # handles cases where noun is the subject of the verb
    if len(nsubj) >= 1:
        stype = getTag(tagged, nsubj[0])
        # handles the copula case, in which the parser uses a non-verb
        # (esp. adjectives) in the nsubj instead of the base verb
        if stype not in verbtag:
            verb = re.findall(r'cop\(%s-[0-9]*, (\w*)-[0-9]*\)' % nsubj[0], dep)
            verb += re.findall(r'cop\(%s-%d, (\w*)-[0-9]*\)' % (noun, index), dep)
            if len(verb) >= 1:
                vtag = getTag(tagged, verb[0])
            else:
                verb = ['']
                vtag = ''
        # handles the gerund case, in which the parser returns the gerund of
        # the vp rather than the base verb
        elif stype == 'VBG':
            verb = re.findall(r'aux\(%s-[0-9]*, (\w*)-[0-9]*\)' % nsubj[0], dep)
            if len(verb) >= 1:
                neg = re.findall(r'neg\(%s-[0-9]*, (\w*)-[0-9]*\)' % nsubj[0], dep)
                vtag = getTag(tagged, verb[0])
                vlemma = wn.WordNetLemmatizer().lemmatize(verb[0], 'v')
                return verb[0], vtag, 'subject', neg, vlemma
            else:
                verb = ['']
                vtag = ''
        # all other cases
        else:
            verb = nsubj
            vtag = stype
        neg = re.findall(r'neg\(%s-[0-9]*, (\w*)-[0-9]*\)' % verb[0], dep)
        vlemma = wn.WordNetLemmatizer().lemmatize(verb[0], 'v')
        return verb[0], vtag, 'subject', neg, vlemma
    if len(nobj) >= 1:
        stype = getTag(tagged, nobj[0])
        # handles the copula case, in which the parser uses a non-verb
        # (esp. adjectives) in the nsubj instead of the base verb
        if stype not in verbtag:
            verb = re.findall(r'cop\(%s-[0-9]*, (\w*)-[0-9]*\)' % nobj[0], dep)
            verb += re.findall(r'cop\(%s-%d, (\w*)-[0-9]*\)' % (noun, index), dep)
            if len(verb) >= 1:
                vtag = getTag(tagged, verb[0])
            else:
                verb = ['']
                vtag = ''
        # handles the gerund case, in which the parser returns the gerund of
        # the vp rather than the base verb
        elif stype == 'VBG':
            verb = re.findall(r'aux\(%s-[0-9]*, (\w*)-[0-9]*\)' % nobj[0], dep)
            neg = re.findall(r'neg\(%s-[0-9]*, (\w*)-[0-9]*\)' % nobj[0], dep)
            if len(verb) >= 1:
                vtag = getTag(tagged, verb[0])
                vlemma = wn.WordNetLemmatizer().lemmatize(verb[0], 'v')
                return verb[0], vtag, 'object', neg, vlemma
            else:
                verb = ['']
                vtag = ''
        # all other cases
        else:
            verb = nobj
            vtag = stype
        neg = re.findall(r'neg\(%s-[0-9]*, (\w*)-[0-9]*\)' % verb[0], dep)
        vlemma = wn.WordNetLemmatizer().lemmatize(verb[0], 'v')
        return verb[0], vtag, 'object', neg, vlemma
    elif len(nsubjpass) >= 1:
        vtag = getTag(tagged, nsubjpass[0])
        neg = re.findall(r'neg\(%s-[0-9]*, (\w*)-[0-9]*\)' % nsubjpass[0], dep)
        vlemma = wn.WordNetLemmatizer().lemmatize(nsubjpass[0], 'v')
        return nsubjpass[0], vtag, 'subject', neg, vlemma
    # handles cases where noun is the object of the verb
    elif len(dobj) >= 1:
        vtag = getTag(tagged, dobj[0])
        neg = re.findall(r'neg\(%s-[0-9]*, (\w*)-[0-9]*\)' % dobj[0], dep)
        vlemma = wn.WordNetLemmatizer().lemmatize(dobj[0], 'v')
        return dobj[0], vtag, 'object', neg, vlemma
    elif len(iobj) >= 1:
        vtag = getTag(tagged, iobj[0])
        neg = re.findall(r'neg\(%s-[0-9]*, (\w*)-[0-9]*\)' % iobj[0], dep)
        vlemma = wn.WordNetLemmatizer().lemmatize(iobj[0], 'v')
        return iobj[0], vtag, 'object', neg, vlemma
    elif len(xcomp) >= 1:
        vtag = getTag(tagged, xcomp[0])
        neg = re.findall(r'neg\(%s-[0-9]*, (\w*)-[0-9]*\)' % xcomp[0], dep)
        vlemma = wn.WordNetLemmatizer().lemmatize(xcomp[0], 'v')
        return xcomp[0], vtag, 'object', neg, vlemma
    elif len(ccomp) >= 1:
        vtag = getTag(tagged, ccomp[0])
        neg = re.findall(r'neg\(%s-[0-9]*, (\w*)-[0-9]*\)' % ccomp[0], dep)
        vlemma = wn.WordNetLemmatizer().lemmatize(ccomp[0], 'v')
        return ccomp[0], vtag, 'object', neg, vlemma
    # handles compound case where noun modifies another noun
    # (that is either the subject or object of the verb)
    elif len(comp) >= 1:
        verbtup = getVerb(tagged, dep, comp[0][0], int(comp[0][1]))
        return verbtup[0], verbtup[1], verbtup[2], verbtup[3], verbtup[4]
    else:
        return '', '', '', '', ''
import graphviz as gv
import nltk
import csv
import webbrowser
import codecs
import nltk.stem.wordnet as wn
from nltk.parse.stanford import StanfordDependencyParser as sdp

lemmatizer = wn.WordNetLemmatizer()
dependency_parser = sdp(
    path_to_jar="stanford-corenlp-full-2016-10-31/stanford-corenlp-3.7.0.jar",
    path_to_models_jar="stanford-corenlp-full-2016-10-31/stanford-corenlp-3.7.0-models.jar")

# Constants
VERB = ['VB', 'VBP', 'VBD', 'VBZ']
NOUN = ['NN', 'NNS', 'VBG', "NNP", "NNPS"]


def load_file(filename="input.txt"):
    """
    loads a text file into a string
    :param filename: name of file to read
    :return: string content of file
    """
    with codecs.open(filename, "r", "utf-8") as f:
        return f.read()


def strip_parens(text):
""" Stems all tokens in the input tokenized text :param tokenized_text: The tokenized text :return: The tokenized text with stemmed words """ return [__stemmer__.stem(token) for token in tokenized_text] def lemmatize_text(tokenized_text): """ Lemmatizes all tokens in the input tokenized text :param tokenized_text: The tokenized text :return: The tokenized text with lemmatized words """ return [__lemmatizer__.lemmatize(token) for token in tokenized_text] #region Private # Locally initialized stop words (optimization) __stop_words__ = co.stopwords.words('english') # Locally initialized stemmer (optimization) __stemmer__ = po.PorterStemmer() # Locally initialized lemmatizer (optimization) __lemmatizer__ = wo.WordNetLemmatizer() #endregion