Example #1
    def __init__(self):
        # Construct the lemmatizer under a lock so the WordNet corpus
        # is only loaded once when running in multiple threads
        lock.acquire()
        self.lemmatizer = wn_stem.WordNetLemmatizer()
        lock.release()
        self.lemmas_dict = {}
        self.synsets_dict = {}
        self.similarity_dict = {}
Example #2
def text_normalization(text):
    # Convert the text to lower case
    text = str(text).lower()
    # Remove unwanted characters
    spl_char_text = re.sub(r'[^ a-z]', '', text)
    # Create word tokens
    tokens = nltk.word_tokenize(spl_char_text)
    # Initialize the lemmatizer
    lema = wordnet.WordNetLemmatizer()
    # Determine parts of speech
    tags_list = pos_tag(tokens, None)
    lema_words = []
    for token, pos_token in tags_list:
        # Verb
        if pos_token.startswith('V'):
            pos_val = 'v'
        # Adjective
        elif pos_token.startswith('J'):
            pos_val = 'a'
        # Adverb
        elif pos_token.startswith('R'):
            pos_val = 'r'
        # Noun
        else:
            pos_val = 'n'
        lema_token = lema.lemmatize(token, pos_val)
        # Append the lemmatized word to the list
        lema_words.append(lema_token)

    return " ".join(lema_words)
Example #3
def lemmatization(texts, allowed_postags, top_tags, stop_words=stop_words):
    ''' It keeps the lemma of the words (the lemma is the uninflected form of a word)
        and drops tokens whose POS tags are not in the allowed set.

        Parameters:

        texts (str): text to lemmatize
        allowed_postags (list): list of allowed POS tags, like NOUN, ADJ, VERB, ADV
        top_tags (list): tokens that are kept verbatim regardless of POS
        '''

    lemma = wordnet.WordNetLemmatizer()
    doc = nlp(texts)
    texts_out = []

    for token in doc:

        if str(token) in top_tags:
            texts_out.append(str(token))

        elif token.pos_ in allowed_postags:

            if token.lemma_ not in ['-PRON-']:
                texts_out.append(token.lemma_)

            else:
                texts_out.append('')

    texts_out = ' '.join(texts_out)

    return texts_out
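This example depends on nlp, top_tags and stop_words being defined elsewhere in its project. A plausible setup, shown purely as an assumption so the snippet can be read on its own, is a spaCy pipeline plus a list of tokens to preserve verbatim:

import spacy
from nltk.corpus import stopwords

nlp = spacy.load('en_core_web_sm')   # provides token.pos_ and token.lemma_
top_tags = []                        # hypothetical: domain-specific tokens to keep as-is
stop_words = stopwords.words('english')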
Example #4
def text_normalize(text):
    global train_counter
    if train_counter % 10000 == 0:
        print(
            str(train_counter) + " sets lemmatized..., " + "Time now: " +
            str(datetime.now()))
    train_counter += 1
    text = str(text).lower()
    spl_char_text = re.sub(r'[^ a-z]', '', text)
    tokens = nltk.word_tokenize(spl_char_text)
    lema = wordnet.WordNetLemmatizer()
    tags_list = pos_tag(tokens, tagset=None)
    lema_words = []
    for token, pos_token in tags_list:
        if pos_token.startswith('V'):
            pos_value = 'v'
        elif pos_token.startswith('J'):
            pos_value = 'a'
        elif pos_token.startswith('R'):
            pos_value = 'r'
        else:
            pos_value = 'n'
        lema_token = lema.lemmatize(token, pos_value)
        lema_words.append(lema_token)
    return " ".join(lema_words)
Example #5
    def __init__(self):
        #Ensuring that the wordnet corpus is loaded, so we can support multithreading
        wn.ensure_loaded()

        self.lemmatizer = wn_stem.WordNetLemmatizer()
        self.lemmas_dict = {}
        self.synsets_dict = {}
        self.similarity_dict = {}
Example #6
def registry(key):
    """
    retrieves objects given keys from config
    """
    if key is None:
        return None
    elif key == 'wordnet':
        return wordnet.WordNetLemmatizer()
    elif key == 'porter':
        return PorterStemmer()
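A hedged usage sketch for the factory above; the two keys are the ones handled in the if/elif chain:

lemmatizer = registry('wordnet')        # WordNetLemmatizer instance
stemmer = registry('porter')            # PorterStemmer instance
print(lemmatizer.lemmatize('geese'))    # 'goose'
print(stemmer.stem('running'))          # 'run'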
Example #7
def clean_text(text: str, stopwords: List[str]) -> List[str]:
    text = re.sub(r"[\"\(\)]", " ", text).lower()
    text = re.sub(r"[\-\_]", "", text)
    lem = wordnet.WordNetLemmatizer()
    if not isinstance(stopwords, set):
        stopwords = set(stopwords)
    return [
        lem.lemmatize(w) for w in nltk.word_tokenize(text)
        if (w not in stopwords and not re.match(r"^.*[^a-zA-Z].*$", w))
    ]
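For example, called with the standard NLTK English stop-word list (a sketch, not part of the original snippet):

from nltk.corpus import stopwords

tokens = clean_text('The "quick" brown foxes (mostly) jumped over 2 logs',
                    stopwords.words('english'))
# Only purely alphabetic, non-stop-word tokens survive, lemmatized with the
# default noun POS, e.g. 'foxes' -> 'fox' and 'logs' -> 'log'.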
Example #8
    def create_lemma_line(self, input_line):
        ''' We create the lemmatizer object '''
        lemma = wordnet.WordNetLemmatizer()
        # This is an array for the current line that we will append values to
        line = []
        for token, ttype in input_line:
            checks = ["a", "v", "r", "n"]
            if ttype[0].lower() not in checks:
                ttype = "n"
            line.append(lemma.lemmatize(token, ttype[0].lower()))
        return {"Lemmas": " ".join(line)}
Example #9
File: Lemmatizer.py Project: nwae/nwae
    def __init__(
        self,
        lang=lf.LangFeatures.LANG_EN,
        # Choice of stemmer type only applies to english
        stemmer_type=TYPE_PORTER_STEMMER):
        self.lang = lf.LangFeatures.map_to_lang_code_iso639_1(lang_code=lang)
        self.stemmer_type = stemmer_type

        # Silly nltk is broken, https://stackoverflow.com/questions/38916452/nltk-download-ssl-certificate-verify-failed
        # TODO Write our own Lemmatizer
        Ssl.disable_ssl_check()

        if lang not in Lemmatizer.SUPPORTED_LANGUAGES:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                     + ': Stemmer for language "' + str(lang) + '" not supported.'
            lg.Log.warning(errmsg)
            raise Exception(errmsg)
        else:
            lg.Log.info(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                + ': Stemmer for lang "' + str(lang) + '" ok'
            )

        self.stemmer = None

        if self.lang == lf.LangFeatures.LANG_EN:
            if self.stemmer_type == Lemmatizer.TYPE_WORDNET_LEMMATIZER:
                nltk.download('wordnet')
                self.stemmer = wordnet.WordNetLemmatizer()
            elif self.stemmer_type == Lemmatizer.TYPE_PORTER_STEMMER:
                self.stemmer = porter.PorterStemmer()
            elif self.stemmer_type == Lemmatizer.TYPE_SNOWBALL_STEMMER:
                self.stemmer = snowball.SnowballStemmer(language='english')
            else:
                raise Exception(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ':Unrecognized stemmer type "' + str(self.stemmer_type) +
                    '".')
            # Call once, because only the first one is slow
            self.stem(word='initialize')
        elif self.lang == lf.LangFeatures.LANG_KO:
            self.stemmer = LemmatizerKorean()
        elif self.lang == lf.LangFeatures.LANG_RU:
            self.stemmer = snowball.SnowballStemmer(language='russian')
        else:
            raise Exception(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Unsupported language "' + str(self.lang) + '"')

        return
Example #10
def getNouns(tagged, lemma):
    tokenized = tagged.split()
    nouns = []
    for i in range(len(tokenized)):
        noun = re.findall(r'(\S*)/N', tokenized[i])
        if len(noun) == 1:
            try:
                lmtz = wn.WordNetLemmatizer().lemmatize(noun[0], 'n')
                if lmtz == lemma:
                    tag = re.findall(r'%s\/(\w*)' % noun[0], tokenized[i])
                    nouns.append((noun[0], i + 1, tag[0]))
            except UnicodeDecodeError:
                print('LEMMATIZER ERROR: ' + noun[0])
    return nouns
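As the regular expressions suggest, tagged is a slash-delimited 'word/TAG' string (one tagged token per whitespace-separated item). A hedged sketch of a call:

tagged = "The/DT dogs/NNS chased/VBD the/DT cars/NNS"
print(getNouns(tagged, "dog"))   # [('dogs', 2, 'NNS')] -- only nouns whose lemma matches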
Example #11
    def lemmatization(texts, allowed_postags, stop_words=stop_words):
        lemma = wordnet.WordNetLemmatizer()
        doc = nlp(texts)
        texts_out = []
        for token in doc:
            if str(token) in top_tags.values:
                texts_out.append(str(token))
            elif token.pos_ in allowed_postags:
                if token.lemma_ not in ['-PRON-']:
                    texts_out.append(token.lemma_)
                else:
                    texts_out.append('')
        texts_out = ' '.join(texts_out)

        return texts_out
Example #12
    def activate(self, *args, **kwargs):

        self._stopwords = stopwords.words('english')
        self._wnlemma = wordnet.WordNetLemmatizer()
        self._syntactics = {'N': 'n', 'V': 'v', 'J': 'a', 'S': 's', 'R': 'r'}
        local_path = os.environ.get("SENPY_DATA")
        self._categories = {
            'anger': [
                'general-dislike',
            ],
            'fear': [
                'negative-fear',
            ],
            'disgust': [
                'shame',
            ],
            'joy':
            ['gratitude', 'affective', 'enthusiasm', 'love', 'joy', 'liking'],
            'sadness': [
                'ingrattitude', 'daze', 'humility', 'compassion', 'despair',
                'anxiety', 'sadness'
            ]
        }

        self._wnaffect_mappings = {
            'anger': 'anger',
            'fear': 'negative-fear',
            'disgust': 'disgust',
            'joy': 'joy',
            'sadness': 'sadness'
        }

        self._load_emotions(self.find_file(self.hierarchy_path))

        if 'total_synsets' not in self.sh:
            total_synsets = self._load_synsets(
                self.find_file(self.synsets_path))
            self.sh['total_synsets'] = total_synsets

        self._total_synsets = self.sh['total_synsets']

        self._wn16_path = self.wn16_path
        self._wn16 = WordNetCorpusReader(
            self.find_file(self._wn16_path),
            nltk.data.find(self.find_file(self._wn16_path)))
Example #13
    def activate(self, *args, **kwargs):

        nltk.download(['stopwords', 'averaged_perceptron_tagger', 'wordnet'])
        self._stopwords = stopwords.words('english')
        self._wnlemma = wordnet.WordNetLemmatizer()
        self._syntactics = {'N': 'n', 'V': 'v', 'J': 'a', 'S': 's', 'R': 'r'}
        local_path = os.path.dirname(os.path.abspath(__file__))
        self._categories = {
            'anger': [
                'general-dislike',
            ],
            'fear': [
                'negative-fear',
            ],
            'disgust': [
                'shame',
            ],
            'joy':
            ['gratitude', 'affective', 'enthusiasm', 'love', 'joy', 'liking'],
            'sadness': [
                'ingrattitude', 'daze', 'humility', 'compassion', 'despair',
                'anxiety', 'sadness'
            ]
        }

        self._wnaffect_mappings = {
            'anger': 'anger',
            'fear': 'negative-fear',
            'disgust': 'disgust',
            'joy': 'joy',
            'sadness': 'sadness'
        }

        self._load_emotions(local_path + self.hierarchy_path)

        if 'total_synsets' not in self.sh:
            total_synsets = self._load_synsets(local_path + self.synsets_path)
            self.sh['total_synsets'] = total_synsets

        self._total_synsets = self.sh['total_synsets']

        self._wn16_path = self.wn16_path
        self._wn16 = WordNetCorpusReader(
            os.path.abspath("{0}".format(local_path + self._wn16_path)),
            nltk.data.find(local_path + self._wn16_path))
Example #14
def text_normalize(text):
    text = str(text).lower()
    spl_char_text = re.sub(r'[^ a-z]', '', text)
    tokens = nltk.word_tokenize(spl_char_text)
    lema = wordnet.WordNetLemmatizer()
    tags_list = pos_tag(tokens, tagset=None)
    lema_words = []
    for token, pos_token in tags_list:
        if pos_token.startswith('V'):
            pos_value = 'v'
        elif pos_token.startswith('J'):
            pos_value = 'a'
        elif pos_token.startswith('R'):
            pos_value = 'r'
        else:
            pos_value = 'n'
        lema_token = lema.lemmatize(token, pos_value)
        lema_words.append(lema_token)
    return " ".join(lema_words)
Example #15
def stopword_(text):
    tag_list = pos_tag(nltk.word_tokenize(text), tagset=None)
    stop = stopwords.words('english')
    lema = wordnet.WordNetLemmatizer()
    lema_word = []
    for token, pos_token in tag_list:
        if token in stop:
            continue
        if pos_token.startswith('V'):
            pos_val = 'v'
        elif pos_token.startswith('J'):
            pos_val = 'a'
        elif pos_token.startswith('R'):
            pos_val = 'r'
        else:
            pos_val = 'n'
        lema_token = lema.lemmatize(token, pos_val)
        lema_word.append(lema_token)
    return " ".join(lema_word)
Example #16
def text_normalization(text: str) -> str:
    text = str(text).lower()
    char_text = re.sub(r'[^ a-z]', '', text)
    tokens = word_tokenize(char_text)
    lemma = wordnet.WordNetLemmatizer()
    tags_list = pos_tag(tokens)
    lemma_words = []
    for token, pos_token in tags_list:
        if pos_token.startswith('V'):
            pos_val = 'v'
        elif pos_token.startswith('J'):
            pos_val = 'a'
        elif pos_token.startswith('R'):
            pos_val = 'r'
        else:
            pos_val = 'n'
        lemma_token = lemma.lemmatize(token, pos_val)
        lemma_words.append(lemma_token)
    return ' '.join(lemma_words)
Example #17
def text_normalization(text):
    text = str(text).lower()  # text to lower case
    spl_char_text = re.sub(r'[^ a-z]', '', text)  # removing special characters
    tokens = nltk.word_tokenize(spl_char_text)  # word tokenizing
    lema = wordnet.WordNetLemmatizer()  # initializing lemmatization
    tags_list = pos_tag(tokens, tagset=None)  # parts of speech
    lema_words = []  # empty list
    for token, pos_token in tags_list:
        if pos_token.startswith('V'):  # verb
            pos_val = 'v'
        elif pos_token.startswith('J'):  # adjective
            pos_val = 'a'
        elif pos_token.startswith('R'):  # adverb
            pos_val = 'r'
        else:
            pos_val = 'n'  # noun
        lema_token = lema.lemmatize(token, pos_val)  # performing lemmatization
        lema_words.append(
            lema_token)  # appending the lemmatized token into a list
    return " ".join(lema_words)  # return the lemmatized as a sentence
Example #18
def nltk_cleaning(text):
    token_text = word_tokenize(text)
    clean_text = ["UNK"]
    lemma = wordnet.WordNetLemmatizer()
    tag_list = pos_tag(token_text, tagset=None)
    for token, pos_token in tag_list:
        if token not in '\n\n \n\n\n!"-#$%&()--.*''+,-/:;``<=>[``?@[\\]^_`''{|}~\t\n`\'\'' and (token not in stopwords):
            if pos_token.startswith('V'):  # Verb
                pos_val = 'v'
            elif pos_token.startswith('J'):  # Adjective
                pos_val = 'a'
            elif pos_token.startswith('R'):  # Adverb
                pos_val = 'r'
            else:
                pos_val = 'n'  # Noun
            lemma_token = lemma.lemmatize(token, pos_val)
            clean_text.append(lemma_token.lower())
        else:
            continue
    return " ".join(clean_text)
Example #19
def text_normalization(dataset):
    text = str(dataset).lower()  #convert input to lowercase
    spl_char_text = re.sub(r'[^a-z0-9]', ' ',
                           text)  #exclude special characters, etc.
    tokens = nltk.word_tokenize(spl_char_text)  #word tokenizing
    lemma = wordnet.WordNetLemmatizer()  #initialize Lemmatizer
    tags_list = pos_tag(tokens,
                        tagset=None)  #the parts of speech of every word
    lemma_words = []
    for token, pos_token in tags_list:
        if pos_token.startswith('V'):  #verb
            pos_val = 'v'
        elif pos_token.startswith('J'):  #adjective
            pos_val = 'a'
        elif pos_token.startswith('R'):  #adverb
            pos_val = 'r'
        else:
            pos_val = 'n'  #noun
        lemma_token = lemma.lemmatize(token, pos_val)  #perform lemmatization
        lemma_words.append(lemma_token)  #append lemmatized token into a list
    return (" ".join(lemma_words))  #return lemmatized tokens as a sentence
Example #20
def fillWordBags():
    stopWords = set(corpus.stopwords.words('english'))
    lmtzr = wordnet.WordNetLemmatizer()
    db = connectDB()
    rows = query(db, 'select id, content from article where wordbag is null')
    sql = ''

    for i, row in enumerate(rows):
        wordbag = collections.Counter(
                lmtzr.lemmatize(word).lower()
                for word in tkn.word_tokenize(row['content'])
                if word.isalnum() and word.lower() not in stopWords
            )

        sql += "update article set wordbag = '%s' where id = %s;\n" \
                % (json.dumps(wordbag), row['id'])

        if i % 100 == 0:
            print(i)
            execute(db, sql)
            sql = ''

    execute(db, sql)
Example #21
    def __init__(self):
        self.punct = list(punctuation) + ['``', '\'\'', '...']
        self.remove_list = [
            ['could', 'said', 'would', 'told', 'say', 'tell', 'use', 'used',
             'mr', 'mrs'],
            ['POS', 'PRP', 'PRP$', 'IN', 'TO', 'CC', 'DT', 'EX', 'LS', 'PDT',
             'RP', 'UH', 'CD'],
        ]
        self.replace_list = {
            '\'s': 'is',
            '\'re': 'are',
            '\'m': 'am',
            '\'ll': 'will',
            '\'ve': 'have',
            'n\'t': 'not',
            '\'d': 'had'
        }
        self.topmod_list = [
            'NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBN', 'VBP', 'VBZ'
        ]
        self.lemmatizer = wordnet.WordNetLemmatizer()
Example #22
def text_normalization(txt):
    txt = str(txt).lower()
    #tokenizer = RegexpTokenizer(r'\w+')
    clean_txt = re.sub(r'[^a-z]', ' ', txt)  #remove special char
    tokens = word_tokenize(clean_txt)
    #print(tokens)
    lema = wordnet.WordNetLemmatizer()
    tags_list = pos_tag(tokens, tagset=None)
    lema_words = []
    #pprint(tags_list)
    for token, pos_t in tags_list:
        pos_val = ''
        if (pos_t.startswith('V')):
            pos_val = 'v'
        elif (pos_t.startswith('J')):
            pos_val = 'a'
        elif (pos_t.startswith('R')):
            pos_val = 'r'
        else:
            pos_val = 'n'
        lema_token = lema.lemmatize(token, pos_val)
        lema_words.append(lema_token)
    sentence_with_stopword = " ".join(lema_words)
    return stopword_removing(sentence_with_stopword)
Example #23
  def __init__(self):
    super(EnglishWordNetLemmatizer, self).__init__()

    self._lemmatizer = wordnet.WordNetLemmatizer()
Example #24
faq.isnull().sum()

faq.shape[0]

faq = faq.rename(columns={'Question': 'Context', 'Answer': 'Text Response'})

df = pd.concat([df, faq], ignore_index=True)
"""

# word tokenizing

s = 'tell me about your personality'
words = word_tokenize(s)

lemma = wordnet.WordNetLemmatizer()  # initializing lemmatizer
lemma.lemmatize('absorbed', pos='v')

pos_tag(nltk.word_tokenize(s),
        tagset=None)  # returns the parts of speech of every word

# function that performs text normalization steps


def text_normalization(text):
    text = str(text).lower()  # text to lower case
    spl_char_text = re.sub(r'[^ a-z]', '', text)  # removing special characters
    tokens = nltk.word_tokenize(spl_char_text)  # word tokenizing
    lema = wordnet.WordNetLemmatizer()  # initializing lemmatization
    tags_list = pos_tag(tokens, tagset=None)  # parts of speech
    lema_words = []  # empty list
Example #25
                    type=float,
                    help='Number of hypothesis pairs to evaluate')
parser.add_argument('-b',
                    '--beta',
                    default=3.0,
                    type=float,
                    help='Number of hypothesis pairs to evaluate')
parser.add_argument('-g',
                    '--gamma',
                    default=0.5,
                    type=float,
                    help='Number of hypothesis pairs to evaluate')
opts = parser.parse_args()

cachedStopWords = stopwords.words("english")
wnlemma = wn.WordNetLemmatizer()
ngram_dict = {}


def wn_contains(word, ref):
    synonyms = wdn.synsets(''.join(word))
    synset = set(chain.from_iterable([word.lemma_names()
                                      for word in synonyms]))
    refset = set([''.join(r) for r in ref])
    result = bool(synset & refset)
    return result  # check intersection of sets


def levenshtein(s1, s2):
    if len(s1) < len(s2):
        return levenshtein(s2, s1)
Example #26
from __future__ import print_function

from nltk.stem import PorterStemmer, LancasterStemmer, wordnet

word_list = {
    'runner': 'n',
    'running': 'v',
    'ran': 'v',
    'scientist': 'n',
    'science': 'n',
    'Maltese': 'a',
}

porter = PorterStemmer()
lancaster = LancasterStemmer()
lemmatiser = wordnet.WordNetLemmatizer()

for word, pos in word_list.items():
    print(word, end=' ')
    print(porter.stem(word), end=' ')
    print(lancaster.stem(word), end=' ')
    print(lemmatiser.lemmatize(word, pos=pos), end=' ')
    print()
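As a rough guide to the output: the Porter and Lancaster stemmers apply suffix-stripping rules and can return truncated, non-dictionary forms, whereas the WordNet lemmatizer only returns valid lemmas for the supplied POS, so 'running' and 'ran' both map to 'run' as verbs while 'runner' and 'scientist' come back unchanged as nouns.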
Example #27
import pandas as pd
import nltk
import numpy as np
import re
import random
from nltk.stem import wordnet  #lemmatization
from nltk import pos_tag
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer  #bow
from sklearn.feature_extraction.text import TfidfVectorizer  #tfidf
from sklearn.metrics import pairwise_distances  #cosine sim

lema = wordnet.WordNetLemmatizer()


def text_lemmatize(text):
    text_lower = str(text).lower()  #to lower
    text_clean = re.sub(r'[^ a-z0-9]', '', text_lower)  #cleaning
    replacement(text_clean, dict_replacement)  #simplification
    tokens = nltk.wordpunct_tokenize(text_clean)  #tokenizing
    tokens_and_tags = pos_tag(tokens, tagset=None)  #pairs word-pos
    lemas_of_words = []

    for token, tag in tokens_and_tags:
        if tag.startswith('V'):  #verb
            new_tag = 'v'
        elif tag.startswith('J'):  #adjective
            new_tag = 'a'
        elif tag.startswith('R'):  #adverb
            new_tag = 'r'
Example #28
def getVerb(tagged, dep, noun, index):
    nsubj = re.findall(r'nsubj\((\w*)-[0-9]*, %s-%d\)' % (noun, index), dep)
    nobj = re.findall(r'nsubj\(%s-%d, (\w*)-[0-9]*\)' % (noun, index), dep)
    nsubjpass = re.findall(r'nsubjpass\((\w*)-[0-9]*, %s-%d\)' % (noun, index),
                           dep)
    dobj = re.findall(r'dobj\((\w*)-[0-9]*, %s-%d\)' % (noun, index), dep)
    iobj = re.findall(r'iobj\((\w*)-[0-9]*, %s-%d\)' % (noun, index), dep)
    comp = re.findall(r'compound\((\w*)-([0-9]*), %s-%d\)' % (noun, index),
                      dep)
    xcomp = re.findall(r'xcomp\((\w*)-[0-9]*, %s-%d\)' % (noun, index), dep)
    ccomp = re.findall(r'ccomp\((\w*)-[0-9]*, %s-%d\)' % (noun, index), dep)
    #handles cases where noun is the subject of the verb
    if len(nsubj) >= 1:
        stype = getTag(tagged, nsubj[0])
        #handles the copula case, in which the parser uses a non-verb(esp. adjectives) in the nsubj instead of the base verb
        if stype not in verbtag:
            verb = re.findall(r'cop\(%s-[0-9]*, (\w*)-[0-9]*\)' % nsubj[0],
                              dep)
            verb += re.findall(r'cop\(%s-%d, (\w*)-[0-9]*\)' % (noun, index),
                               dep)
            if len(verb) >= 1:
                vtag = getTag(tagged, verb[0])
            else:
                verb = ['']
                vtag = ''
        #handles the gerund case, in which the parser returns the gerund of the vp rather than the base verb
        elif stype == 'VBG':
            verb = re.findall(r'aux\(%s-[0-9]*, (\w*)-[0-9]*\)' % nsubj[0],
                              dep)
            if len(verb) >= 1:
                neg = re.findall(r'neg\(%s-[0-9]*, (\w*)-[0-9]*\)' % nsubj[0],
                                 dep)
                vtag = getTag(tagged, verb[0])
                vlemma = wn.WordNetLemmatizer().lemmatize(verb[0], 'v')
                return verb[0], vtag, 'subject', neg, vlemma
            else:
                verb = ['']
                vtag = ''
        #all other cases
        else:
            verb = nsubj
            vtag = stype
        neg = re.findall(r'neg\(%s-[0-9]*, (\w*)-[0-9]*\)' % verb[0], dep)
        vlemma = wn.WordNetLemmatizer().lemmatize(verb[0], 'v')
        return verb[0], vtag, 'subject', neg, vlemma
    if len(nobj) >= 1:
        stype = getTag(tagged, nobj[0])
        #handles the copula case, in which the parser uses a non-verb(esp. adjectives) in the nsubj instead of the base verb
        if stype not in verbtag:
            verb = re.findall(r'cop\(%s-[0-9]*, (\w*)-[0-9]*\)' % nobj[0], dep)
            verb += re.findall(r'cop\(%s-%d, (\w*)-[0-9]*\)' % (noun, index),
                               dep)
            if len(verb) >= 1:
                vtag = getTag(tagged, verb[0])
            else:
                verb = ['']
                vtag = ''
        #handles the gerund case, in which the parser returns the gerund of the vp rather than the base verb
        elif stype == 'VBG':
            verb = re.findall(r'aux\(%s-[0-9]*, (\w*)-[0-9]*\)' % nobj[0], dep)
            neg = re.findall(r'neg\(%s-[0-9]*, (\w*)-[0-9]*\)' % nobj[0], dep)
            if len(verb) >= 1:
                vtag = getTag(tagged, verb[0])
                vlemma = wn.WordNetLemmatizer().lemmatize(verb[0], 'v')
                return verb[0], vtag, 'object', neg, vlemma
            else:
                verb = ['']
                vtag = ''
        #all other cases
        else:
            verb = nobj
            vtag = stype
        neg = re.findall(r'neg\(%s-[0-9]*, (\w*)-[0-9]*\)' % verb[0], dep)
        vlemma = wn.WordNetLemmatizer().lemmatize(verb[0], 'v')
        return verb[0], vtag, 'object', neg, vlemma
    elif len(nsubjpass) >= 1:
        vtag = getTag(tagged, nsubjpass[0])
        neg = re.findall(r'neg\(%s-[0-9]*, (\w*)-[0-9]*\)' % nsubjpass[0], dep)
        vlemma = wn.WordNetLemmatizer().lemmatize(nsubjpass[0], 'v')
        return nsubjpass[0], vtag, 'subject', neg, vlemma
    #handles cases where noun is the object of the verb
    elif len(dobj) >= 1:
        vtag = getTag(tagged, dobj[0])
        neg = re.findall(r'neg\(%s-[0-9]*, (\w*)-[0-9]*\)' % dobj[0], dep)
        vlemma = wn.WordNetLemmatizer().lemmatize(dobj[0], 'v')
        return dobj[0], vtag, 'object', neg, vlemma
    elif len(iobj) >= 1:
        vtag = getTag(tagged, iobj[0])
        neg = re.findall(r'neg\(%s-[0-9]*, (\w*)-[0-9]*\)' % iobj[0], dep)
        vlemma = wn.WordNetLemmatizer().lemmatize(iobj[0], 'v')
        return iobj[0], vtag, 'object', neg, vlemma
    elif len(xcomp) >= 1:
        vtag = getTag(tagged, xcomp[0])
        neg = re.findall(r'neg\(%s-[0-9]*, (\w*)-[0-9]*\)' % xcomp[0], dep)
        vlemma = wn.WordNetLemmatizer().lemmatize(xcomp[0], 'v')
        return xcomp[0], vtag, 'object', neg, vlemma
    elif len(ccomp) >= 1:
        vtag = getTag(tagged, ccomp[0])
        neg = re.findall(r'neg\(%s-[0-9]*, (\w*)-[0-9]*\)' % ccomp[0], dep)
        vlemma = wn.WordNetLemmatizer().lemmatize(ccomp[0], 'v')
        return ccomp[0], vtag, 'object', neg, vlemma
    #handles compound case where noun modifies another noun (that is either the subject or object of the verb)
    elif len(comp) >= 1:
        verbtup = getVerb(tagged, dep, comp[0][0], int(comp[0][1]))
        return verbtup[0], verbtup[1], verbtup[2], verbtup[3], verbtup[4]
    else:
        return '', '', '', '', ''
Example #29
import graphviz as gv
import nltk
import csv
import webbrowser
import codecs
import nltk.stem.wordnet as wn
from nltk.parse.stanford import StanfordDependencyParser as sdp

lemmatizer = wn.WordNetLemmatizer()

dependency_parser = sdp(
    path_to_jar="stanford-corenlp-full-2016-10-31/stanford-corenlp-3.7.0.jar",
    path_to_models_jar=
    "stanford-corenlp-full-2016-10-31/stanford-corenlp-3.7.0-models.jar")

# Constants
VERB = ['VB', 'VBP', 'VBD', 'VBZ']
NOUN = ['NN', 'NNS', 'VBG', "NNP", "NNPS"]


def load_file(filename="input.txt"):
    """ loads a text file into a string
    
    :param filename: name of file to read
    :return: string content of file
    """
    with codecs.open(filename, "r", "utf-8") as f:
        return f.read()


def strip_parens(text):
Example #30
    """
    Stems all tokens in the input tokenized text
    :param tokenized_text: The tokenized text
    :return: The tokenized text with stemmed words
    """

    return [__stemmer__.stem(token) for token in tokenized_text]


def lemmatize_text(tokenized_text):
    """
    Lemmatizes all tokens in the input tokenized text
    :param tokenized_text: The tokenized text
    :return: The tokenized text with lemmatized words
    """

    return [__lemmatizer__.lemmatize(token) for token in tokenized_text]


#region Private

# Locally initialized stop words (optimization)
__stop_words__ = co.stopwords.words('english')

# Locally initialized stemmer (optimization)
__stemmer__ = po.PorterStemmer()

# Locally initialized lemmatizer (optimization)
__lemmatizer__ = wo.WordNetLemmatizer()

#endregion
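A minimal usage sketch for the two helpers above, assuming the module aliases po, wo and co refer to nltk.stem.porter, nltk.stem.wordnet and nltk.corpus (which is how the private objects are constructed):

tokens = ["the", "scientists", "were", "running", "experiments"]
print(stem_text(tokens))        # Porter stems, e.g. 'scientists' -> 'scientist', 'running' -> 'run'
print(lemmatize_text(tokens))   # noun lemmas by default, e.g. 'scientists' -> 'scientist'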