Exemplos de search em Python, exemplos de nltk.re.search em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: Bootstrapper.py Projeto: cpalenmichel/Master-Thesis

 def find_npnp_patterns(self, wordpair, doc):
     """Collect NP-NP patterns for every sentence containing both words of a pair.

     A sentence qualifies when both ``wordpair.anaphor.token`` and
     ``wordpair.antecedent.token`` appear among its word tokens.  The
     sentence is then rebuilt as a space-joined string and searched in both
     orders; the text captured between the two words becomes the pattern
     body.  X stands for the anaphor and Y for the antecedent.

     :param wordpair: object exposing ``anaphor.token`` and ``antecedent.token``.
     :param doc: object exposing ``sentences``, each with a ``words`` list
         whose items have a ``token`` attribute.
     :return: list of ``NPNP`` objects, ``<X>(...)<Y>`` for anaphor-first
         matches and ``<Y>(...)<X>`` for antecedent-first matches.
     """
     # TODO Make an easier switch from head and full NP?
     anaphor = wordpair.anaphor.token
     antecedent = wordpair.antecedent.token

     # The tokens are literal text, so escape them: a token such as "c++"
     # or "(" would otherwise be read as regex syntax and either raise
     # re.error or match the wrong span.
     anaphor_re = re.escape(anaphor)
     antecedent_re = re.escape(antecedent)
     # The patterns do not depend on the sentence, so compile them once.
     pattern1 = re.compile('(' + anaphor_re + ')(.*)(' + antecedent_re + ')')
     pattern2 = re.compile('(' + antecedent_re + ')(.*)(' + anaphor_re + ')')

     ret = []
     for s in doc.sentences:
         tokens = [tok.token for tok in s.words]
         if anaphor not in tokens or antecedent not in tokens:
             continue
         sent_str = ' '.join(tokens)
         match1 = pattern1.search(sent_str)
         match2 = pattern2.search(sent_str)
         if match1:
             print(match1.group(1, 2, 3))
             ret.append(NPNP('<X>(' + match1.group(2) + ')<Y>'))
         if match2:
             print(match2.group(1, 2, 3))
             ret.append(NPNP('<Y>(' + match2.group(2) + ')<X>'))
     return ret

Exemplo n.º 2

0

Exibir arquivo

def findCurrency(text):
    """Print the currency and amount of every money expression in ``text``.

    ``text`` is iterated item by item and each item is searched as a string.
    An expression is either a ``$``/``£`` sign followed by a number, or a
    number followed by ``euro(s)``, ``dollar(s)``, ``pound(s)`` or ``p``;
    both forms accept an optional ``bn``/``m`` magnitude suffix.  Matching
    is case-insensitive.

    For each match a "Found a match!" report is printed with the detected
    currency and the amount.  The amount is the match with its leading
    currency sign or trailing currency word (and the whitespace before it)
    removed, so ``$50bn`` is reported as ``50bn`` and ``20 euros`` as ``20``.
    """
    # Raw strings keep the regex escapes (\$, \d, ...) out of Python's own
    # string-escape processing.
    money_re = re.compile(
        r'((?:(?:\$|£)(?:\d+)(?:\.?\d*,?\d{1,3})(?:bn|m)?)|'
        r'(?:(?:\d+)(?:\.?,?\d)*(?:bn|m)?(?: ?euros?| ?dollars?| ?pounds?| ?p)))',
        re.IGNORECASE)
    # Removes only the currency marker itself.  str.strip() with a character
    # set also ate the "n" of "bn" and left the space before a currency word.
    marker_re = re.compile(r'^[$£]|\s*(?:euros?|dollars?|pounds?|p)$',
                           re.IGNORECASE)

    for i in text:
        for m in money_re.findall(i):
            # Later checks override earlier ones, so precedence is
            # Euro > Pound > Dollar if a match ever satisfies several.
            if re.search(r'\$|dollars?', m, re.IGNORECASE):
                currency = "Dollar"
            if re.search(r'\£|pounds?|p', m, re.IGNORECASE):
                currency = "Pound"
            if re.search(r'euros?', m, re.IGNORECASE):
                currency = "Euro"

            amount = marker_re.sub('', m)

            print("Found a match!" + "\nCurrency:", currency, "\nAmount:",
                  amount, "\n")

Exemplo n.º 3

0

Exibir arquivo

def prob(w1, w2):
    # prob = count(w1 | w2) / count(w1)
    key = str(w1) + " " + str(w2)
    count_w1_w2 = [w for w in sents if re.search(key, w)]
    key = str(w1)
    count_w1 = [w for w in sents if re.search(key, w)]

    return len(count_w1_w2) / float(len(count_w1))

Exemplo n.º 4

0

Exibir arquivo

Arquivo: Csv.py Projeto: musielena/ChangeMyView-Project

def write_word_comment_position(data_p, input_word, input_re, register):
    """Write a CSV of the positions of every sentence matching ``input_re``.

    The file is ``data/positions/<input_word>_<register>.csv`` (spaces in
    ``input_word`` become underscores) and its columns come from the
    module-level ``fieldnames``.  For each comment in ``thread[register]``
    of each thread in ``data_p``, every paragraph in
    ``comment['text_paragraphs']`` is split with ``sent_tokenize`` and one
    row is written per sentence that ``input_re`` matches:

    * ``sentence_position``  - 1-based position within the comment, capped
      at ``sentence_count``;
    * ``sentence_count``     - ``len(comment['text_sentences'])``;
    * ``paragraph_position`` - 1-based position within its paragraph;
    * ``paragraph_length``   - number of sentences in that paragraph.
    """
    out_path = ("data/positions/" + input_word.replace(" ", "_") + "_" +
                register + ".csv")
    # newline="" lets the csv module control line endings itself; without it
    # rows get an extra "\r" on platforms that translate "\n".
    with open(out_path, "w+", newline="") as raw:
        writer = csv.DictWriter(raw, fieldnames=fieldnames)
        writer.writeheader()

        for thread in data_p:
            for comment in thread[register]:
                # 1-based index of the first sentence of the current paragraph.
                last_sentence = 1
                sentence_count = len(comment['text_sentences'])
                for paragraph in comment['text_paragraphs']:
                    sentence_tokenized = sent_tokenize(paragraph)
                    for sentence_i, sentence in enumerate(sentence_tokenized):
                        if not re.search(input_re, sentence):
                            continue
                        # Re-tokenizing per paragraph can yield more sentences
                        # than 'text_sentences' holds, hence the cap.
                        sent_pos = min(last_sentence + sentence_i,
                                       sentence_count)
                        writer.writerow({
                            'sentence_position': sent_pos,
                            'sentence_count': sentence_count,
                            'paragraph_position': sentence_i + 1,
                            'paragraph_length': len(sentence_tokenized),
                        })
                    last_sentence += len(sentence_tokenized)

Exemplo n.º 5

0

Exibir arquivo

Arquivo: LemmaTokenizer.py Projeto: josejuanmartinez/news_classifier

 def __call__(self, articles):
     """Tokenize ``articles`` and return the lemmas that pass the filter.

     Each token from ``word_tokenize`` is lemmatized with ``self.wnl``.  A
     lemma is kept only when it is not in ``stop_words.ENGLISH_STOP_WORDS``
     and consists solely of ASCII letters.

     :param articles: text to tokenize.
     :return: list of accepted lemmas, in token order.
     """
     lemmas = list()
     for t in word_tokenize(articles):
         lemma = self.wnl.lemmatize(t)
         if lemma not in stop_words.ENGLISH_STOP_WORDS and re.search(r'^[a-zA-Z]+$', lemma) is not None:
             lemmas.append(lemma)
         # A second, unconditional append used to follow here.  It let every
         # rejected lemma through and duplicated every accepted one.
     return lemmas

Exemplo n.º 6

0

Exibir arquivo

def tokenize(text):
    """Return the tokens of ``text`` that contain at least one word character.

    Tokens made up only of punctuation are dropped; punctuation inside a
    token that also has a letter, digit or underscore is left untouched.
    """
    from nltk import word_tokenize, re

    # Raw string: "\w" in a normal literal is an invalid string escape.
    return [word for word in word_tokenize(text) if re.search(r"\w", word)]

Exemplo n.º 7

0

Exibir arquivo

 def get_text(self, url):
     """Fetch ``url`` and return the text found inside ``<div id="content">``.

     The text nodes of each matching div are concatenated, each followed by
     a blank line.  Within a div, collection stops at the first text node
     containing the '▶' marker; later divs are still processed.
     """
     page = BeautifulSoup(requests.get(url).text, 'html.parser')
     pieces = []
     for container in page.find_all('div', {'id': 'content'}):
         for node in container.find_all(text=True):
             if re.search('▶', node) is not None:
                 # Ends this container only, not the whole page.
                 break
             pieces.append(node + "\n\n")
     return ''.join(pieces)

Exemplo n.º 8

0

Exibir arquivo

def Select_Keywords_And_Phrases(tagged_data):
    """Select keywords and key phrases from POS-tagged page sections.

    :param tagged_data: iterable of pairs ``(section_tag, data)`` where
        ``section_tag`` is a key of the weighting table ("title", "meta",
        "h" or "contents") and ``data`` is a list of ``(word, pos_tag)``
        tuples.
    :return: tuple ``(keywords, key_phrases)`` of lists of strings.

    Selection rules, as implemented:

    * Title/meta sections are taken directly: a single-word section becomes
      a keyword, a longer one becomes a key phrase (lemmatized, de-duplicated).
    * Noun phrases of 2-4 words found by the chunk grammar accumulate a
      phrase frequency; the 7 highest-scoring phrases are added as key phrases.
    * Non-stop-word nouns/verbs/adjectives accumulate a section-weighted
      word frequency.  Words longer than 3 characters whose frequency lies
      strictly between the average and ``count / average`` become keywords,
      where both figures are computed over frequencies greater than 1.
    """
    # Variables used for keyword/phrase selection.
    tag_weightings = {"title": 5, "meta": 5, "h": 3, "contents": 1}
    # A set gives constant-time stop-word checks.
    stop_words = set(stopwords.words('english'))
    grammar = """
            NBAR:
                {<NN.*|JJ>*<NN.*>}
            NP:
                {<NBAR>}
                {<NBAR><IN><NBAR>}
            """
    # The grammar never changes, so build the chunker once, not once per pair.
    chunker = nltk.RegexpParser(grammar)

    # Variables to store information for selection.
    keywords = []
    key_phrases = []
    word_frequency = {}
    phrase_frequency = {}

    # Iterate through the pairs of data, keeping track of the weighting modifier.
    for pair in tagged_data:
        section_tag = pair[0]
        data = pair[1]
        weighted_value = tag_weightings[section_tag]

        # If meta or title data, add it to the keyphrases/keywords lists.
        if section_tag in ("title", "meta"):
            if len(data) == 1:
                keywords.append(Lemmatize(data[0][0], data[0][1]))
            else:
                key_phrase = " ".join(
                    Lemmatize(word, pos) for word, pos in data)
                if key_phrase not in key_phrases:
                    key_phrases.append(key_phrase)

        # Use POS patterns to find noun phrases, then add weighted values to phrase frequency.
        for subtree in chunker.parse(data).subtrees():
            if subtree.label() != "NP":
                continue
            leaves = subtree.leaves()
            if not 1 < len(leaves) < 5:
                continue
            noun_phrase = " ".join(
                Lemmatize(word, pos) for word, pos in leaves)
            # NOTE(review): only three-word phrases get the section weight;
            # confirm this is intended rather than "every phrase".
            modifier = weighted_value * 3 if len(leaves) == 3 else 1
            phrase_frequency[noun_phrase] = (
                phrase_frequency.get(noun_phrase, 0.0) + 1.0 * modifier)

        # Ignore stop-words, check for selected POS tag, then add weighted values to word frequency.
        for word, pos in data:
            if not re.search(r"\w", word):
                continue  # punctuation-only token
            word = word.lower()
            if word in stop_words or pos[0:2] not in ("NN", "VB", "JJ"):
                continue
            lemma = Lemmatize(word, pos)
            word_frequency[lemma] = (
                word_frequency.get(lemma, 0.0) + 1.0 * weighted_value)

    # Find values from the word frequencies to select keywords.
    repeated = [value for value in word_frequency.values() if value > 1]
    count = len(repeated)

    # With no frequency above 1 there is nothing to average; skipping the
    # selection avoids the ZeroDivisionError the unguarded division raised.
    if count:
        average = sum(repeated) / count
        cap = count / average

        # Select keywords from the frequencies.
        for key, value in word_frequency.items():
            if (average < value < cap and len(key) > 3
                    and key not in keywords):
                keywords.append(key)

    # Select key phrases from the frequencies: the 7 best-scoring phrases,
    # taken from the reversed ascending sort so tie order is unchanged.
    ranked = sorted(phrase_frequency.items(), key=operator.itemgetter(1))
    for key, _ in ranked[::-1][:7]:
        if key not in key_phrases:
            key_phrases.append(key)

    return keywords, key_phrases

Exemplo n.º 9

0

Exibir arquivo

This is a temporary script file.
"""

import nltk
from nltk import re, word_tokenize, FreqDist, MLEProbDist, probability

# Open the data and vocabulary documents.  The context managers close the
# files once read, and the raw strings keep the backslash in the Windows-style
# paths literal instead of relying on "\s" being an unknown string escape.
with open(r'a01_data\sampledata.txt') as f1:
    dataRaw = f1.read()
with open(r'a01_data\sampledata.vocab.txt') as f2:
    vocabRaw = f2.read()

# Calculate the frequency distribution.
# Keep tokens containing a word character, and drop the 's' and '/s' marker
# tokens in the same pass.  Removing them with list.remove() while iterating
# the same list skipped elements and left some markers behind.
dataRaw_tokens_nopunct = [
    word for word in word_tokenize(dataRaw)
    if re.search(r"\w", word) and word not in ('s', '/s')
]
dataRaw_fdist = FreqDist(dataRaw_tokens_nopunct)
##xx = dataRaw_fdist.most_common()
vocabRaw_tokens_nopunct = [
    word for word in word_tokenize(vocabRaw) if re.search(r"\w", word)
]

# calculate the probability distribution

Exemplo n.º 10

0

Exibir arquivo

Arquivo: InformationRetrievalParser.py Projeto: omarraja786/InformationRetrieval

def _write_stage_output(file_name, payload):
    """Write ``payload`` to ``file_name`` and close the file."""
    with open(file_name, 'w') as stage_file:
        stage_file.write(payload)


def processURL(url):
    """Download ``url``, run it through the text pipeline and return its words.

    The stages are: HTML-to-text (script and style elements removed),
    tokenization with lowercasing and punctuation removal, stemming,
    stop-word removal, POS tagging and per-item counting.  Every stage
    writes its result to a ``File <count> ...txt`` file in the working
    directory and prints a confirmation.

    Side effects: increments the module-level ``count`` and appends the
    stop-word-filtered word list to the module-level ``wordList``.

    :param url: address of the page to process.
    :return: the stemmed, stop-word-filtered list of words.
    """
    global count
    global wordList

    # html page parsing
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, features="html.parser")
    # remove javascript and css style from the parsed text.
    for js in soup(["script", "style"]):
        js.decompose()

    count += 1
    text = soup.get_text()

    # process parsed text: trim lines, split on double spaces, drop blanks
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    # after each part of the processing, produces a file output for each document
    # (this first file name really has a space before ".txt")
    _write_stage_output('File ' + str(count) + ' HTML parsing output ' + '.txt',
                        text)
    print("File " + str(count) + " HTML Parsing output.txt saved")

    # Sentence splitting, tokenization and normalization process
    tokens = word_tokenize(text)

    # remove punctuation-only tokens and lowercase the rest
    tokens_nopunct = [word.lower() for word in tokens if re.search(r"\w", word)]
    _write_stage_output(
        'File ' + str(count) + ' SS,Tokenization,Normalization output' + '.txt',
        str(tokens_nopunct))
    print("File " + str(count) + " SS,Tokenization,Normalization output.txt saved")

    # Stemming (reduce each word to its stem)
    tokens_nopunct = [stem(word) for word in tokens_nopunct]
    _write_stage_output('File ' + str(count) + ' Stemming output' + '.txt',
                        str(tokens_nopunct))
    print("File " + str(count) + " Stemming output.txt saved")

    # remove stopwords
    # https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
    stop_words = set(stopwords.words('english'))
    filtered = [w for w in tokens_nopunct if w not in stop_words]
    _write_stage_output('File ' + str(count) + ' Stopwords Removed' + '.txt',
                        str(filtered))
    print("File " + str(count) + " Stopwords Removed.txt saved")

    # postagging
    tagged = nltk.pos_tag(filtered)
    _write_stage_output('File ' + str(count) + ' PosTagging output' + '.txt',
                        str(tagged))
    wordList.append(filtered)
    print("File " + str(count) + " PosTagging output.txt saved")

    # count occurences of each item of the flattened tagged list
    # https://stackoverflow.com/questions/2600191/how-can-i-count-the-occurrences-of-a-list-item
    counts = [(i, len(list(c))) for i, c in groupby(sorted(flatten(tagged)))]
    _write_stage_output('File ' + str(count) + ' Count Output' + '.txt',
                        str(counts))
    print("File " + str(count) + " Count Output.txt saved")

    return filtered

Exemplo n.º 11

0

Exibir arquivo

Arquivo: old_parser.py Projeto: hallelhel/Search_Engine

    def text_operation(self, text):
        """
        Walk a list of tokens and normalise each one according to its kind.

        Per token, in order: trailing punctuation/non-ASCII characters are
        stripped (a trailing '%' is left alone), characters outside ASCII
        35..126 are removed, and terms shorter than two characters are
        dropped. The cleaned term is then routed to the first matching case:
        hashtag, URL, number/date/percentage, "covid" prefix, capitalised
        pair (entity), month followed by a number, or plain term.

        Note: cleaned terms are written back into ``text``, so the caller's
        list is modified in place.

        :param text: list of string tokens.
        :return: list of parsed tokens.
        """
        len_text = len(text)
        tokenAfterParse = []
        counter = -1  # index of the current term in text
        for term in text:
            counter = counter + 1
            # self.per / self.per2 are skip flags: per skips the current token,
            # per2 skips the current token and arms per to skip the next one.
            # per is set below after merging "<number> percent"; per2 is not set
            # in this method (presumably by a helper -- verify).
            if self.per == True:
                self.per = False
                continue
            if self.per2 == True:
                self.per = True
                self.per2 = False
                continue
            if term == " " or term == '':
                continue

            if term[-1] in string.punctuation or ord(term[-1]) < 48 or ord(
                    term[-1]
            ) > 127:  # to remove anything that is not a word or number
                # A term ending in '%' is kept as is; otherwise strip the tail
                # until it ends with an acceptable character.
                if term[-1] != '%':
                    while term[-1] in string.punctuation or ord(
                            term[-1]) < 48 or ord(term[-1]) > 127:
                        term = term[:-1]
                        if term == "":
                            break
                if term == "":
                    continue
                #text[counter] = term FIXME happen in line 160

            ##new- remove emoji in middle of term
            term = ''.join(
                [l for l in term if ord(l) < 127 and ord(l) > 34]
            )  # remove every unneccery part in term, add ascii between 35 to 126
            if len(term) < 2:
                continue
            # Write the cleaned term back so look-ahead on text sees it.
            text[counter] = term

            ##new
            # hashtag & tags cases:
            if term[0] in string.punctuation:
                if term[0] == '#' and len(term) > 2:
                    # if len(term) == 2: FIXME why to add hashatgs with len 2?
                    #     tokenAfterParse.append(term)
                    #     tokenAfterParse.append(term[1])
                    #   continue
                    words = self.hashtag_tokenize(
                        term[1:]
                    )  # this func split the words and add the original hashtag with lower case to words
                    tokenAfterParse.extend(words)
                    continue
                elif term[0] != '@':
                    # Strip leading punctuation, but leave '@' mentions untouched.
                    # The loop stops once fewer than two characters remain.
                    while term[0] in string.punctuation:
                        term = term[1:]
                        if len(term) < 2:
                            break
                    text[counter] = term
            # if ord(term[0]) > 127 or term[0] in string.punctuation:  # to remove anything that is not a word or number # maybe we need while
            #     if term[0] == '#' and len(term) > 1:
            #         if len(term) == 2:
            #             tokenAfterParse.append(term)
            #             tokenAfterParse.append(term[1])
            #             continue
            #         words = self.hashtag_tokenize(term[1:])
            #         tokenAfterParse.extend(words)
            #         # tokenAfterParse.append(term.lower())
            #         continue
            #     elif term[0] != '@':
            #         while ord(term[0]) > 127 or term[0] in string.punctuation:
            #             term = term[1:]
            #             if term == "":
            #                 break
            #         if term == "":
            #             continue
            #         text[counter] = term
            # url case:
            if "http" in term:
                # if ord(term[-1]) == 8230: FIXME no need
                #     continue
                # Keep only the part starting at the first "http".
                term = term[term.find('http'):].strip()
                urls = self.url_Opretion(term)
                tokenAfterParse.extend(urls)
                continue

            # number cases - dates/percentage:
            # Applies to terms that start with a digit and contain no ASCII letter.
            if (term.isdigit() or
                    term[0].isdigit()) and not (re.search('[a-zA-Z]', term)):
                if counter + 1 < len_text and term.isdigit():
                    if text[counter + 1] in self.month_dict:  # Date
                        tokenAfterParse.append(
                            self.Date_Toknize(term, text, counter, len_text))
                        continue
                    if text[counter + 1] == "percent" or text[
                            counter + 1] == "percentage" or text[
                                counter + 1] == "Percent" or text[
                                    counter + 1] == "Percentage":  # %
                        # Merge "<number> percent" into one token and skip the
                        # "percent" token on the next iteration.
                        new_word = term + text[counter + 1]
                        tokenAfterParse.append(new_word)
                        self.per = True
                        continue
                new_number = self.numbeOpertion(term, text, counter, len_text)
                tokenAfterParse.append(new_number)
                continue

            # try to minimize the covid terms
            if term.startswith('covid') or term.startswith(
                    'Covid') or term.startswith('COVID'):
                tokenAfterParse.append('covid19')
                continue

            # check entity
            if counter + 1 < len_text:
                # NOTE(review): text[counter + 1] is the raw, not yet cleaned,
                # next token; an empty string there would raise IndexError.
                if term[0].isupper() and text[
                        counter + 1][0].isupper():  # words with big letter
                    name = self.entity(text, counter, len_text)
                    tokenAfterParse.append(name)
                    tokenAfterParse.append(term)
                    continue
                if term in self.month_dict and text[counter + 1].isdigit():
                    tokenAfterParse.append(
                        self.Date_Toknize(term, text, counter, len_text))
                    continue
            # replace every number from one to ten to digits:
            # NOTE(review): this elif pairs with "if counter + 1 < len_text", so
            # the replacement only runs for the last token -- confirm intent.
            elif term in self.dict_numbers.keys():
                term = self.dict_numbers[term]

            tokenAfterParse.append(term)
        return tokenAfterParse

Exemplo n.º 12

0

Exibir arquivo

Arquivo: Task_4.py Projeto: ishwarvenugopal/Natural_Language_Engineering

#Members: Ishwar Venugopal [1906084], Shreya Jadhav [1702121]

#Task 4: Listing out the top 10 similar pairs

from nltk.corpus import wordnet
import nltk
from nltk import re
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')
from itertools import product
import operator
import pandas as pd

# Read the corpus; the context manager closes the file once it is loaded.
with open("text1.txt", encoding="utf8") as corpus_file:
    data = corpus_file.read()

# Tokenize, drop punctuation-only tokens, lowercase and lemmatize.
tokens = nltk.word_tokenize(data)
tokens_nopunct = [word for word in tokens if re.search(r"\w", word)]
tokens_lower = [x.lower() for x in tokens_nopunct]
lemmatizer = WordNetLemmatizer()
tokens_lem = [lemmatizer.lemmatize(x) for x in tokens_lower]
vocab = set(tokens_lem)

# Look up each word's synsets once.  Doing it inside the pair loop repeated
# the same WordNet query for every one of the len(vocab)**2 pairs.
synsets_by_word = {word: wordnet.synsets(word) for word in vocab}

# Path similarity between the first synsets of every ordered pair of
# vocabulary words (a word paired with itself included); words without any
# synset are skipped.
sims = []

for word1 in vocab:
    for word2 in vocab:
        w1 = synsets_by_word[word1]
        w2 = synsets_by_word[word2]
        if w1 and w2:
            val = w1[0].path_similarity(w2[0])
            sims.append((word1, word2, val))

Exemplo n.º 13

0

Exibir arquivo

def valid_sents(sents, key):
    """Store in the global ``valid_sent`` every item of ``sents`` that ``key`` matches.

    ``key`` is used as a ``re.search`` pattern.  Nothing is returned; the
    result replaces the previous value of ``valid_sent``.
    """
    global valid_sent
    matching = []
    for sentence in sents:
        if re.search(key, sentence):
            matching.append(sentence)
    valid_sent = matching

Exemplo n.º 14

0

Exibir arquivo

import nltk
from nltk import word_tokenize
from nltk import re
from nltk.book import *

#-----------------------------Ahora con moby dick

# ----------------------------- Moby Dick (text1)

moby_tokens = text1.tokens
# Lowercase the tokens and keep only those containing a word character.
# The pattern is a raw string because "\w" is not a valid string escape.
moby_tokens_sin_puntos = [
    palabra.lower() for palabra in moby_tokens if re.search(r"\w", palabra)
]
# Number of tokens in Moby Dick (punctuation-only tokens excluded)
nro_tokens_sin_puntos = len(moby_tokens_sin_puntos)
print("1) Numero de tokens en Moby Dick: ", nro_tokens_sin_puntos)
# Number of unique words, i.e. the number of TYPES
nro_types = len(set(moby_tokens_sin_puntos))
print("2) Numero de TYPES en Moby Dick: ", nro_types)
# Type-token ratio of Moby Dick
moby_type_token_ratio = nro_types / nro_tokens_sin_puntos
print("3) Type token ratio de Moby Dick: ", moby_type_token_ratio)

# ---------------------------- WSJ (text7), same measurements

wsj_tokens = text7.tokens
wsj_tokens_sin_puntos = [
    palabra.lower() for palabra in wsj_tokens if re.search(r"\w", palabra)
]
nro_tokens_wsj_sin_puntos = len(wsj_tokens_sin_puntos)
nro_types_wsj = len(set(wsj_tokens_sin_puntos))

wsj_type_token_ratio = nro_types_wsj / nro_tokens_wsj_sin_puntos

Exemplo n.º 15

0

Exibir arquivo

# Importing packages for tokenisation and lemmatization from nltk

import nltk

nltk.download('punkt')
from nltk import word_tokenize
from nltk import re
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

tokens = word_tokenize(text)  # tokenizing the obtained text

# Drop punctuation-only tokens: keep a token only if it contains at least one
# word character.  Raw string, since "\w" is not a valid string escape.
tokens_nopunct = [word for word in tokens if re.search(r"\w", word)]
tokens_updated = tokens_nopunct  # Updated list of tokens

print("********************\n")
print("*** Lowercasing ***\n")
print("The number of tokens before lowercase:", len(tokens_updated))
print("The number of types before lowercase", len(set(tokens_updated)))
print("\n")

# Converting all tokens to lowercase merges types that differ only by case.
tokens_lower = [x.lower() for x in tokens_updated]
tokens_updated = tokens_lower

print("The number of tokens after lowercase:", len(tokens_updated))
print("The number of types after lowercase", len(set(tokens_updated)))
print("\n")

Exemplo n.º 16

0

Exibir arquivo

Arquivo: ParserWithStemming.py Projeto: hallelhel/Search_Engine

    def text_operation(self, text):
        """
        Walk a list of tokens, normalise each one according to its kind and stem it.

        Per token, in order: URLs and blank tokens are skipped, trailing
        punctuation/non-ASCII characters are stripped (a trailing '%' is left
        alone), and the term is routed to the first matching case: hashtag,
        "covid"/"corona" prefix, number/date/percentage, capitalised pair
        (entity), month followed by a number, or plain term. Plain terms are
        stemmed with ``self.stemmer`` before being appended.

        Note: terms whose leading punctuation was stripped are written back
        into ``text``, so the caller's list is modified in place.

        :param text: list of string tokens.
        :return: list of parsed tokens.
        """
        len_text = len(text)
        tokenAfterParse = []
        counter = -1  # index of the current term in text
        for term in text:
            counter = counter + 1
            # self.per / self.per2 are skip flags: per skips the current token,
            # per2 skips the current token and arms per to skip the next one.
            # per is set below after merging "<number> percent"; per2 is not set
            # in this method (presumably by a helper -- verify).
            if self.per == True:
                self.per = False
                continue
            if self.per2 == True:
                self.per = True
                self.per2 = False
                continue
            # Blank tokens and anything containing "http" are dropped.
            if term == " " or term == '' or "http" in term:
                continue

            if term[-1] in string.punctuation or ord(term[-1]) < 48 or ord(
                    term[-1]) > 127:  # to remove anything that is not a word or number in the end of the word
                # A term ending in '%' is kept as is; otherwise strip the tail
                # until it ends with an acceptable character.
                if term[-1] != '%':
                    while term[-1] in string.punctuation or ord(term[-1]) < 48 or ord(term[-1]) > 127:
                        term = term[:-1]
                        if term == "":
                            break
                if term == "":
                    continue
                # text[counter] = term FIXME happen in line 160

            # hashtag & tags cases:
            if term[0] in string.punctuation or ord(term[0]) > 127:
                if term[0] == '#' and len(term) > 2:
                    # if len(term) == 2:
                    #     continue
                    words = self.hashtag_tokenize(
                        term[1:])  # this func split the words and add the original hashtag with lower case to words
                    tokenAfterParse.extend(words)
                    continue
                elif term[0] != '@':
                    # Strip leading punctuation, but leave '@' mentions untouched.
                    # Only punctuation is stripped here; a leading non-ASCII
                    # character enters this branch but is not removed by the loop.
                    while term[0] in string.punctuation:
                        term = term[1:]
                        if len(term) < 2:
                            break
                    if term == "":
                        continue
                    text[counter] = term

            # try to minimize the covid terms
            if term.startswith('covid') or term.startswith('Covid') or term.startswith('COVID'):
                tokenAfterParse.append('covid19')
                continue

            # same normalisation for the corona terms
            if term.startswith('corona') or term.startswith('Corona') or term.startswith('CORONA'):
                tokenAfterParse.append('corona')
                continue

            # clean_word is defined elsewhere; a list result means the term is
            # dropped (presumably "nothing usable left" -- verify).
            term = self.clean_word(term)
            if isinstance(term, list):
                continue

            # number cases - dates/percentage:
            # Applies to terms that start with a digit and contain no ASCII letter.
            if (term.isdigit() or term[0].isdigit()) and not (re.search('[a-zA-Z]', term)):
                if counter + 1 < len_text and term.isdigit():
                    if text[counter + 1] in self.month_dict:  # Date
                        tokenAfterParse.append(self.Date_Toknize(term, text, counter, len_text))
                        continue
                    if text[counter + 1] == "percent" or text[counter + 1] == "percentage" or text[
                        counter + 1] == "Percent" or text[counter + 1] == "Percentage":  # %
                        # Merge "<number> percent" into one token and skip the
                        # "percent" token on the next iteration.
                        new_word = term + text[counter + 1]
                        tokenAfterParse.append(new_word)
                        self.per = True
                        continue
                new_number = self.numbeOpertion(term, text, counter, len_text)
                tokenAfterParse.append(new_number)
                continue

            # check entity
            if counter + 1 < len_text:
                # NOTE(review): text[counter + 1] is the raw, not yet cleaned,
                # next token; an empty string there would raise IndexError.
                if term[0].isupper() and text[counter + 1][0].isupper():  # words with big letter
                    name = self.entity(text, counter, len_text)
                    tokenAfterParse.append(name)
                    tokenAfterParse.append(term)
                    continue
                if term in self.month_dict and text[counter + 1].isdigit():
                    tokenAfterParse.append(self.Date_Toknize(term, text, counter, len_text))
                    continue
            # replace every number from one to ten to digits:
            # NOTE(review): this elif pairs with "if counter + 1 < len_text", so
            # the replacement only runs for the last token -- confirm intent.
            elif term in self.dict_numbers.keys():
                term = self.dict_numbers[term]

            term = self.stemmer.stem(term)
            tokenAfterParse.append(term)
        return tokenAfterParse

Exemplo n.º 17

0

Exibir arquivo

# Parse the page with BeautifulSoup into the variable s, then collect the text of its
# display elements (h1 headings, p paragraphs and h2 headings) into the string rel.

s = BeautifulSoup(rw, 'html.parser')
# Gather the text of all h1 elements, then all p elements, then all h2
# elements, and join the pieces once instead of growing the string with +=
# in three copy-pasted loops.
rel_parts = []
for tag_name in ('h1', 'p', 'h2'):
    for relevance in s.find_all(tag_name):
        rel_parts.append(relevance.text)
rel = "".join(rel_parts)

# rel_tokens_nopunct holds the tokens of rel that contain at least one word
# character, i.e. punctuation-only tokens are dropped.  Applying set() to it
# gives the unique types.  Raw string: "\w" is not a valid string escape.
rel_tokens_nopunct = [
    word for word in word_tokenize(rel) if re.search(r"\w", word)
]
print("The length of tokens from the url before Lemmatization are:",
      len(rel_tokens_nopunct))
print("\n\nThe length of types from the url before Lemmatization are:",
      len(set(rel_tokens_nopunct)))
print("\n\nThe tokens contained in the website:" + url +
      " before Lemmatization are:\n\n")
print(rel_tokens_nopunct)
print("\n\nThe types contained in the website:" + url +
      " before Lemmatization are:\n\n")
print(set(rel_tokens_nopunct))

# Lemmatize every token, treating it as a verb (pos='v'), and store the result in l.
lem = WordNetLemmatizer()
l = [lem.lemmatize(i, pos='v') for i in rel_tokens_nopunct]

Exemplo n.º 18

0

Exibir arquivo

def str2token(string_data):
    """Return the lowercased tokens of ``string_data`` that contain a word character.

    The text is split with ``word_tokenize``; each token is lowercased and
    kept only if it has at least one letter, digit or underscore, so
    punctuation-only tokens are dropped.
    """
    lowercased_tokens = (token.lower() for token in word_tokenize(string_data))
    # Raw string: "\w" in a normal literal is an invalid string escape.
    return [word for word in lowercased_tokens if re.search(r"\w", word)]

Exemplo n.º 19

0

Exibir arquivo

Arquivo: TokensAndTagging.py Projeto: AlexEbbage/UniversityWork

with untried nuclear technology in someone else's school. How exciting is that! I can't have been the only
one to feel a warm glow at the thought of so much radioactivity at the very heart of the school. Who
knows, by this time next year I could be the Two-head-master! (Finkelstein, D., you're on fire – as is the
boiler room!)
D.C.

"""

# Print the PrivateEye text.
print(text)

# Use NLTK to split the text into word tokens and into sentences, then
# lowercase the word tokens and drop those with no word character
# (punctuation-only tokens).  Raw string: "\w" is not a valid string escape.
text_tokens = word_tokenize(text)
text_sent = sent_tokenize(text)
text_tokens_nopunct = [
    word.lower() for word in text_tokens if re.search(r"\w", word)
]

# Use NLTKs POS tagger method to tag all the tokens.
pos_tagged_tokens = nltk.pos_tag(text_tokens_nopunct)

print("Training data and generating the token-tag table...\n")

# Creates trainer data from the first 10000 tagged sentences of the Brown corpus.
trainer_data = brown.tagged_sents()[:10000]

# Trains an HMM tagger on the trainer data; its tags are collected in
# hmm_tagged_tokens by the loop that follows.
hmm_trainer = nltk.hmm.HiddenMarkovModelTrainer()
hmm_tagger = hmm_trainer.train_supervised(trainer_data)
hmm_tagged_tokens = []
for s in text_sent: