示例#1
0
# Count the runs of whitespace in the text (result is discarded, as in a
# notebook cell). Raw string avoids the invalid "\s" escape warning.
len(re.findall(r'\s+', text))

# Chapter I: read the whole chapter file into memory.
# NOTE(review): no encoding is given, so the platform default is used --
# confirm the file's encoding and pass it explicitly.
with open("bases/Iracema-jose-de-alencar-Cap1.txt") as c1:
    Cap1 = c1.read()

# presumably strips diacritics from the text -- helper is defined elsewhere
Cap1 = remover_acentos(Cap1)

# Chapter I with every whitespace character removed
Cap1SE = re.sub(r'\s', '', Cap1)

# N-grams (training?)
allMyWords = Cap1.split()
Cap1_nGram = obo.getNGrams(allMyWords, 3)  # trigrams
# Print the trigrams computed above instead of building them a second time.
print(Cap1_nGram)

# Usando Markov
import random


class MarkovChain:
    """Accumulate, for each key, the list of values recorded for it."""

    def __init__(self):
        # Maps a key to the values learned for it, in the order they were
        # learned; repeated values are kept.
        self.memory = {}

    def _learn_key(self, key, value):
        """Record ``value`` as one more entry under ``key``."""
        self.memory.setdefault(key, []).append(value)
示例#2
0
# Sliding window: every contiguous length-n slice of `sent`.
# NOTE(review): `sent` and `n` are not defined in the visible code, and the
# resulting list is not assigned to anything -- confirm where this belongs.
[sent[i:i+n] for i in range(len(sent)-n+1)]

##########
# N-Gram #
##########
# http://www.ling.helsinki.fi/kit/2008s/clt231/nltk-0.9.5/doc/en/book.html#n_gram_tagger_index_term
# http://tetration.xyz/Ngram-Tutorial/

# https://programminghistorian.org/lessons/keywords-in-context-using-n-grams
import obo

# Demo: split a sample sentence into words, print its trigrams, then print
# each word alongside the number of times it occurs.
wordstring = 'it was the best of times it was the worst of times '
wordstring += 'it was the age of wisdom it was the age of foolishness'

allMyWords = wordstring.split()
print(obo.getNGrams(allMyWords, 3))

# Details: wordfreq[i] is how many times allMyWords[i] occurs in the list
# (one entry per word position, so repeated words repeat their count).
wordfreq = [allMyWords.count(w) for w in allMyWords]

print("-> DETALHES <-")
print("String\n" + wordstring + "\n")
print("List\n" + str(allMyWords) + "\n")
# Show the counts here; the word list was already printed above.
print("Frequencies\n" + str(wordfreq) + "\n")
# zip() is lazy in Python 3, so materialise it to print the actual pairs
# rather than the zip object's repr.
print("Pairs\n" + str(list(zip(allMyWords, wordfreq))))

#############################
# Separa e junta caracteres #
#############################
示例#3
0
# html-to-kwic.py

import obo

# Build a keyword-in-context (KWIC) dictionary from the n-grams of a web page
# and write the entries for one target keyword out as HTML.
n = 7
url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33'

text = obo.webPageToText(url)
# Pad both ends with n//2 '#' tokens -- presumably so that words near the
# start and end of the text can still sit in the middle of an n-gram.
fullwordlist = ('# ' * (n//2)).split()
fullwordlist += obo.stripNonAlphaNum(text)
fullwordlist += ('# ' * (n//2)).split()
ngrams = obo.getNGrams(fullwordlist, n)
worddict = obo.nGramsToKWICDict(ngrams)

# Output the KWIC lines for the target word, wrapped in a <pre> block.
target = 'black'
outstr = '<pre>'
# dict.has_key() no longer exists in Python 3; use the `in` operator.
if target in worddict:
    outstr += ''.join(
        obo.prettyPrintKWIC(k) + '<br />' for k in worddict[target]
    )
else:
    outstr += 'Keyword not found in source'

outstr += '</pre>'
obo.wrapStringInHTMLMac('html-to-kwic', url, outstr)
示例#4
0
#get-keywords.py

import obo

# Demo: build 5-grams from a short sentence and print the KWIC dictionary
# that obo derives from them.
test = 'this test sentence has eight words in it'
ngrams = obo.getNGrams(test.split(), 5)
kwic_dict = obo.nGramsToKWICDict(ngrams)

print(kwic_dict)