Exemplo n.º 1
0
def returnNames(url):
    """Return the five most common PERSON names found in a downloaded text.

    The text is fetched from the hard-coded address below, its first
    ``partition`` characters are tokenized, POS-tagged and run through the
    named-entity chunker, and the capitalized tokens inside PERSON chunks
    are counted (honorifics excluded). The resulting names are also printed
    to stdout, tab-separated.

    NOTE(review): ``url`` is accepted but never used; the source address is
    hard-coded. ``partition``, ``tok``, ``postag``, ``ne`` and ``tree`` are
    expected to be provided by the enclosing module (presumably NLTK
    helpers and an integer slice limit -- confirm against the imports).

    Args:
        url: Ignored; kept so existing callers keep working.

    Returns:
        A ``(chars, source)`` tuple: ``chars`` is a list of at most five
        names ordered from most to least frequent, ``source`` is the full
        downloaded text (not only the analysed slice).
    """
    theurl = "https://raw.githubusercontent.com/robincamille/replacethechar/master/texts/biblekjv.txt"

    # Release the HTTP response even when reading it fails.
    sourcefile = urllib2.urlopen(theurl)
    try:
        source = sourcefile.read()
    finally:
        sourcefile.close()

    # Only the leading slice of the text is analysed.
    sourcetok = tok(source[:partition])
    sourcetag = postag(sourcetok)
    sourcene = ne(sourcetag, binary=False)

    # Collect the first element of every leaf inside a PERSON chunk.
    charsall = [
        m[0]
        for n in sourcene
        if isinstance(n, tree.Tree) and n.label() == 'PERSON'
        for m in n
    ]

    honorifics = frozenset([
        'Mr.', 'Mrs.', 'Ms.', 'Miss', 'Dr.', 'Prof.', 'Professor', 'Lord',
        'Lady', 'Sir', 'Madam', 'Dame', 'Rev.', 'Rabbi',
    ])

    # Keep capitalized tokens that are not honorifics, in encounter order.
    counted = (
        word for word in charsall
        if word not in honorifics and word[:1].isupper()
    )
    chars = [name for name, _ in Counter(counted).most_common(5)]

    # Single-argument parenthesized print behaves the same on Python 2 and 3.
    print('\nMost common names:')
    print('\t'.join(chars))
    return chars, source
Exemplo n.º 2
0
from collections import Counter

# Read the whole input file; the with-block closes it even if read() fails.
with open('data_columns/nussbaum/nuss01.txt',
          'r') as infile:  # Put your filename here
    source = infile.read()
source = source.decode('utf-8')

# Single-argument parenthesized print behaves the same on Python 2 and 3.
print('Tokenizing')
sourcetok = tok(source)

print('Tagging Part Of Speech (POS)...')
sourcetag = postag(sourcetok)

print('Running POS-tagged text through Named Entity chunker...')
sourcene = ne(sourcetag, binary=False)

# Find just the Named Entities that we want
charsall = []
for n in sourcene:
    if not isinstance(n, tree.Tree):
        continue
    # NLTK 3 exposes the chunk label through label(); NLTK 2 only had the
    # .node attribute, so fall back to it when label() is unavailable.
    label = n.label() if hasattr(n, 'label') else n.node
    if label == 'PERSON':  # Options: PERSON, ORGANIZATION, LOCATION
        # Each leaf of the chunk is indexed with [0] to get the token.
        charsall.extend(m[0] for m in n)

# Exclude honorifics
honorifics = [
    'Mr.', 'Mrs.', 'Ms.', 'Miss', 'Dr.', 'Prof.', 'Professor', 'Lord', 'Lady',
    'Sir', 'Madam', 'Dame', 'Rev.', 'Rabbi'
Exemplo n.º 3
0
from nltk.corpus import gutenberg as gb
from collections import Counter

# Ask for the text location and download it, releasing the HTTP response
# even when reading it fails.
theurl = raw_input("URL to .txt file: ")
sourcefile = urllib2.urlopen(theurl)
try:
    source = sourcefile.read()
finally:
    sourcefile.close()


# Tokenize
sourcetok = tok(source)

# Tag POS
sourcetag = postag(sourcetok)

# Run the POS-tagged text through the named-entity chunker
sourcene = ne(sourcetag, binary=False)

# Collect the first element of every leaf inside a PERSON chunk.
charsall = []
for n in sourcene:
    if isinstance(n, tree.Tree) and n.label() == "PERSON":
        charsall.extend(m[0] for m in n)

honorifics = [
    "Mr.",
    "Mrs.",
    "Ms.",
    "Miss",
    "Dr.",
    "Prof.",
def returnNames(url):
    """Return the five most common PERSON names found in a downloaded text.

    The text is fetched from the hard-coded address below, its first
    ``partition`` characters are tokenized, POS-tagged and run through the
    named-entity chunker, and the capitalized tokens inside PERSON chunks
    are counted. Honorifics and the boilerplate words "Version" and
    "Gutenberg" are excluded from the count.

    NOTE(review): ``url`` is accepted but never used; the source address is
    hard-coded. ``partition``, ``tok``, ``postag``, ``ne`` and ``tree`` are
    expected to be provided by the enclosing module (presumably NLTK
    helpers and an integer slice limit -- confirm against the imports).

    Args:
        url: Ignored; kept so existing callers keep working.

    Returns:
        A ``(chars, source)`` tuple: ``chars`` is a list of at most five
        names ordered from most to least frequent, ``source`` is the full
        downloaded text (not only the analysed slice).
    """
    theurl = "http://www.ccel.org/ccel/bible/kjv.txt"

    # Release the HTTP response even when reading it fails.
    sourcefile = urllib2.urlopen(theurl)
    try:
        source = sourcefile.read()
    finally:
        sourcefile.close()

    # Only the leading slice of the text is analysed.
    sourcetok = tok(source[:partition])
    sourcetag = postag(sourcetok)
    sourcene = ne(sourcetag, binary=False)

    # Collect the first element of every leaf inside a PERSON chunk.
    charsall = [
        m[0]
        for n in sourcene
        if isinstance(n, tree.Tree) and n.label() == "PERSON"
        for m in n
    ]

    # Tokens that the chunker tags as PERSON but that are not names.
    honorifics = frozenset([
        "Mr.",
        "Mrs.",
        "Ms.",
        "Miss",
        "Dr.",
        "Prof.",
        "Professor",
        "Lord",
        "Lady",
        "Sir",
        "Madam",
        "Dame",
        "Rev.",
        "Rabbi",
        "Version",
        "Gutenberg",
    ])

    # Keep capitalized tokens that are not excluded, in encounter order.
    counted = (
        word for word in charsall
        if word not in honorifics and word[:1].isupper()
    )
    chars = [name for name, _ in Counter(counted).most_common(5)]

    return chars, source