def returnNames(url):
    """Return the five most common PERSON tokens in a remote text file.

    The text at ``url`` is downloaded, its leading ``partition`` characters
    are tokenized, POS-tagged and named-entity chunked, and every token
    inside a chunk labelled ``PERSON`` is counted. Tokens that are
    honorifics or that do not start with an uppercase letter are ignored.

    Args:
        url: Location of a plain-text file. When empty or ``None`` the
            bundled King James Bible URL is used instead. (Previously this
            argument was ignored and the default was always fetched.)

    Returns:
        A ``(chars, source)`` tuple: ``chars`` is a list of up to five
        tokens ordered from most to least frequent, and ``source`` is the
        complete downloaded text (not only the analysed slice).

    Side effects:
        Prints the most common names to stdout, tab-separated.

    Note:
        ``tok``, ``postag``, ``ne``, ``tree`` and ``partition`` are
        module-level names that are not defined in this block; presumably
        they are NLTK's tokenizer, POS tagger, NE chunker and tree module,
        and an integer character limit -- confirm against the file header.
    """
    default_url = "https://raw.githubusercontent.com/robincamille/replacethechar/master/texts/biblekjv.txt"
    theurl = url or default_url

    # Close the HTTP response even if read() raises; urllib2 responses are
    # not context managers, hence try/finally rather than ``with``.
    sourcefile = urllib2.urlopen(theurl)
    try:
        source = sourcefile.read()
    finally:
        sourcefile.close()

    # Only the first ``partition`` characters go through the NLP pipeline.
    sourcetok = tok(source[:partition])
    sourcetag = postag(sourcetok)
    sourcene = ne(sourcetag, binary=False)

    honorifics = frozenset((
        'Mr.', 'Mrs.', 'Ms.', 'Miss', 'Dr.', 'Prof.', 'Professor',
        'Lord', 'Lady', 'Sir', 'Madam', 'Dame', 'Rev.', 'Rabbi',
    ))

    counts = Counter()
    for node in sourcene:
        # Plain (untagged-entity) items are not Tree instances; skip them
        # together with entities of any type other than PERSON.
        if not (isinstance(node, tree.Tree) and node.label() == 'PERSON'):
            continue
        for leaf in node:
            word = leaf[0]  # presumably leaf is a (token, POS tag) pair
            if word not in honorifics and word[:1].isupper():
                counts[word] += 1

    chars = [name for name, _count in counts.most_common(5)]

    # Single-argument parenthesised form prints the same on Python 2 and 3.
    print('\nMost common names:')
    print('\t'.join(chars))
    return chars, source
from collections import Counter infile = open('data_columns/nussbaum/nuss01.txt', 'r') # Put your filename here source = infile.read() source = source.decode('utf-8') infile.close() print 'Tokenizing' sourcetok = tok(source) print 'Tagging Part Of Speech (POS)...' sourcetag = postag(sourcetok) print 'Running POS-tagged text through Named Entity chunker...' sourcene = ne(sourcetag, binary=False) # Find just the Named Entities that we want charsall = [] for n in sourcene: if type(n) == tree.Tree: #if n.label() == 'PERSON': if n.node == 'PERSON': #Options: PERSON, ORGANIZATION, LOCATION for m in n: charsall.append(m[0]) # Exclude honorifics honorifics = [ 'Mr.', 'Mrs.', 'Ms.', 'Miss', 'Dr.', 'Prof.', 'Professor', 'Lord', 'Lady', 'Sir', 'Madam', 'Dame', 'Rev.', 'Rabbi'
from nltk.corpus import gutenberg as gb from collections import Counter theurl = raw_input("URL to .txt file: ") sourcefile = urllib2.urlopen(theurl) source = sourcefile.read() # Tokenize sourcetok = tok(source) # Tag POS sourcetag = postag(sourcetok) # Outputs POS-tagged text sourcene = ne(sourcetag, binary=False) charsall = [] for n in sourcene: if type(n) == tree.Tree: if n.label() == "PERSON": for m in n: charsall.append(m[0]) honorifics = [ "Mr.", "Mrs.", "Ms.", "Miss", "Dr.", "Prof.",
def returnNames(url):
    """Return the five most common PERSON tokens in a remote text file.

    The text at ``url`` is downloaded, its leading ``partition`` characters
    are tokenized, POS-tagged and named-entity chunked, and every token
    inside a chunk labelled ``PERSON`` is counted. Tokens on the exclusion
    list or that do not start with an uppercase letter are ignored.

    Args:
        url: Location of a plain-text file. When empty or ``None`` the
            default King James Bible URL is used instead. (Previously this
            argument was ignored and the default was always fetched.)

    Returns:
        A ``(chars, source)`` tuple: ``chars`` is a list of up to five
        tokens ordered from most to least frequent, and ``source`` is the
        complete downloaded text (not only the analysed slice).

    Note:
        ``tok``, ``postag``, ``ne``, ``tree`` and ``partition`` are
        module-level names that are not defined in this block; presumably
        they are NLTK's tokenizer, POS tagger, NE chunker and tree module,
        and an integer character limit -- confirm against the file header.
    """
    default_url = "http://www.ccel.org/ccel/bible/kjv.txt"
    theurl = url or default_url

    # Close the HTTP response even if read() raises; urllib2 responses are
    # not context managers, hence try/finally rather than ``with``.
    sourcefile = urllib2.urlopen(theurl)
    try:
        source = sourcefile.read()
    finally:
        sourcefile.close()

    # Only the first ``partition`` characters go through the NLP pipeline.
    sourcetok = tok(source[:partition])
    sourcetag = postag(sourcetok)
    sourcene = ne(sourcetag, binary=False)

    # Tokens that get tagged as PERSON but should not count as names:
    # honorifics plus two words the original list excluded explicitly.
    excluded = frozenset((
        "Mr.", "Mrs.", "Ms.", "Miss", "Dr.", "Prof.", "Professor",
        "Lord", "Lady", "Sir", "Madam", "Dame", "Rev.", "Rabbi",
        "Version", "Gutenberg",
    ))

    counts = Counter()
    for node in sourcene:
        # Plain (untagged-entity) items are not Tree instances; skip them
        # together with entities of any type other than PERSON.
        if not (isinstance(node, tree.Tree) and node.label() == "PERSON"):
            continue
        for leaf in node:
            word = leaf[0]  # presumably leaf is a (token, POS tag) pair
            if word not in excluded and word[:1].isupper():
                counts[word] += 1

    chars = [name for name, _count in counts.most_common(5)]
    return chars, source