Example #1
def extract_entities2(text):
	entities = []
	
	"""t0 = nltk.DefaultTagger('NN')
	t1 = nltk.UnigramTagger(train_sents, backoff=t0)
	t2 = nltk.BigramTagger(train_sents, backoff=t1)
	t2.evaluate(test_sents)"""
	
	for sentence in sent_tokenize(text):
	    #print pos_tag(nltk.word_tokenize(sentence))
	    print sentence
	    tags=pos_tag(nltk.word_tokenize(sentence))
	    tags=tagear(tags)
	    #chunks = ne_chunk(pos_tag(nltk.word_tokenize(sentence)))  # redundant: overwritten by the line below
	    #chunks = ne_chunk(regexp_tagger.tag((nltk.word_tokenize(text))))
	    chunks = ne_chunk(tags)
	    #chunks.draw()
	    #print chunks
	    for chunk in chunks:
	    	#print chunk
	    	if hasattr(chunk, 'node'):
	    		print chunk
	    		entities.append(chunk)
	return entities
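Note: the hasattr(chunk, 'node') test above only works on older NLTK releases; NLTK 3.x replaced the node attribute with a label() method. A minimal, version-independent sketch of the same loop (function and variable names are illustrative, and the tagear re-tagging step is omitted):

import nltk
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tokenize import sent_tokenize

def extract_entities_v3(text):
    # Collect every named-entity subtree; entity chunks are nltk.Tree nodes,
    # plain tokens are (word, tag) tuples.
    entities = []
    for sentence in sent_tokenize(text):
        for chunk in ne_chunk(pos_tag(word_tokenize(sentence))):
            if isinstance(chunk, nltk.Tree):
                entities.append(chunk)
    return entities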
    def test_nltkNERParsing(self):
        testString = 'Natural Sciences and Engineering Research Council of Canada'
        unigrams = TokenizeOnWhitespacePunctuation(testString, keepCaps=True).getUnigrams()
        posTagged = nltk.pos_tag(unigrams)
        chunked = nltk.ne_chunk(posTagged)
        getGPEs = []

        for treeBranch in chunked:
            if hasattr(treeBranch, 'label') and treeBranch.label() == 'GPE':
                getGPEs.append(str(treeBranch))

        self.assertEqual(1, len(getGPEs))

        testString = 'Milwaukee Foundation'
        unigrams = TokenizeOnWhitespacePunctuation(testString, keepCaps=True).getUnigrams()
        posTagged = nltk.pos_tag(unigrams)
        chunked = nltk.ne_chunk(posTagged)
        # returns (S (PERSON Milwaukee/NNP) (ORGANIZATION Foundation/NNP))

        testString = 'New England Board of Higher Education'
        unigrams = TokenizeOnWhitespacePunctuation(testString, keepCaps=True).getUnigrams()
        posTagged = nltk.pos_tag(unigrams)
        chunked = nltk.ne_chunk(posTagged)
        # returns (S (GPE New/NNP)(ORGANIZATION England/NNP Board/NNP) of/IN (PERSON Higher/NNP Education/NNP))

        testString = 'New England Board of Higher Education'
        unigrams = TokenizeOnWhitespacePunctuation(testString).getUnigrams()
        posTagged = nltk.pos_tag(unigrams)
        chunked = nltk.ne_chunk(posTagged)
Example #3
def nameEntityExtract(document):
	sentences = nltk.sent_tokenize(document)
	sentences = [nltk.word_tokenize(sent) for sent in sentences]
	sentences = [nltk.pos_tag(sent) for sent in sentences]
	print sentences[0]
	print "the length of sentences is: " + str(len(sentences))
	sent = sentences[0]
	print nltk.ne_chunk(sent,binary=True)
def English_NER(sentence):
    # named entities are only tagged as NE
    print 'Named entities tagged only as NE:'
    print nltk.ne_chunk(sentence, binary=True)

    # named entities get type labels such as PERSON, ORGANIZATION, GPE
    print 'Named entities with type labels such as PERSON, ORGANIZATION, GPE:'
    print nltk.ne_chunk(sentence)
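English_NER expects a sentence that has already been tokenized and POS-tagged; a short usage sketch (the example sentence is arbitrary):

from nltk import pos_tag, word_tokenize

tagged = pos_tag(word_tokenize("Mark works at Google in California"))
English_NER(tagged)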
Example #5
File: lookup.py Project: nytlabs/linguo
def extractNE(sentence, withClass):
    words = nltk.word_tokenize(sentence)  # tokenize the sentence into words
    if withClass:
        tree = nltk.ne_chunk(nltk.pos_tag(words), binary=False)
        return extractNEwithClass(tree)
    else:
        tree = nltk.ne_chunk(nltk.pos_tag(words), binary=True)
        return extractNEwithoutClass(tree)
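The helpers extractNEwithClass and extractNEwithoutClass are not included in this snippet; a plausible sketch of what they do, walking the chunk tree returned by ne_chunk (an assumption, not the project's actual code):

import nltk

def extractNEwithClass(tree):
    # return (entity text, label) pairs for every entity subtree
    return [(' '.join(word for word, tag in subtree.leaves()), subtree.label())
            for subtree in tree if isinstance(subtree, nltk.Tree)]

def extractNEwithoutClass(tree):
    # with binary=True every entity subtree is labelled 'NE'; keep only the text
    return [' '.join(word for word, tag in subtree.leaves())
            for subtree in tree if isinstance(subtree, nltk.Tree)]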
def main():
    sent = nltk.corpus.treebank.tagged_sents()[22]
    print "sent (nltk):", sent
    #print nltk.ne_chunk(sent, binary=True)
    #print nltk.ne_chunk(sent)

    sent = ie_preprocess("""Injured personnel consisting of six Schlum employees were immediately transported
                        to nearby hospitals and most of them (were)
                        discharged after having received treatment""")
    print sent
    print nltk.ne_chunk(sent[0])
def process_contents():
    try:
        for i in tokenized[5:]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt = nltk.ne_chunk(tagged) #White, House
            namedEnt = nltk.ne_chunk(tagged, binary = True) #White House
            namedEnt.draw()

    except Exception as e:
        print(str(e))
def process_content():
    try:
        for i in tokenized:
            words = nltk.word_tokenize(i);
            tagged = nltk.pos_tag(words)

            namedEnt1 = nltk.ne_chunk(tagged) #Give all named entities with category
            namedEnt2 = nltk.ne_chunk(tagged, binary=True) #This gives named entity without category

            namedEnt2.draw()

    except Exception as e:
        print(str(e))
    def entity_names(self, tuple_list = None):

        if tuple_list is None:
            tuple_list = self.updated_element        
        
        # Recognize the names of the entities contained in the string
        tree = nltk.ne_chunk(tuple_list, binary=False)
        
        # Replace each recognized entity with its entity label. To decide
        # whether an element is a real entity node, check its label: if it
        # is different from 'S', it was a valid entity node.
        for el in tree:
            
            if type(el) == nltk.tree.Tree:
                # If it was an entity then the tag can be 
                # stored as a name
                self.entity_named.append((el.node, 'NNP'))
            else:
                self.entity_named.append(el)
        
        # Print
        # print 'Named entities', self.entity_named
        
        # I update the element
        self.updated_element = self.entity_named
        
        return self.updated_element
Example #10
def get_xmen_text(soup):
    
    #en_stopwords = set(nltk.corpus.stopwords.words('english'))
    raw = nltk.clean_html(str(soup))
    raw_trunc = raw[:raw.rfind('References')]
    sents = nltk.sent_tokenize(raw_trunc)
    words = [nltk.word_tokenize(sent) for sent in sents]
    poss = [nltk.pos_tag(word) for word in words]
    #nes = [nltk.ne_chunk(pos, binary=True) for pos in poss]
    #for pos in poss: print pos
    poss_filter = [filter_insignificant(pos, tag_suffixes=['DT']) for pos in poss]
    print poss_filter
    nes = [nltk.ne_chunk(pos, binary=True) for pos in poss_filter]
    
    def sub_leaves(tree, node):
        return [t.leaves() for t in tree.subtrees (lambda s: s.node == node)]
    
    people = [sub_leaves(ne, 'NE') for ne in nes]
    people = [item for sublist in people
              for subsublist in sublist
              for subsubsublist in subsublist
              for item in subsubsublist
              if item not in ('NNP', 'NN', 'NNPS', 'JJ')]
    people = merge_people(people)
    fd = nltk.FreqDist(person for person in people if person!='Magneto')
    fd.plot(50)
Example #11
def ne_tag(sentences):
    tagged = raw_trigram_tag(sentences, tagger_file="tagger.pkl")[1]
    fin = []
    for tagged_sent in tagged:
        # print tagged_sent
        fin.append(nltk.ne_chunk(tagged_sent))
    return fin
Example #12
def extract_ent():
    
    data_dir = "/Users/Brishti/Documents/Internships/scripts/"
    inputfile = open(data_dir + 'output3.txt', 'r')
    # outputfile = open(data_dir + 'entity.txt', 'w')
    
    for line in inputfile:
        # print("Looking at: " + line)
        if re.match("^\s*$", line):
            continue
        line = line.split("|")
        # print("Length is: " + str(len(line)))
        # print line[2]
    
        
        for sent in nltk.sent_tokenize(line[2]):
            print("______")
            # print sent
            for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
                # print nltk.pos_tag(nltk.word_tokenize(sent))
                print chunk
                #if hasattr(chunk, 'label') and chunk.label() == "PERSON":
                    # print chunk.leaves()
                    #print(line[0] + '|' +' '.join(c[0] for c in chunk.leaves())+'\n')
                    # outputfile.write(line[0] + '|' +' '.join(c[0] for c in chunk.leaves())+'\n')

    inputfile.close()
def processor(data):
    try:
        tokenized = nltk.word_tokenize(data)
        tagged = nltk.pos_tag(tokenized)
        namedEnt = nltk.ne_chunk(tagged, binary=True)

        entities = re.findall(r'NE\s(.*?)/', str(namedEnt))
        # tagged tuples look like ('word', 'JJ')
        descriptives = re.findall(r'\(\'(\w*)\',\s\'JJ\w?\'', str(tagged))
        if len(entities) > 1:
            pass
        elif len(entities) == 0:
            pass
        elif entities[0] == '_blank':
            pass
        else:
            print 'Named: ', entities[0]
            print 'Description: '
            for eachDesc in descriptives:
                print eachDesc
                currentTime = time.time()
                dateStamp = datetime.datetime.fromtimestamp(currentTime).strftime('%Y-%m-%d %H:%M:%S')
                namedEntity = entities[0]
                relatedWord = eachDesc
                c.execute("INSERT INTO knowledgeBase (unix, dateStamp, namedEntity, relatedWord) VALUES (?,?,?,?)",
                          (currentTime, dateStamp, namedEntity, relatedWord))

                conn.commit()


                

    except Exception, e:
        print 'failed in the first try of processor'
        print str(e)
Example #14
File: ner.py Project: aregee/scrappy
    def ne_chunk(self, tweet_id, tweet_text):

        sent_ner = []
        ner_tweets = {}
        sents = nltk.sent_tokenize(tweet_text)

        for text in sents:
            text = nltk.word_tokenize(text)
            text = nltk.pos_tag(text)
            text = nltk.ne_chunk(text)


            ner_list = self.getNodes(text, [])
            if ner_list:
                sent_ner.extend(ner_list)

            #ner_list = self.get_entity_value(text)
            #sent_ner.extend(ner_list)

            for each_ner in ner_list:
                if each_ner[1] in ner_tweets and tweet_id not in ner_tweets[each_ner[1]]:
                    ner_tweets[each_ner[1]].append(tweet_id)
                else:
                    ner_tweets[each_ner[1]] = [tweet_id]
        return sent_ner, ner_tweets
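self.getNodes is defined elsewhere in the project; judging from how its result is used above (each_ner[1] is treated as the entity text), it probably resembles the following sketch (hypothetical):

import nltk

def getNodes(self, tree, ner_list):
    # method sketch: walk the chunk tree and collect (label, entity text) pairs
    for node in tree:
        if isinstance(node, nltk.Tree):
            entity = ' '.join(word for word, tag in node.leaves())
            ner_list.append((node.label(), entity))
    return ner_list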
	def recuperarEntidadesEn(texto):
		ObjTag = Tokenizar()
		ObjDes = Desambiguar()
		Lista = []
		Lista2= []
		for sentence in sent_tokenize(texto):
			#print sentence  
			tags=ObjTag.tagear(sentence)
			#tags=tagear(traducir(word_tokenize(sentence)))
			print tags
			parsed = ne_chunk(tags)
			print parsed
			for chunk in parsed:
				#print chunk
				#if hasattr(chunk, 'node'):
				#	print chunk.node
				if hasattr(chunk, 'node'):
					#print chunk	
					#print chunk.leaves()
					Lista2.append(chunk.leaves()[0])
					#print ' '.join(c[0] for c in chunk.leaves())
					Lista.append (' '.join(c[0] for c in chunk.leaves()))
			print Lista2
			print ObjDes.DesambiguarTexto(Lista2, sentence)
			Lista2=[]
		return Lista
Example #16
def process_content():
    for i in custom_tokenized[5:]:
        words = word_tokenize(i)
        tagged = nltk.pos_tag(words)
        namedEnt = nltk.ne_chunk(tagged);

        print(namedEnt)
def processor(data):
    namedEntArray = []
    try:
        tokenized = nltk.word_tokenize(data)
        tagged = nltk.pos_tag(tokenized)
        namedEnt = nltk.ne_chunk(tagged, binary=True)

        entities = re.findall(r'NE\s(.*?)/',str(namedEnt))
        #('not', 'RB')
        descriptives = re.findall(r'\(\'(\w*)\',\s\'JJ\w?\'', str(tagged))
        if len(entities) > 1:
            pass
        elif len(entities) == 0:
            pass
        else:
            print '_________________________'
            print 'Named:',entities[0]
            print 'Descriptions:'
            for eachDesc in descriptives:
                print eachDesc
            

    except Exception, e:
        print 'failed in the main try of processor'
        print str(e)
        time.sleep(555)
Example #18
File: Parser.py Project: jcccf/cs4740
def parse_questions():
  print "Parsing Questions..."
  parsed_questions = {}
  with open(DIR+'/questions.txt', 'r') as f:
    data = f.read()
    questions = re.split('[\s]*</top>[\s]*', data)
    if len(questions[-1].strip()) == 0: questions.pop()
    qc = QuestionClassifier.QuestionClassifier()
    for question in questions:
      question_number = int(re.search(r"<num>[\s]*Number:[\s]*([0-9]+)", question).group(1))
      question = re.search(r"<desc>[\s]*Description:[\s]*([a-zA-Z0-9\-\?\'\. ]+)", question).group(1)
      question_words = nltk.word_tokenize(question)
      question_pos = nltk.pos_tag(question_words)
      question_nes = nltk.ne_chunk(question_pos)
      question_tree = Chunker.chunker.parse(question_pos)
      question_classification = qc.classify(question)
      qwords, nouns, nes = [], [], []
      for part in question_nes:
        try:
          nes.append((part.node, part.leaves()[0][0]))
        except:
          if part[1] == 'WP' or part[1] == 'WRB':
            qwords.append(part[0])
          elif part[1] == 'NN' or part[1] == 'NNP':
            nouns.append(part[0])
      # print qwords, nouns, nes
      # print question_pos
      parsed_questions[question_number] = { "question": question, "pos": question_pos, "ne": question_nes, "parse_tree": question_tree, "question_classification": question_classification, "question_words": qwords, "nouns": nouns, "ne_words": nes }
  with open(DIR+'/parsed_questions.txt', 'wb') as f:
    pickle.dump(parsed_questions, f)
Example #19
File: analyzer.py Project: gdamdam/sumo
	def get_entities(self,sentences):
		""" The function returns the dictionary containing the results for
		the Name Entity Recognition analyze.

		Args:
		   sentences: the sentences list.

		Returns:
			dictionary:
		"""
		entities = dict([])

		# Tokenization
		tokens = [nltk.tokenize.word_tokenize(s) for s in sentences]

		# Part-Of-Speech tagging
		pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]

		# Chunking
		chunked_nes = [nltk.ne_chunk(c) for c in pos_tagged_tokens]

		for tree in chunked_nes:
			for s in tree.subtrees(lambda t: (t.height()==2)):
				if s.label()!='S':
					entity = ' '.join(i[0] for i in s.leaves())
					if s.label() in entities.keys():
						if entity not in entities[s.label()]:
							entities[s.label()].append(entity)
							entities[s.label()].sort()
					else:	
						entities[s.label()] = [entity]

		return entities
Example #20
def get_NERs(path_to_seg):
    NER_dict = {} # map entities to counts (i.e., # of occurrences in this seg)
    NERs_to_types = {} # map the NERs to the kinds of things they are

    seg_text = open(path_to_seg).read()
    
    # strip *all* tags 
    seg_text = strip_tags(seg_text, get_tags_in_text(seg_text))

    # tokenize, then POS text
    pos_tagged_seg = nltk.pos_tag(nltk.word_tokenize(seg_text))

    # and now the NER
    NERd_seg = nltk.ne_chunk(pos_tagged_seg)

    # kind of hacky, but this is how I'm parsing
    # the induced tree structure
    for subtree in NERd_seg:
        # then this is an NER
        if type(subtree) == nltk.tree.Tree:
            # ignoring the *type* of NER for now -- i can't think of a
            # case in which we'd care (typically, entities with the same
            # name *ought* to be of the same type, I think...)
            entity = subtree[0][0] # this parses out the token (entity) itself
            entity_type = subtree.node
            # if we've already encountered it, just bump the count
            if entity in NER_dict:
                NER_dict[entity] += 1
            else:
                NER_dict[entity] = 1
                NERs_to_types[entity] = subtree.node ### going to assume we always get this correct, I guess
    
    return NER_dict, NERs_to_types
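strip_tags and get_tags_in_text are project-specific helpers that are not shown here; a rough, regex-based stand-in with the same interface (an assumption about their behaviour):

import re

def get_tags_in_text(text):
    # names of all XML/HTML-style tags appearing in the text
    return list(set(re.findall(r'</?\s*([A-Za-z][\w-]*)', text)))

def strip_tags(text, tags):
    # remove the listed tags but keep their inner text
    for tag in tags:
        text = re.sub(r'</?\s*%s\b[^>]*>' % re.escape(tag), '', text)
    return text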
def processor(data):
    try:
        tokenized = nltk.word_tokenize(data)
        tagged = nltk.pos_tag(tokenized)
        namedEnt = nltk.ne_chunk(tagged, binary=True)
        #print (namedEnt)
        #time.sleep(55)

        entities = re.findall(r'NE\s(.*?)/',str(namedEnt))
        descriptives_adj = re.findall(r'\(\'(\w*)\',\s\'JJ\w?\'',str(tagged))
        '''if len(entities) > 1:
            pass
        elif len(entities) == 0:
            pass
        else: '''
        print ('Sentence with POS-tagging : ')
        print (str(tagged))
        print ('-----------------------------------------------')
        print ('Named Entity of the Sentence : ',entities)
        print ('Descriptions : ')
        for desc in descriptives_adj:
          print (desc)

 
    except Exception as e:
        print ('Failed in the first loop of processor')  
        print (str(e))
Example #22
File: linkage.py Project: rchiba/HipTrip
 def extract_normal_ne(self, text):
     result = []
     for sent in sent_tokenize(text) if text else []:
         for chunk in ne_chunk(pos_tag(word_tokenize(sent))):
             if hasattr(chunk, "node"):
                 result.append(" ".join([c[0] for c in chunk.leaves()]))
     return result
Example #23
def extract_named_entities(request):
    """
    Uses the NLTK to extract named entities from a given text.
    """
    named_entities = []
    if request.GET:
        if 'text' not in request.GET:
            return HttpResponse('Please enter the text to analyze')
    else:
        return HttpResponse('Please enter the text to analyze')
    try:
        text = request.GET["text"]
        tokenized = nltk.word_tokenize(text)
        tagged = nltk.pos_tag(tokenized)
        result = nltk.ne_chunk(tagged)
        if len(result.productions()) > 1:
            for ne in result.productions()[1:]:
                name = ne.rhs()[0][0]
                pos_tag = ne.rhs()[0][1]
                inferred_type = ne.lhs().symbol()
                named_entities.append( {"name":name, "pos_tag":pos_tag, "guessed_type":inferred_type,} )
    except:
        return HttpResponse('Failed to extract named entities from text "%s": %s' % (text, str(sys.exc_info()[1])) )

    return HttpResponse(json.dumps(named_entities))
Example #24
def extract_concepts(text):
    """
    Uses the NLTK natural language processing library to 
    extract from a text the essential terms that appeared in it.
    """
    try:
        ignored_words = corpus.stopwords.words('english')
        ignored_words.append("n't")
        appeared = {}
        concepts = []
        tokenized = nltk.word_tokenize(text)
        tagged = nltk.pos_tag(tokenized)
        named_entities = nltk.ne_chunk(tagged)
        
        for ne in named_entities.leaves():
            #if ne[1] in ('NNS', 'NNP', 'NN'):
            if len(ne[0]) > 2 and ne[0].lower() not in ignored_words and not (ne[0].startswith("http") or ne[0].startswith("//")):
                name = ne[0]
                if name in appeared:
                    continue
                concepts.append(name)
                appeared[name] = True
    except:
        print "extract concepts failed:", sys.exc_info()
    return concepts
Example #25
def _getAnswer(self, text, extract_node): 
	try: 
		answer_list = []
		
		# To remove extra spaces and special characters from text
		text = re.sub(r'\W+\d+\s+.,\'"&', '', text) 
		
		#Start extraction process from the text
		# Sentence Tokenization
		for sent in nltk.sent_tokenize(text): 
		
			# Word Tokenization and pos tagging
			# Create chunks of the text which may have the answer
			for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
				
				#print chunk
				if hasattr(chunk, 'node'): 
					# Check if the chunks contain the needed node
					if chunk.node == extract_node:
						performer = ' '.join(c[0] for c in chunk.leaves()) 
						answer_list.append(performer)
						# Create a central result set
						result.append(performer)
		return answer_list 
	except: 
		print " ERROR: Couldn't perform named entity recognition on this text"
Example #26
 def word_tokenize(sent):
     nonlocal time_pos
     nonlocal time_chunk
     
     # replace typographic marks with simple marks
     sent = sent.replace('…', '...')
     sent = sent.replace('”', "''")
     sent = sent.replace('“', ',,')
     sent = sent.replace(',', ',')
     sent = sent.replace('’', "'")
     
     words = nltk.word_tokenize(sent)
     # strip punctuation from words
     words = [word.strip(string.punctuation) for word in words]
     words = [word for word in words if len(word) > 0]
     
     if not analyse_pos:
         return words
     else:
         start = time.time()
         tagged = tagger.tag(words)
         time_pos += (time.time() - start)
     
         if preserve_entities:
             start = time.time()
             chunks = nltk.ne_chunk(tagged, binary=ner_binary)
             time_chunk += (time.time() - start)
         
             word_list = []
             ne_concat(chunks, word_list)
             return word_list
         else:
             return [nltk.tuple2str(t) for t in tagged]
Example #27
File: ner.py Project: yokeyong/atap
    def get_entities(self, document):
        """
        Extract entities from a single document using the
        nltk.chunk.ne_chunk method

        This method is called multiple times by the transform method

        :param document: a list of lists of tuples
        :return entities: a list of comma-separated strings
        """
        entities = []
        for paragraph in document:
            for sentence in paragraph:
                # the classifier chunks the sentence and adds category labels, e.g. PERSON
                trees = ne_chunk(sentence)
                # select only trees with the kinds of entities we want
                for tree in trees:
                    if hasattr(tree, 'label'):
                        if tree.label() in self.labels:
                            # entities is a list, each entry is a list of entities
                            # for a document
                            entities.append(
                                ' '.join([child[0].lower() for child in tree])
                                )
        return entities
Example #28
def question_processing(ques):
    global corpus, name, list_query
    list_query = []
    # corpus=[]
    speak(random.choice(choices) + ' ' + name, False)
    # Step1: Generate all tokens
    tokens = nltk.word_tokenize(ques)
    # Step2: Part of Speech tagging of the question
    pos_tags = nltk.pos_tag(tokens)
    # Step3: Named Entity Recoginition of the POS Tags
    pos_tree = nltk.ne_chunk(pos_tags)

    # filter all query words
    for i in pos_tags:
        if i[1] in ('NNP', 'NN', 'JJ', 'JJS', 'NNS', 'VBZ', 'RBS'):
            list_query.append(i[0])
    # list_query)

    collection_name = []

    # Get the Matching List of Collection(DBs) where the answer could be.
    for i in list_query:
        if dict_collections.get(i.lower()):
            collection_name.append(dict_collections[i.lower()])

    # print(collection_name)

    # Aggerate all the Documents from the list of Collections
    db.cursor = db.questions.find()
    corpus = []
    for i in db.cursor:
        for t in collection_name:
            if t in i:
                corpus.append(i[t])
Example #29
def processLanguage():
	try:
		opener = urllib2.build_opener()
		#opener.addheaders[('User-agent','Mozilla/5.0')]

		url = "http://disqus.com/embed/comments/?disqus_version=82d70f54&base=default&f=cnn&t_i=%2F2013%2F12%2F01%2Fpolitics%2Fobamacare-website%2Findex.html&t_u=http%3A%2F%2Fwww.cnn.com%2F2013%2F12%2F01%2Fpolitics%2Fobamacare-website%2Findex.html&t_e=Administration%3A%20Obamacare%20website%20working%20smoothly&t_d=Administration%3A%20Obamacare%20website%20working%20smoothly&t_t=Administration%3A%20Obamacare%20website%20working%20smoothly&t_c=207582&s_o=default#2"

		urlContent = opener.open(url).read()
		soup = BeautifulSoup(urlContent)
		title = soup.title.text


		body = soup.findAll('p')	

		for item in body:
			#print item
			sentence = item.text.encode('ascii','ignore')
			#print sentence
			tokenized = nltk.word_tokenize(sentence)
			tagged = nltk.pos_tag(tokenized)
			namedEntity = nltk.ne_chunk(tagged,binary=True)
			compiler = re.compile("[(]['][a-zA-Z]+[']")
			for chunks in namedEntity:
				#print chunks[0]
				if compiler.match(str(chunks[0])):
					chunk = str(chunks[0])
					front = chunk[2:]
					word = re.search("[a-zA-Z]+[']",front)
					print word.group(0)[:-1]
					#print "Matched"
			#namedEntity.draw()
				
	except Exception, e:
		print str(e)
def named_entities(text, types=None):
    """This functions returns named entities from a text.
    Adapted from emh's code (http://stackoverflow.com/users/2673189/emh)

    Parameters
    ----------
    text: str
        UTF-8 string
    types: list of strings
        Currently the list can include only "PERSON" and "ORGANIZATION"

    Returns
    -------
    dict
        Dictionary with one entry for each type of entity. For each of these 
        entries, contains a list of strings with found entities
    """
    if not types:
        types = ["PERSON", "ORGANIZATION"]
    named_entities = {"PERSON": [], "ORGANIZATION": []}
    tokens = nltk.tokenize.word_tokenize(text)
    pos = nltk.pos_tag(tokens)
    sentt = nltk.ne_chunk(pos, binary=False)
    for type_ in types:
        for subtree in sentt.subtrees(filter=lambda t: t.label() == type_):
            entity = ""
            for leaf in subtree.leaves():
                entity = entity + " " + leaf[0]
            named_entities[type_].append(entity.strip())
    return named_entities
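A brief usage example for the function above (the exact output depends on the installed NLTK models):

text = "Christine Lagarde spoke to the Wall Street Journal in France."
print(named_entities(text))
# prints a dict such as {'PERSON': [...], 'ORGANIZATION': [...]}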
def extract_entity(s):
    return ne_chunk(pos_tag(word_tokenize(s)))
Example #32
def preprocessing(tokenize_text):
    pos_tag_text = pos_tag(tokenize_text)
    chunk_text = ne_chunk(pos_tag_text, binary=True)

    return chunk_text
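To flatten the tree returned above into IOB tags, nltk.chunk.tree2conlltags can be applied to the result; a small usage sketch (the example sentence is borrowed from a later snippet on this page):

from nltk import word_tokenize
from nltk.chunk import tree2conlltags

tree = preprocessing(word_tokenize("John is studying at Stanford University in California"))
print(tree2conlltags(tree))
# e.g. [('John', 'NNP', 'B-NE'), ('is', 'VBZ', 'O'), ...] with binary=True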
Example #33
print('\nperforming POS:')
text = nltk.word_tokenize(file_content)
print(nltk.pos_tag(text))  #performing POS tag
#POS ends

#lemmatization starts
print('\nperforming lemmatization:')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

for w in wtokens:
    print(lemmatizer.lemmatize(w))  #performing lemmatizer
#lemmatization ends

#trigram starts
print('\nperforming Trigram:')
from nltk import ngrams

n = 3  #n defines the number of ngrams
trigrams = ngrams(file_content.split(), n)  #splitting with respect to n
for grams in trigrams:
    print(grams)
#trigram ends

#Named Entity Recognizer starts
print('\nPerforming NER:')

from nltk import word_tokenize, pos_tag, ne_chunk

print(ne_chunk(pos_tag(word_tokenize(file_content))))
#NER ends
def nltk_tagger(token_text):
	tagged_words = nltk.pos_tag(token_text)
	ne_tagged = nltk.ne_chunk(tagged_words)
	return ne_tagged
def write(filename, predictor):
    sentence = read_sentence(filename)
    for s in sentence:
        sentence_list, label_list = process_sentence(s)
        sen = mergeWords(sentence_list)
        # print(sen)

        ##### assign POS #####
        pos_list = []
        # truple = tree2conlltags(ne_chunk(pos_tag(word_tokenize(sen))))
        truple = tree2conlltags(ne_chunk(pos_tag(sentence_list)))
        # the truple contains word, pos, ner-label
        for item in truple:
            pos_list.append(item[1])

        ################get words lemma and stem######################
        wordnet_lemmatizer = WordNetLemmatizer()
        lemma_list = []
        for word in sentence_list:
            lemma_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))

        stem_list = []
        lancaster = LancasterStemmer()
        for word in sentence_list:
            stem_list.append(lancaster.stem(word))
        # print(stem_list)

        ##### assign constituency parent POS #####
        pos_parent_list, right_sublings_list, chunk_position, left_sublings_list = parse_consituency_tree(
            sentence_list, predictor)
        # print("=========pos===")
        # print(len(sentence_list))
        # print(len(chunk_position))
        # append a blank row
        sentence_list.append(" ")
        label_list.append(" ")
        pos_list.append(" ")
        pos_parent_list.append(" ")
        right_sublings_list.append(" ")
        chunk_position.append(" ")
        lemma_list.append(" ")
        stem_list.append(" ")
        left_sublings_list.append(" ")

        data = {}
        data["word"] = sentence_list
        data["label"] = label_list
        data["pos"] = pos_list
        data["chunk"] = pos_list
        data["pos_parent"] = pos_parent_list
        data["right_sublings_list"] = right_sublings_list
        data["chunk_position"] = chunk_position
        data["lemma_list"] = lemma_list
        data["stem_list"] = stem_list
        data["left_sublings_list"] = left_sublings_list
        df = pd.DataFrame(data)

        # to_filename = "word.csv"
        # df.to_csv(to_filename)
        to_file = filename.split(".tsv")[0]
        to_file1 = to_file + "_feature_v1" + ".tsv"
        df.to_csv(to_file1,
                  sep='\t',
                  index=False,
                  header=False,
                  encoding="utf8",
                  mode='a')
Example #36
    ))
lemma = WordNetLemmatizer()
stem = PorterStemmer()
stem_wrds = []
lemma_wrds = []
for token in tokens:
    stem_wrds.extend([stem.stem(token)])
    lemma_wrds.extend([lemma.lemmatize(token)])

print(pos_tag(tokens))
print(stem_wrds)
print(lemma_wrds)

sent = "John Works in FVDS and stays in Chennai"
tokens = word_tokenize(sent)
chunked = ne_chunk(pos_tag(tokens))
for elt in chunked:
    if isinstance(elt, Tree):
        print(elt)

###############################################################################
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
import pandas as pd
from sklearn.metrics import confusion_matrix

inp = pd.read_excel(r"\Movie review.xlsx", encoding='utf-8')
X = inp.SNTC_TXT
y = inp.REVIEW
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
Example #37
 def extract_entities(self, doc):
     sentence_list = []
     for sent in sent_tokenize(doc):
         sentence_list.append(
             [chunk for chunk in ne_chunk(pos_tag(word_tokenize(sent)))])
     return sentence_list
Example #38
import nltk
from nltk import ne_chunk
NE_sent="The Indian Politicians shouts in the Parliament House"


# In[32]:


NE_tokens=word_tokenize(NE_sent)
NE_tags=nltk.pos_tag(NE_tokens)


# In[33]:


NE_NER=ne_chunk(NE_tags)
print(NE_NER)


# # Chunking
# picking up individual pieces of information and grouping them into bigger pieces

# In[34]:


new = "The cat sat on a mat and ate the rat"
new_Tokens = nltk.pos_tag(word_tokenize(new))
new_Tokens
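The cell above only POS-tags the sentence; an actual chunking step that groups the tagged tokens into noun phrases could look like this (the grammar pattern is the common DT/JJ/NN example used elsewhere on this page):

import nltk

grammar = "NP: {<DT>?<JJ>*<NN>}"   # optional determiner, any adjectives, then a noun
chunk_parser = nltk.RegexpParser(grammar)
print(chunk_parser.parse(new_Tokens))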


# In[35]:
Example #39
 def parts_of_speech_flow(self, doc):
     sentences = sent_tokenize(doc)
     tokenized = [word_tokenize(sentence) for sentence in sentences]
     pos_tags = [pos_tag(sentence) for sentence in tokenized]
     # ne_chunk expects a single tagged sentence, so chunk each one separately
     return [ne_chunk(tagged, binary=True) for tagged in pos_tags]
Example #40
from nltk import pos_tag

sentence = word_tokenize("I always lie down to tell a lie.")
tags = pos_tag(sentence)
# print(tags)

import nltk

my_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")
parser = nltk.ChartParser(my_grammar)

sentence = word_tokenize("I shot an elephant in my pajamas")
for tree in parser.parse(sentence):
    print(tree)
    # tree.draw()

from nltk import pos_tag, ne_chunk

chunk_list = ne_chunk(
    pos_tag(word_tokenize("Antonio joined Udacity Inc. in California.")))
print(chunk_list)
Example #41
def extract_named_entities(text):
    entity_names = []
    entities = ne_chunk(pos_tag(word_tokenize(text)), binary=True)
    for tree in entities:
        entity_names.extend(extract_entity_names(tree))
    return entity_names
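extract_entity_names is not shown in this snippet; a minimal recursive version consistent with how it is called above (with binary=True every entity subtree is labelled 'NE'):

import nltk

def extract_entity_names(tree):
    # recursively collect the text of every 'NE' subtree
    names = []
    if isinstance(tree, nltk.Tree):
        if tree.label() == 'NE':
            names.append(' '.join(token for token, tag in tree.leaves()))
        else:
            for child in tree:
                names.extend(extract_entity_names(child))
    return names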
Example #42
#read/create the text data
sent = "John is studying at Stanford University in California"
#Extract the entities
#Using NLTK
#import libraries
import nltk
from nltk import ne_chunk
from nltk import word_tokenize
#NER
ne_chunk(nltk.pos_tag(word_tokenize(sent)), binary=False)

#using spacy
import spacy
nlp = spacy.load('en')
# Read/create a sentence
doc = nlp(u'Apple is ready to launch new phone worth $10000 in New york time square ')
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
sentences = re.findall(r'(.*?)[\.|\?|!+]',exampleReview)
for sent in sentences:
    print ('******************************************************')
    print ('******************************************************')
    print('The Sentence : ',sent)
    print ('-----------------------------------------------')
    processor(sent)    

'''


sentences = re.findall(r'(.*?)[\.|\?|!+]',exampleReview)
for sent in sentences:
    tokenized = nltk.word_tokenize(sent)
    tagged = nltk.pos_tag(tokenized)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    entities = re.findall(r'NE\s(.*?)/',str(namedEnt))
        
    descriptives_noun = re.findall(r'\(\'(\w*)\',\s\'NN\w?\'',str(tagged))
    descriptives_verbs = re.findall(r'\(\'(\w*)\',\s\'VB\w?\'',str(tagged))
    descriptives_adj = re.findall(r'\(\'(\w*)\',\s\'JJ\w?\'',str(tagged))
    descriptives_adverb = re.findall(r'\(\'(\w*)\',\s\'RB\w?\'',str(tagged))
    print ('---------------------------------------------------------------------------------------------------------------------')
    print ('*** The Sentence : ***')
    print (sent)
    print ('*** POS-tagged Sentence : ***')
    print (str(tagged))
    print ('*** Named Entity : ***')
    for entity in entities:
        print(entity)
    print ('*** Nouns : ***')    
Example #44
 def namedEntities(self, ex):
     ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(self.input_text)))
     return ne_tree
Example #45
import nltk
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import state_union

train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

cst = PunktSentenceTokenizer(train_text)
tknd = cst.tokenize(sample_text)

try:
    for i in tknd[5:]:
        word = nltk.word_tokenize(i)
        # print(word)
        tgd = nltk.pos_tag(word)
        nER = nltk.ne_chunk(tgd)
        print(nER)

except Exception as e:
    print(str(e))

Example #46
sentences = nltk.sent_tokenize(article)
 
len(sentences)

# Tokenize each sentence into words: token_sentences
token_sentences = [nltk.word_tokenize(sent) for sent in sentences]

# this is actually broken up into lists of lists....
len(token_sentences)
token_sentences[0]


# Tag each tokenized sentence into parts of speech: pos_sentences
pos_sentences = [nltk.pos_tag(sent) for sent in token_sentences] 

print nltk.ne_chunk(pos_sentences[2], binary = True)

len(pos_sentences)

# Create the named entity chunks: chunked_sentences
chunked_sentences = nltk.ne_chunk_sents(pos_sentences, binary=True)

# Test for stems of the tree with 'NE' tags
for sent in chunked_sentences:
    for chunk in sent:
        if hasattr(chunk, "label") and chunk.label() == "NE":
            print(chunk)
            
test = []
for sent in chunked_sentences:
    for chunk in sent:
Example #47
def answer_processing(s_tuple, q_type, q_keywords, dependency):
	#print "DOING ANSWER_PROCESSING"
	sentences = s_tuple
	print len(sentences)
	# http://nbviewer.jupyter.org/github/gmonce/nltk_parsing/blob/master/1.%20NLTK%20Syntax%20Trees.ipynb
	# in string
	answers = []

	grammar_passed_answers = []
	grammar_failed_answers = []


	# NEED TO ACCOUNT FOR CASES IN WHICH THERE ARE LESS THAN 5 ANSWERS
	num_answers_needed = 5 - len(sentences)
	if(num_answers_needed > 0):
		for i in range(0,num_answers_needed):
			sentences.append(('100','nil'))
	for i in range(0, min(10, len(sentences))):
		doc_num = sentences[i][0]
		sentence = sentences[i][1]
		if q_type == WHEN_TYPE:
			sentence_after_tagging = timex.tag(sentence)
			when_answers = re.findall('<TIMEX2>(.*?)</TIMEX2>', sentence_after_tagging)
			# in case answer comes out as empty, output an empty string
			when_answer = when_answers[0] if len(when_answers) != 0 else 'nil'
			answers.append((doc_num, when_answer))

		else:		
			words = nltk.word_tokenize(sentence)
			pos_tag = nltk.pos_tag(words)
			ner_tree = nltk.ne_chunk(pos_tag)
			#print ner_tree
			# the list of tuples((word, pos),ner) to be considered for this sentence
			matching_tuples = []
			# print q_keywords
			global subtree
			tmp = []
			for subtree in ner_tree.subtrees():
				#if subtree.label() in NER_TAG[q_type] and subtree.pos()[0][0][1]=='NNP':	
				if subtree.label() in NER_TAG[q_type]:					

					word = ' '.join(map(lambda x : x[0][0], subtree.pos()))
					#print word
					#print q_keywords
					iskwin = map(lambda x : x in word, q_keywords)
					if not any(iskwin):						
						# print "SUBTREE!", subtree
						# matching_tuples = subtree.pos()
						answer = ' '.join(map(lambda x : x[0][0], subtree.pos()))
						if answer not in map(lambda x : x[1],answers):
							tmp.append(answer)
			'''
			if(len(tmp) > 0 and dependency != '' and q_type == WHO_TYPE):
				try:
					p, f = grammar_stuff(tmp,sentence, dependency, doc_num)
					grammar_passed_answers += p
					grammar_failed_answers += f
				except:
					for answer in tmp:
						grammar_failed_answers.append((doc_num, answer))
			else:
				for answer in tmp:
					grammar_failed_answers.append((doc_num,answer))
			'''
			for answer in tmp:
				if answer not in map(lambda x : x[1],grammar_failed_answers):

					grammar_failed_answers.append((doc_num,answer))
			
					
			#print "SENTENCE : ", sentence, "ANSWER : ", tmp
			# t : ((word, pos), ner)
			# answer = ''
			# for t in matching_tuples:
			# 	#print t
			# 	if t[0][0] not in q_keywords:
			# 		answer += t[0][0] + ' '
			# # remove any possible trailing whitespaces
			# answer = answer.rstrip()
			# answers.append((doc_num,answer))

	answers += grammar_passed_answers + grammar_failed_answers
	print 'ANSWERS!!!!!'
	print answers

	return answers
Example #48
        date_clean.append(date.today() + datetime.timedelta(days=d_plus * 30))

    else:
        date_clean.append(parser.parse(d[0]))

print(date_clean)

# ## NLTK NER
# Chunking?

# In[99]:

#nltk.download('maxent_ne_chunker')
#nltk.download('words')
from nltk import ne_chunk, pos_tag
chunked = ne_chunk(loc_tag)

print(chunked)

# ## Text Categorizer

# ## Custom Components

# ## Logic Engine to parse NE

# In[100]:

# If no second loaction, ask for start location

# If no second date, assume one way
Example #49
taggedToken = pos_tag(token1)
print(taggedToken[:20])

# # English named entity recognition

# In[33]:

nltk.download('words')
nltk.download('maxent_ne_chunker')

# In[34]:

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# In[37]:

#tokenization
token1 = word_tokenize("Barack Obana likes fried chicken very much")
print('token1', token1)

taggedToken = pos_tag(token1)
print('pos_tag', taggedToken)

from nltk import ne_chunk
neToken = ne_chunk(taggedToken)
print(neToken)

# In[ ]:
Example #50
#!/usr/bin/python
# -*- coding: utf-8 -*-

from nltk import word_tokenize, pos_tag, ne_chunk

sentence = "Mark and John are working at Google."

print(ne_chunk(pos_tag(word_tokenize(sentence))))
for token in tex:
    print(nltk.pos_tag([token]))

############################################################################################################
#### Named entity recognition
############################################################################################################
# This is the process of detecting named entities such as person names, location names,
# company names, quantities and monetary values.
text = "Google’s CEO Sundar Pichai introduced the new Pixel at Minnesota Roi Centre Event"
#importing chunk library from nltk
from nltk import ne_chunk
nltk.download('maxent_ne_chunker')
nltk.download('words')
# tokenize and POS Tagging before doing chunk
token = word_tokenize(text)
tags = nltk.pos_tag(token)
chunk = ne_chunk(tags)
print(chunk)

############################################################################################################
#### Chunking
############################################################################################################
# El "chunking" significa recoger trozos individuales de información y agruparlos en trozos más grandes.
# En el contexto de la NLP y la minería de textos, "chunking" significa una agrupación de palabras o tokens en trozos.
text = "We saw the yellow dog"
token = word_tokenize(text)
tags = nltk.pos_tag(token)
reg = "NP: {<DT>?<JJ>*<NN>}"
a = nltk.RegexpParser(reg)
result = a.parse(tags)
print(result)
Example #52
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')


nltk.pos_tag("Machine Learning is great".split())

from nltk.stem.porter import PorterStemmer
porter_stemmer=PorterStemmer()

print(porter_stemmer.stem('wolves'))

from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
print(lemmatizer.lemmatize('wolves'))

s="Albert Einstein was born on March 14,1879"
tags=nltk.pos_tag(s.split())
print(tags)

nltk.ne_chunk(tags).draw()
print(nltk.ne_chunk(tags))
Example #53
text1 = 'While in France, Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'
text2="Please advise on the options the deceased clients wife has in relation to this pension" \
     "   She wishes to exercise ARF option if available "
text="Hi I was trying to register online but I was n t recognised " \
     "  My  France number is 4824461      " \
     "Looking to register on Pension Planet Robert Manning" \
     "   but Irish Ronnie Gardner website ca n t find my details        " \
     "Richard Wade "
text = 'How can I pay my car renewal'
tokenized_text = word_tokenize(text)
ner_st = st.tag(tokenized_text)
print(ner_st)

pos_st = post.tag(tokenized_text)
print(pos_st)
exit()
pos_nltk = nltk.pos_tag(tokenized_text)
print(pos_nltk)

blob = TextBlob(text)
print(blob.tags)
print("tree stanford\n")
print("type of chunk", type(ne_chunk(pos_st)))

print("type of tree", len(tree2conlltags(ne_chunk(pos_st))))
print("tree nltk\n")
print(tree2conlltags(ne_chunk(pos_nltk)))
print("tree blob\n")
print(ne_chunk(pos_nltk))
print(tree2conlltags(ne_chunk(blob.tags)))
exit()
Example #54
def entities(text):
    return ne_chunk(pos_tag(word_tokenize(text)))
                if (type(items) == nltk.tree.Tree):
                    # word = str(items[0]) + ' - ' + str(items.label())
                    word = str(items[0])
                elif (type(items) == unicode):
                    word = str(items)
                    if (word.find('.') != -1):
                        end_of_sentence = True

                sentence = sentence + word + " "
                if (end_of_sentence):
                    sentence = sentence[:len(sentence) - 1]
                    sentence_list.append(sentence)

                    text = nltk.word_tokenize(sentence)
                    pos_tagged_sentence = nltk.pos_tag(text)
                    ne_chunked_sentence = nltk.ne_chunk(pos_tagged_sentence)

                    for words in ne_chunked_sentence:
                        word = None
                        pos_tag = None
                        ner = None

                        it = it + 1
                        print(it)
                        if (type(words) == nltk.tree.Tree):
                            word = words[0][0]
                            pos_tag = words[0][1]
                            ner = words.label()
                            # print(words.label(), words[0][0], words[0][1])
                        else:
                            word = words[0]
Example #56
)

for thing in dummyDoc.ents:
    print(thing, end=" ")
    print(thing.label_, end=" ")
    print(thing.label, end=" ")
    print("\n")

dummyComment = "The latest example deals with the cost of the United States embassy in Jerusalem. The President publicly announced the cost in March: “We’re going to have it built very quickly and inexpensively,” he said. “They put an order in front of my desk last week for $1 billion . . . We’re actually doing it for about $250,000, so check that out.” The actual cost will be almost 100 times higher, as CNN reports. A contract summary file for the embassy from the Office of Acquisitions of the Department of State (available on usaspending.gov) puts the figure at $21.2 million."
print("DUMMY COMMENT: ", dummyComment)
dummyComment = ' '.join(
    [word for word in dummyComment.split() if word not in stop])
sentences = nltk.sent_tokenize(dummyComment)

#tokenizes sentences into words --> sentences will become a 2D list [ [blah, blah, blah], [blah, blah, blah] ]
sentences = [nltk.word_tokenize(sent) for sent in sentences]

#tags each word in a sentence with a "part of speech" label --> sentences will become a 2D list with tuples
# --> [ [ (blah, yuh), (blah, yuh), (blah, yuh) ], [ (blah, yuh), (blah, yuh), (blah, yuh) ] ]
sentences = [nltk.pos_tag(sent) for sent in sentences]
print(sentences, "\n")

#stuff here
for tagged_sentence in sentences:

    for chunk in nltk.ne_chunk(tagged_sentence):

        #         print (chunk)
        if type(chunk) == nltk.tree.Tree:
            print("CHUNK: ", type(chunk))
Example #57
    if isinstance(document, str):
        document = document
    else:
        raise ValueError('Document is not string!')
    document = document.strip()
    sentences = nltk.sent_tokenize(document)
    sentences = [sentence.strip() for sentence in sentences]
    return sentences


# tokenize sentences
sentences = parse_document(text)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
# tag sentences and use nltk's Named Entity Chunker
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
ne_chunked_sents = [nltk.ne_chunk(tagged) for tagged in tagged_sentences]
# extract all named entities
named_entities = []
for ne_tagged_sentence in ne_chunked_sents:
    for tagged_tree in ne_tagged_sentence:
        # extract only chunks having NE labels
        if hasattr(tagged_tree, 'label'):
            entity_name = ' '.join(c[0]
                                   for c in tagged_tree.leaves())  #get NE name
            entity_type = tagged_tree.label()  # get NE category
            named_entities.append((entity_name, entity_type))
            # get unique named entities
            named_entities = list(set(named_entities))

# store named entities in a data frame
entity_frame = pd.DataFrame(named_entities,
corpus.question.dtype
corpus=pd.DataFrame(corpus)
def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

## Output: a list of tuples containing the individual words in the
## sentence and their associated part-of-speech tags
sent = preprocess(str(corpus["question"]))
print(sent) 

##Now we implement noun phrase chunking to identify named entities using 
##a regular expression consisting of rules that indicate how sentences should be chunked
pattern = 'NP: {<DT>?<JJ>*<NN>}'

cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

##With the function nltk.ne_chunk(), 
##we can recognize named entities using a classifier  
ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(str(corpus))))
print(ne_tree)

Example #59
pattern = 'NP: {<DT>?<JJ>*<NN>}'
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
#print(cs)

# In[59]:

from nltk.chunk import conlltags2tree, tree2conlltags
nltk.download('maxent_ne_chunker')
nltk.download('words')
from pprint import pprint

iob_tagged = tree2conlltags(cs)
#print(iob_tagged)

ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(text)))
#print(ne_tree)

# In[60]:

#Using spacy to for entity recognition.

import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

doc = nlp(text)
#print([(X.text, X.label_) for X in doc.ents])
#print([(X, X.ent_iob_, X.ent_type_) for X in doc])
Example #60
filepaths.sort()

corpus = [open(f, 'r').read() for f in filepaths]
corpus = np.array(corpus)

dump = ''
entities = []
organizations = []
for j in range(corpus.shape[0]):
    dump += corpus[j]

tokenized = nltk.word_tokenize(dump)
tagged = nltk.pos_tag(tokenized)

## Generating a list of all entities
namedEnt = nltk.ne_chunk(tagged)
for i in namedEnt:
    if type(i) == Tree:
        for subtree in i.subtrees():
            name = ''
            for leaf in subtree.leaves():
                leaf_parts = list(leaf[0])
                for part in leaf_parts:
                    name += part
                name += ' '

            if subtree.label() == 'PERSON' and len(subtree) > 1:

                if name not in entities:
                    entities.append(name)