def get_ner_tags(self):
    sys.path.append('../preprocess')
    from nltk.tag.stanford import StanfordNERTagger
    st = StanfordNERTagger(
        '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
        '../stanford-ner/stanford-ner.jar')
    tokenized_list = [ct.split() for ct in self.cleaned_data]
    NERTags = st.tag_sents(tokenized_list)
    n = []
    for nt in NERTags:
        n.extend(nt)
    # get the indexes of all words that have NER tags
    ids = [i for a, i in zip(n, range(len(n))) if a[1] != "O"]
    a = np.array(ids)
    consecutive_ids = np.split(a, np.where(np.diff(a) != 1)[0] + 1)
    phrases = []
    for ci in consecutive_ids:
        phrase = ""
        tag = ""
        for id_ in ci:
            phrase += "{} ".format(n[id_][0])
            tag += "{}".format(n[id_][1])
        phrases.append(phrase)
    cleaned_phrases = self.del_repeat(phrases)
    return cleaned_phrases
def get_ner_tags(self):
    sys.path.append('../preprocess')
    from nltk.tag.stanford import StanfordNERTagger
    st = StanfordNERTagger(
        '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
        '../stanford-ner/stanford-ner.jar')
    tokenized_list = [ct.split() for ct in self.cleaned_data]
    NERTags = st.tag_sents(tokenized_list)
    tags = [nt for nt in NERTags]
    # per sentence, the indexes of all words that carry an NER tag
    ids = [[i for a, i in zip(t, range(len(t))) if a[1] != "O"] for t in tags]
    phrases = []
    for i, t in zip(ids, tags):
        phrase = ""
        tt = "N/A"
        for p, index in zip(i, range(len(i))):
            if index == len(i) - 1:
                phrase += "{}".format(t[p][0])
                tt = phrase, t[p][1]
            else:
                phrase += "{} ".format(t[p][0])
        phrases.append(tt)
    return phrases
def NERWithOldStanford(input_sample):
    # point NLTK at the local Java installation (raw string avoids escape issues)
    java_path = r"C:\Program Files (x86)\Common Files\Oracle\Java\javapath\java.exe"
    # java_path = "C:/Program Files/Java/jdk1.8.0_161/bin/java.exe"
    os.environ['JAVAHOME'] = java_path
    tagger = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz',
                               'stanford-ner.jar',
                               encoding='utf-8')
    tokenized_text = word_tokenize(input_sample)
    classified_paragraphs_list = tagger.tag_sents([tokenized_text])
    formatted_result = formatted_entities(classified_paragraphs_list)
    return formatted_result
def extract_events2(self, tweet_sentences):
    path_to_jar = 'lib/stanford_parser/stanford-parser.jar'
    path_to_models_jar = 'lib/stanford_parser/stanford-english-corenlp-2018-02-27-models.jar'
    path_to_ner_tagger = 'lib/stanford_ner/stanford-ner.jar'
    path_to_ner_model = 'lib/stanford_ner/english.all.3class.distsim.crf.ser.gz'
    sentence_preprocessor = Preprocessor(['remove_non_letters'])
    ner_tagger = StanfordNERTagger(path_to_ner_model, path_to_ner_tagger)
    dependency_parser = StanfordDependencyParser(
        path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)
    events = []
    # process the (created_at, sentence) pairs in chunks to limit memory use
    chunks = list(
        self.utilities.chunkify_list(data_list=tweet_sentences,
                                     items_per_chunk=1000))
    for chunk in chunks:
        created_ats = []
        sentences = []
        for chunk_item in chunk:
            created_ats.append(chunk_item[0])
            sentences.append(sentence_preprocessor.preprocess(chunk_item[1]))
        chunk_sent_dependencies = dependency_parser.raw_parse_sents(sentences)
        chunk_sent_ner_tags = ner_tagger.tag_sents(
            [sentence.split() for sentence in sentences])
        for sent_dependencies, sent_ner_tags, created_at in zip(
                chunk_sent_dependencies, chunk_sent_ner_tags, created_ats):
            dependencies = [
                list(parse.triples()) for parse in sent_dependencies
            ]
            if len(dependencies) > 0 and dependencies[0] is not None:
                sentence_events = self.extract_events_from_stanford_dependencies(
                    dependencies[0], sent_ner_tags)
                if len(sentence_events) > 0:
                    for sentence_event in sentence_events:
                        events.append((created_at, sentence_event))
    return events
            bestSentenceText[i] = bestSentenceText[i].replace(" ,", ",")
            bestSentenceTokensNoStopWords.append(bestSentenceText[i])
        allBestSentences.append(bestSentenceTokensNoStopWords)
        #####################
        best = result[0][0]
        bestSentence[articleNo, questionNo] = best
        if qa['answer'] in bestSentenceText:
            correctSentence += 1
    else:
        allBestSentences.append([])  # to preserve question sequence

    print("The accuracy on dev set is",
          (correctSentence / float(totalQuestions)))
    NER_tagged = st.tag_sents(allBestSentences)
    print('NER Tagging Done')
    f = open('bestSentencesTagged.bin', 'wb')  # 'wb' instead of 'w' for binary file
    pickle.dump(NER_tagged, f, -1)  # -1 specifies highest binary protocol
    f.close()
    print("NER Saved")
else:  # NER tagged found
    f = open('bestSentencesTagged.bin', 'rb')  # 'rb' for reading binary file
    NER_tagged = pickle.load(f)
print(idx, sep=' ')
# replace URLs, e-mail addresses and numbers by hyper-tags
text = re.sub(r'http[^\s]+', 'URL', u)
text = re.sub(r'www\.[^\s]+', 'URL', text)
text = re.sub(r'[^\s]+@[^\s]+\.[^\s]+', 'EMAIL', text)
text = re.sub(r'\d+', 'NUM', text)
# strip punctuation
while re.findall(r'[\.,\?\(\)\[\]:/\!_\"]', text) != []:
    # text = re.sub(r'([\s])[\.,\?\(\)\[\]\:]', '\1', text)
    # text = re.sub(r'[\.,\?\(\)\[\]\:]([\s])', '\1', text)
    text = re.sub(r'[\.,\?\(\)\[\]:/\!_\"]', ' ', text)
tokenized_utterance = []
tokenized_utterance.append(word_tokenize(text))
classified_paragraphs_list = tagger.tag_sents(list(tokenized_utterance))
# replace NER tokens by their NER type, stem everything else
processed = [
    stemmer.stem(x[0]) if x[1] == 'O' else x[1]
    for x in classified_paragraphs_list[0]
]
ner_processed = [x.lower() if x.upper() != x else x for x in processed]
# remove redundant (consecutive duplicate) NER tags
for i in range(len(ner_processed)):
    try:
        if ner_processed[i - 1] == ner_processed[i] and i > 0:
            ner_processed.pop(i)
    except IndexError:
        pass
class BasicDataProcessor:
    def __init__(self, config, data):
        self.config = config
        self.data = data
        self.lemmatizer = WordNetLemmatizer()
        self.tagger = StanfordNERTagger(
            model_filename=self.config.ner_model_path)
        self.postagger = StanfordPOSTagger(
            path_to_jar=self.config.pos_jar_path,
            model_filename=self.config.pos_model_path)
        self.dependency_parser = StanfordDependencyParser(
            path_to_jar=self.config.parser_jar_path,
            path_to_models_jar=self.config.parser_model_path)
        self.nlp = StanfordCoreNLP("stanford/stanford-corenlp-full")
        # self.nlp = StanfordCoreNLP("http://localhost", port=9000)
        self.punc = r"""!"#&'()*+;<=>?[]^`{}~"""

    def preprocess_questions(self, questions):
        return [self.preprocess_question(q) for q in questions]

    def process_docs(self, docs):
        return [self.preprocess_doc(doc) for doc in docs]

    def preprocess_question(self, question):
        normal_tokens = word_tokenize(
            question.replace("\u200b", '').replace("\u2014", ''))
        remove_punc_tokens = [
            token for token in normal_tokens if not self.is_pure_puncs(token)
        ]
        remove_punc_in_tokens = [
            self.remove_punc_in_token(token) for token in remove_punc_tokens
        ]
        lower_tokens = self.lower_tokens(remove_punc_in_tokens)
        remove_stop_tokens = self.remove_stop_words(lower_tokens)
        # map question words onto the entity type they ask for
        for i in range(len(remove_stop_tokens)):
            if remove_stop_tokens[i] == 'where':
                remove_stop_tokens[i] = 'location'
            if remove_stop_tokens[i] == 'when':
                remove_stop_tokens[i] = 'time'
            if remove_stop_tokens[i] == 'who' or remove_stop_tokens[i] == 'whom':
                remove_stop_tokens[i] = 'person'
            if remove_stop_tokens[i] == 'why':
                remove_stop_tokens[i] = 'reason'
        lemmatized_tokens = self.lemmatize_tokens(remove_stop_tokens)
        return lemmatized_tokens

    def is_pure_puncs(self, token):
        if all([c in punctuation for c in token]):
            return True
        return False

    # remove punctuations within a token
    def remove_punc_in_token(self, token):
        return ''.join([x for x in token if x not in punctuation]).strip()

    # remove punctuations within a token if the punctuation is not in the punc set
    def remove_punc_in_token_for_rule(self, token):
        return ''.join([x for x in token if x not in self.punc]).strip()

    def remove_stop_words(self, words):
        return [
            word for word in words
            if word.lower() not in stopwords.words("english")
        ]

    def lemmatize_tokens(self, words):
        return [self.lemmatize(word.lower()) for word in words]

    def lemmatize(self, word):
        word = word.lower()
        lemma = self.lemmatizer.lemmatize(word, 'v')
        if lemma == word:
            lemma = self.lemmatizer.lemmatize(word, 'n')
        return lemma

    def preprocess_doc(self, doc):
        normal_tokens = [
            word_tokenize(par.replace(u"\u200b", '').replace(u"\u2014", ''))
            for par in doc
        ]
        remove_punc_tokens = [[
            token for token in tokens if not self.is_pure_puncs(token)
        ] for tokens in normal_tokens]
        remove_punc_in_tokens = [[
            self.remove_punc_in_token(token) for token in tokens
        ] for tokens in remove_punc_tokens]
        lower_tokens = [
            self.lower_tokens(tokens) for tokens in remove_punc_in_tokens
        ]
        remove_stop_tokens = [
            self.remove_stop_words(tokens) for tokens in lower_tokens
        ]
        lemmatized_tokens = [
            self.lemmatize_tokens(tokens) for tokens in remove_stop_tokens
        ]
        return lemmatized_tokens

    def lower_tokens(self, words):
        return [word.lower() for word in words]

    def process_sent(self, sens):
        normal_tokens = word_tokenize(
            sens.replace("\u200b", '').replace("\u2014", ''))
        remove_punc_tokens = [
            token for token in normal_tokens if not self.is_pure_puncs(token)
        ]
        remove_punc_in_tokens = [
            self.remove_punc_in_token(token) for token in remove_punc_tokens
        ]
        ner_tags = self.sens_ner_tagging(remove_punc_in_tokens)
        # replace recognized entities by their coarse type before lemmatizing
        replaced_tokens = [
            'number' if tup[1] == 'NUMBER' else
            'person' if tup[1] == 'PERSON' else
            'location' if tup[1] == 'LOCATION' else tup[0].lower()
            for tup in ner_tags
        ]
        lower_tokens = self.lower_tokens(replaced_tokens)
        remove_stop_tokens = self.remove_stop_words(lower_tokens)
        lemmatized_tokens = self.lemmatize_tokens(remove_stop_tokens)
        return lemmatized_tokens

    def sens_ner_tagging(self, sent):
        ner_sents = self.tagger.tag_sents([sent])
        pos_sent = pos_tag(sent)
        ner_sent = ner_sents[0]
        processed_ner_sent = []
        for j in range(len(ner_sent)):
            span, tag = ner_sent[j]
            _, pos = pos_sent[j]
            if span.isdigit() or pos == 'CD':
                processed_ner_sent.append((span, 'NUMBER'))
            elif tag == 'PERSON':
                processed_ner_sent.append((span, 'PERSON'))
            elif tag == 'LOCATION':
                processed_ner_sent.append((span, 'LOCATION'))
            elif tag == 'ORGANIZATION':
                processed_ner_sent.append((span, 'OTHER'))
            elif j != 0 and tag == 'O' and span[0].isupper():
                processed_ner_sent.append((span, 'OTHER'))
            else:
                processed_ner_sent.append((span, tag))
        return processed_ner_sent

    def lemmatize_entity_name(self, entity_name):
        tokens = entity_name.split()
        tokens = self.lemmatize_tokens(tokens)
        return ' '.join(tokens)
def formatted_entities(classified_paragraphs_list):
    entities = {'organizations': list()}
    for classified_paragraph in classified_paragraphs_list:
        for entry in classified_paragraph:
            entry_value = entry[0]
            entry_type = entry[1]
            if entry_type == 'ORGANIZATION':
                entities['organizations'].append(entry_value)
    return entities


tagger = StanfordNERTagger(
    '/Users/tomer.bendavid/Downloads/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz',
    '/Users/tomer.bendavid/Downloads/stanford-ner-2017-06-09/stanford-ner.jar',
    encoding='utf-8')

paragraphs = ['I just bought these Thomson Reuters shoes']
tokenized_paragraphs = list()
for text in paragraphs:
    tokenized_paragraphs.append(word_tokenize(text))

classified_paragraphs_list = tagger.tag_sents(tokenized_paragraphs)
formatted_result = formatted_entities(classified_paragraphs_list)
print(formatted_result)
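# Expected output, roughly: the 3-class English model typically labels both
# "Thomson" and "Reuters" as ORGANIZATION, and they are collected token by
# token, so the script should print something close to (exact tagging depends
# on the model version):
# {'organizations': ['Thomson', 'Reuters']}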
                bestSentenceTokensNoStopWords)
            #######------------
            allBestSentencesText.append(bestSentenceText)
            best = result[0][0]
            bestSentence[articleNo, questionNo] = best
            if qa['answer'] in bestSentenceText:
                correctSentence += 1
        else:
            allBestSentences.append([])  # to preserve question sequence
            allBestSentencesText.append(" ")  ###########---------
        allQuestionText.append(qa['question'])

print("The accuracy on dev set is", (correctSentence / float(totalQuestions)))
NER_tagged = stanford_NER_tagger.tag_sents(allBestSentences)
print("NER Time:", ctime())
print("NER Tagging Done, Now doing POS tagging")
POS_taggedAnswers = []
# POS_taggedAnswers = stanford_POS_tagger.tag_sents(allBestSentencesText)
print("POS answer tagging Done")
print("POS answer Time:", ctime())
POS_taggedQuestions = []
# POS_taggedQuestions = stanford_POS_tagger.tag_sents(allQuestionText)
print("POS question tagging Done")
print("POS question Time:", ctime())
f = open(fname, 'wb')  # 'wb' instead of 'w' for binary file
pickle.dump(
    {
        'NER_tagged': NER_tagged,
def runstanfordmodel(sents, tagger, model):
    # Prepare NER tagger with English model
    ner_tagger = StanfordNERTagger(model, tagger, encoding='utf8')
    # Run NER tagger on the pre-tokenized sentences
    return ner_tagger.tag_sents(sents)
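# A minimal usage sketch for runstanfordmodel(); the jar/classifier paths and
# the sentences below are illustrative assumptions, not the original project's
# layout. Point them at your own Stanford NER download.
if __name__ == '__main__':
    sample_sents = [['Barack', 'Obama', 'was', 'born', 'in', 'Hawaii'],
                    ['Google', 'is', 'based', 'in', 'Mountain', 'View']]
    tagged = runstanfordmodel(
        sample_sents,
        'stanford-ner/stanford-ner.jar',
        'stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz')
    # tagged is a list of sentences, each a list of (token, label) tuples,
    # e.g. [('Barack', 'PERSON'), ..., ('Hawaii', 'LOCATION')]
    print(tagged)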
import os
import json
import helpers
from nltk.tag.stanford import StanfordNERTagger

if not os.path.exists('scenes'):
    print('Scenes folder doesn\'t exist :( Run prepare.py first', end='\n')

ner_tagger = StanfordNERTagger(r'english.all.3class.distsim.crf.ser.gz')
# ner_tagger = StanfordNERTagger(r'ner-model.ser.gz')

for filename in sorted(os.listdir('scenes'), key=helpers.natural_keys):
    scenefile = open('scenes/' + filename, 'r+')
    scene = json.load(scenefile)
    processed = [[word[0] for word in sentence]
                 for sentence in scene['processed']]
    ner_tags = ner_tagger.tag_sents(processed)
    for sindex, sentence in enumerate(ner_tags):
        for windex, word in enumerate(sentence):
            scene['processed'][sindex][windex].append(
                ner_tags[sindex][windex][1])
    # write back to the scene file for validating
    scenefile.seek(0)
    # scenefile.close()
    # scenefile = open('scenes/' + filename + '-test', 'w')
    json.dump(scene, scenefile, indent=2)
    scenefile.truncate()
    scenefile.close()
#!/bin/env python3.5
from nltk.tag.stanford import StanfordNERTagger
from nltk.internals import find_jars_within_path
from nltk.tokenize import sent_tokenize
import os

tagger = StanfordNERTagger(
    'data/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz',
    'data/stanford-ner-2015-12-09/stanford-ner.jar')
# add every jar under the Stanford NER directory to the tagger's classpath
tagger._stanford_jar = ':'.join(
    find_jars_within_path(os.path.join(os.getcwd(), 'data/stanford-ner-2015-12-09')))

print(tagger.tag_sents(
    [''.join([c for c in x if c not in '",:.?/!@#$%^&*()][{}~']).split()
     for x in sent_tokenize(input('Enter a sentence: '))]))
articleNo = -1
print("Computing NER start at:", ctime())
i = -1
for article in data:
    articleNo += 1
    currentCorrectAnswerSents = []
    currentCorrectAnswerText = []
    print("Reading Article: ", articleNo + 1, '/', len(data))
    for qa in article['qa']:
        i += 1
        currentCorrectAnswerSents.append(
            article['sentences'][qa['answer_sentence']].split())
        currentCorrectAnswerText.append(qa['answer'].split())
        questions.append(qa['question'])
        answers.append(qa['answer'])
    NER_TaggedAnswerSents = NER_TaggedAnswerSents + stanford_NER_tagger.tag_sents(
        currentCorrectAnswerSents)
    NER_TaggedAnswerText = NER_TaggedAnswerText + stanford_NER_tagger.tag_sents(
        currentCorrectAnswerText)
    correctAnswerSents.append(currentCorrectAnswerSents)
print("Computing NER end at:", ctime())

# saving the computed NER tags and sentences
f = open(NER_CacheFIle, 'wb')  # 'wb' instead of 'w' for binary file
pickle.dump(
    {
        'NER_TaggedAnswerSents': NER_TaggedAnswerSents,
        'correctAnswerSents': correctAnswerSents,
        'NER_TaggedAnswerText': NER_TaggedAnswerText,
        'questions': questions,
        'answers': answers
    }, f, -1)  # -1 specifies highest binary protocol
class EventDetector:
    def __init__(self):
        self.path_to_jar = 'lib/stanford_parser/stanford-parser.jar'
        self.path_to_models_jar = 'lib/stanford_parser/stanford-english-corenlp-2018-02-27-models.jar'
        self.path_to_ner_tagger = 'lib/stanford_ner/stanford-ner.jar'
        self.path_to_ner_model = 'lib/stanford_ner/english.all.3class.distsim.crf.ser.gz'
        self.ner_tagger = StanfordNERTagger(self.path_to_ner_model,
                                            self.path_to_ner_tagger)
        self.dependency_parser = StanfordDependencyParser(
            path_to_jar=self.path_to_jar,
            path_to_models_jar=self.path_to_models_jar)
        self.lemmatizer = WordNetLemmatizer()
        self.utilities = Utilities()

    def extract_events_from_stanford_dependencies(self, dependencies, ner_tags):
        entity_categories = ['PERSON', 'LOCATION', 'ORGANIZATION']
        raw_events = {}
        for dependency in dependencies:
            if len(dependency) == 3:
                head = dependency[0]
                relation = dependency[1]
                tail = dependency[2]
                if head[1].startswith('VB'):
                    event_keywords = list(raw_events.keys())
                    event_keyword = self.lemmatizer.lemmatize(head[0].lower(), 'v')
                    if event_keyword not in event_keywords:
                        raw_events[event_keyword] = {}
                    if relation.endswith('subj'):
                        subject_pronoun = [
                            'i', 'you', 'he', 'she', 'we', 'they', 'who'
                        ]
                        subj_value = self.lemmatizer.lemmatize(tail[0].lower())
                        if tail[0].lower() in subject_pronoun:
                            subj_value = 'PERSON'
                        else:
                            for ner_tag in ner_tags:
                                if ner_tag[0] == tail[0] and ner_tag[1] in entity_categories:
                                    subj_value = ner_tag[1]
                        raw_events[event_keyword]['subj'] = subj_value
                    if relation == 'dobj':
                        objective_pronoun = [
                            'me', 'you', 'him', 'her', 'us', 'them'
                        ]
                        dobj_value = self.lemmatizer.lemmatize(tail[0].lower())
                        if tail[0].lower() in objective_pronoun:
                            dobj_value = 'PERSON'
                        else:
                            for ner_tag in ner_tags:
                                if ner_tag[0] == tail[0] and ner_tag[1] in entity_categories:
                                    dobj_value = ner_tag[1]
                        raw_events[event_keyword]['dobj'] = dobj_value
                    if relation == 'compound:prt':
                        raw_events[event_keyword]['prt'] = tail[0]
        event = None
        for verb in list(raw_events.keys()):
            event_info = raw_events[verb]
            if len(verb) < 2 or 'subj' not in list(event_info.keys()) or len(event_info['subj']) < 2 \
                    or 'dobj' not in list(event_info.keys()) or len(event_info['dobj']) < 2:
                continue
            event_info['keyword'] = verb
            event = event_info
            break  # return only one event
        return event

    def extract_soft_events(self, dependency_tree, dependency_relations, ner_tags):
        entity_categories = ['PERSON', 'LOCATION', 'ORGANIZATION']
        accepted_relation_keys = [
            'nsubj', 'nsubjpass', 'amod', 'dobj', 'advmod', 'nmod', 'xcomp',
            'compound:prt', 'compound', 'neg'
        ]
        keyword = self.lemmatizer.lemmatize(dependency_tree.label(), 'v')
        event = {'keyword': keyword}
        for dependency_relation in dependency_relations:
            if len(dependency_relation) == 3:
                head = dependency_relation[0]
                relation = dependency_relation[1]
                tail = dependency_relation[2]
                if head[0] == keyword and relation in accepted_relation_keys:
                    event[relation] = self.lemmatizer.lemmatize(tail[0].lower())
        # print(event)
        return event

    def extract_event_from_sentence(self, sentence):
        event = None
        sentence_preprocessor = Preprocessor(['remove_non_letters'])
        processed_sentence = sentence_preprocessor.preprocess(sentence)
        sent_dependencies = self.dependency_parser.raw_parse(processed_sentence)
        # tag_sents returns one tagged sentence per input, so take the first
        sent_ner_tags = self.ner_tagger.tag_sents([processed_sentence.split()])[0]
        dependencies = [list(parse.triples()) for parse in sent_dependencies]
        if len(dependencies) > 0 and dependencies[0] is not None:
            event = self.extract_events_from_stanford_dependencies(
                dependencies[0], sent_ner_tags)
        else:
            # fall back to the raw sentence as the event keyword
            event = {'keyword': sentence}
        return event

    def extract_event_from_sentences(self, sentences):
        events = []
        sentence_preprocessor = Preprocessor(['remove_non_letters'])
        chunks = list(
            self.utilities.chunkify_list(data_list=sentences,
                                         items_per_chunk=1000))
        for chunk in chunks:
            sentences = []
            for chunk_item in chunk:
                sentences.append(sentence_preprocessor.preprocess(chunk_item))
            chunk_sent_dependencies = self.dependency_parser.raw_parse_sents(
                sentences)
            chunk_sent_ner_tags = self.ner_tagger.tag_sents(
                [sentence.split() for sentence in sentences])
            for sent_dependencies, sent_ner_tags, sentence in zip(
                    chunk_sent_dependencies, chunk_sent_ner_tags, sentences):
                # the parse generator is consumed twice, so duplicate it
                temp_sent_dependencies_1, temp_sent_dependencies_2 = itertools.tee(
                    sent_dependencies, 2)
                dependency_relations = [
                    list(parse.triples()) for parse in temp_sent_dependencies_1
                ]
                dependency_tree = [
                    parse.tree() for parse in temp_sent_dependencies_2
                ][0]
                if len(dependency_relations) > 0 and dependency_relations[0] is not None \
                        and len(dependency_relations[0]) > 0:
                    # print(sentence)
                    event = self.extract_soft_events(dependency_tree,
                                                     dependency_relations[0],
                                                     sent_ner_tags)
                else:
                    event = {'keyword': sentence}
                events.append(event)
        return events
def GetSummary(Filename, SummaryType, Section='All'):
    """
    GetSummary() is used to determine the summary by product, opinions and
    general discussion points.

    Parameters
    ----------
    Filename : path to the file to assess.
    SummaryType : the summary type, aimed at identifying products (NNP),
        opinions (JJ) and general discussion points (NN).

    Returns
    -------
    Description.
    """
    # read file
    file = open(Filename, 'r')
    Audio_Text = file.read()
    file.close()

    # confirm section
    if Section == 'Q & A':
        Audio_Text = Get_QA(Audio_Text)
    elif Section == 'Exec Speech':
        Audio_Text = Get_Speech(Audio_Text)

    # *********** word tokens ***********#
    # word_tokenize based on the text (unit is the whole text)
    words = word_tokenize(Audio_Text)
    # test whether a POS tag matches the requested summary type (first three characters)
    is_noun = lambda pos: pos[:3] == SummaryType
    # nouns = pd.Series([word for (word, pos) in nltk.pos_tag(words) if is_noun(pos)]).value_counts()
    # nltk.pos_tag(words): part-of-speech tagging, i.e. word classes
    nouns = pd.Series(
        [word for (word, pos) in nltk.pos_tag(words) if is_noun(pos)])

    # clean
    temp = pd.Series(nouns).str.replace(r'\W', ' ')
    temp = temp.replace(' ', np.nan).dropna()
    temp = " ".join(temp)  # connect nouns
    # tokenize nouns
    Nouns_Cleaned = word_tokenize(temp)
    # convert to lower case
    Nouns_Cleaned = [w.lower() for w in Nouns_Cleaned]

    # get standard stop words. Review final results for other additional stop words.
    stop_words = stopwords.words('english')
    custom_words = ['']
    stop_words.extend(custom_words)
    # drop stopwords
    Nouns_Cleaned = [i for i in Nouns_Cleaned if i not in stop_words]
    # lemmatize words
    nouns_lemmatized = [
        WordNetLemmatizer().lemmatize(w) for w in Nouns_Cleaned
    ]

    # *********** find person names ***********#
    # word_tokenize based on sent_tokenize (unit is each sentence in the text)
    tokenized_sents = [
        word_tokenize(sent) for sent in sent_tokenize(Audio_Text)
    ]

    # 1. Stanford Method
    # (Note: this needs to be converted to an API version and/or a cloud version investigated.)
    # java_path = 'C:/Program Files/Java/jdk-14.0.1/bin/java.exe'
    # java_path = '/Library/Java/JavaVirtualMachines/jdk-11.0.2.jdk/Contents/Home/bin/java'
    java_path = '/usr/lib/jvm/java-11-openjdk-amd64'
    os.environ['JAVAHOME'] = java_path
    # nltk.internals.config_java('C:/Program Files/Java/jdk-14.0.1/bin/java.exe')
    # nltk.internals.config_java('/Library/Java/JavaVirtualMachines/jdk-11.0.2.jdk/Contents/Home/bin/java')
    nltk.internals.config_java('/usr/lib/jvm/java-11-openjdk-amd64/bin/java')
    # st = StanfordNERTagger('stanford-ner/stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz',
    #                        'stanford-ner/stanford-ner-2017-06-09/stanford-ner.jar')
    st = StanfordNERTagger(
        '/content/drive/MyDrive/Earnings_call_NLP/stanford-ner/english.all.3class.distsim.crf.ser.gz',
        '/content/drive/MyDrive/Earnings_call_NLP/stanford-ner/stanford-ner-2017-06-09/stanford-ner.jar'
    )
    tags = st.tag_sents(tokenized_sents)  # e.g. ('With', 'O'), ('that', 'O'), ('Mike', 'PERSON')

    # get person names list
    names_stanford = []
    for tag in tags:
        for content in tag:
            if content[1] == 'PERSON':
                names_stanford.extend(content)  # e.g. 'Mike', 'PERSON', 'Spencer', 'PERSON'
    # keep names and remove the 'PERSON' tag
    names_stanford = [i for i in names_stanford if i not in ['PERSON']]

    # 2. WordNet Method
    person_list = []
    names_wordnet = person_list

    def get_human_names(text):
        tokens = nltk.tokenize.word_tokenize(text)
        # tag
        pos = nltk.pos_tag(tokens)
        sentt = nltk.ne_chunk(pos, binary=False)
        person = []
        name = ""
        for subtree in sentt.subtrees(filter=lambda t: t.label() == 'PERSON'):
            for leaf in subtree.leaves():
                person.append(leaf[0])
            if len(person) > 1:  # avoid grabbing lone surnames
                for part in person:
                    name += part + ' '
                if name[:-1] not in person_list:
                    person_list.append(name[:-1])
                name = ''
            person = []
        # print(person_list)

    names = get_human_names(Audio_Text)
    for person in person_list:
        person_split = person.split(" ")
        for name in person_split:
            if wordnet.synsets(name):
                if name in person:
                    names_wordnet.remove(person)
                    break
    names_wordnet = [word_tokenize(w) for w in names_wordnet]
    names_wordnet = [item for sublist in names_wordnet for item in sublist]

    # *********** remove names ***********#
    # remove duplicates
    names_stanford = list(dict.fromkeys(names_stanford))
    names_wordnet = list(dict.fromkeys(names_wordnet))
    # convert to lower case
    names_stanford = [w.lower() for w in names_stanford]
    names_wordnet = [w.lower() for w in names_wordnet]
    # names found by both methods
    common_names = [i for i in names_stanford if i in names_wordnet]
    # remove names from tokens
    Nouns_Cleaned = [i for i in Nouns_Cleaned if i not in common_names]
    # clean to remove any orphaned words split as part of special character removals
    Nouns_Cleaned = [i for i in Nouns_Cleaned if len(i) > 2]
    # count frequency
    Description = pd.Series(Nouns_Cleaned).value_counts()
    return Description
from nltk.tag.stanford import StanfordNERTagger
from nltk.tokenize import word_tokenize
import os

# point NLTK at the local Java installation
java_path = r"C:\Program Files (x86)\Java\jre1.8.0_241\bin\java.exe"
os.environ['JAVAHOME'] = java_path

english_tagger = StanfordNERTagger(
    'stanford-ner-2018-10-16\\classifiers\\english.all.3class.distsim.crf.ser.gz',
    'stanford-ner-2018-10-16\\stanford-ner.jar')

infile = open("sample.txt", "r")
my_list = [i.split('\t')[0] for i in infile.readlines()]
print(my_list)

list2 = list()
for text in my_list:
    list2.append(word_tokenize(text))
print(list2)

op = english_tagger.tag_sents(list2)
print(op)
class StanfordNer():
    def __init__(self) -> None:
        jar = './my_nltk/stanford-ner.jar'
        model = './my_nltk/english.all.3class.distsim.crf.ser.gz'
        # Prepare NER tagger with English model
        self._ner = StanfordNERTagger(model, jar, encoding='utf8')

    def recognize_file(self, input_filename: str, output_filename: str,
                       token_id=0) -> None:
        with open(input_filename, mode="r") as input, \
                open(output_filename, mode="w") as output:
            for line in input:
                # Tokenize: split sentence into words
                words = nltk.word_tokenize(line)
                print(self._ner.tag(words))

    def recognize(self, input: TextIO, writer: csv.DictWriter) -> None:
        lines = ''.join(input.readlines())
        tokens = [
            nltk.word_tokenize(sentence)
            for sentence in nltk.sent_tokenize(lines)
        ]
        name_entities = self._ner.tag_sents(tokens)
        text_possition = 0
        for sentence in name_entities:
            current_ne_type = None
            current_len = 0
            for token, ne_type in sentence:
                text_possition = lines.find(token, text_possition)
                if ne_type != 'O':
                    if current_ne_type == ne_type and lines[end:text_possition].isspace():
                        # extend the current entity span
                        end = text_possition + len(token)
                        current_len += 1
                    else:
                        if current_len > 0:
                            # Write current text
                            writer.writerow({
                                "start": start,
                                "end": end,
                                "text": lines[start:end],
                                "type": current_ne_type,
                                "token_len": current_len
                            })
                        # Update stats
                        current_ne_type = ne_type
                        current_len = 1
                        start = text_possition
                        end = start + len(token)
                else:
                    if current_len > 0:
                        # Write current text
                        writer.writerow({
                            "start": start,
                            "end": end,
                            "text": lines[start:end],
                            "type": current_ne_type,
                            "token_len": current_len
                        })
                    # Update stats
                    current_ne_type = None
                    current_len = 0
                text_possition += len(token)
        # if len(result) > 3:
        #     bestSentenceText = bestSentenceText + " " + article['sentences'][result[3][0]]  #######
        # if len(result) > 4:
        #     bestSentenceText = bestSentenceText + " " + article['sentences'][result[4][0]]  #######
        allQuestionText.append(qa['question'])  # saving questions too for later usage

# printing out retrieval accuracy;
# not used much, but can indicate the theoretical accuracy limit of the final QA system
print("The retrieval accuracy on test set is",
      (correctSentence / float(totalQuestions)))

# Now computing NER and other tags (like POS if needed)
print("Computing NER start at:", ctime())
NER_tagged = stanford_NER_tagger.tag_sents(allBestSentences)
print("NER Time 1:", ctime())
NER_tagged2 = stanford_NER_tagger.tag_sents(allSecondBestSentences)
print("NER Time 2:", ctime())
print("NER Tagging Done, Now doing POS tagging")
POS_taggedAnswers = []
# POS_taggedAnswers = stanford_POS_tagger.tag_sents(allBestSentencesText)  # maybe needed for the 3rd answer ranking rule
print("POS answer tagging Done")
print("POS answer Time:", ctime())
POS_taggedQuestions = []
# POS_taggedQuestions = stanford_POS_tagger.tag_sents(allQuestionText)  # maybe needed for the 3rd answer ranking rule
print("POS question tagging Done")
print("POS question Time:", ctime())

# saving the computed NER tags and sentences
class StanfordNERTaggerExtractor(object):
    """Extracts named entities with the Stanford NER tagger."""

    def __init__(self):
        self.st = StanfordNERTagger(
            'intent_class_models/stanford-jars/english.all.3class.distsim.crf.ser.gz',
            'intent_class_models/stanford-jars/stanford-ner.jar')
        # self.st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz',
        #                             'stanford-ner.jar')

    def tag_text_single(self, text):
        '''
        Tag a single piece of text, treated as one sentence.
        :param text:
        :return: list of (word, tag) tuples
        '''
        # assert type(text) == str
        sents = self.st.tag(nltk.word_tokenize(text))
        return sents

    def identify_NER_tags_single(self, text_tag, tag_to_find):
        '''
        Collect the phrases carrying a given tag, merging consecutive words.
        :param text_tag: tagged text
        :param tag_to_find:
        :return: unique tagged phrases
        '''
        tag_strs = []
        prev_wrd_tag = False
        for wrd, tag in text_tag:
            if tag == tag_to_find:
                if not prev_wrd_tag:
                    tag_strs.append(wrd)
                else:
                    prev_wrd = tag_strs.pop()
                    new_wrd = prev_wrd + ' ' + wrd
                    tag_strs.append(new_wrd)
                prev_wrd_tag = True
            else:
                prev_wrd_tag = False
        tags_final = []
        for wrd in tag_strs:
            if wrd not in tags_final:
                tags_final.append(wrd)
        return tags_final

    def tag_text_multi(self, text):
        '''Split the text into sentences and tag them all in one call.'''
        tokenized_sents = [
            nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)
        ]
        return self.st.tag_sents(tokenized_sents)

    def identify_NER_tags_multi(self, text_tag, tag_to_find):
        '''Collect unique tagged phrases across several tagged sentences.'''
        tag_strs = []
        for sent_tag in text_tag:
            for wrd in self.identify_NER_tags_single(sent_tag, tag_to_find):
                if wrd not in tag_strs:
                    tag_strs.append(wrd)
        return tag_strs

    def tag_text_multi_from_single(self, ner_tags):
        '''
        Convert the tags of one huge single text into sentence-based tags.
        This is done because tagging sentence-wise is slow, so we tag the
        entire text and split it afterwards.
        '''
        sents = ''
        for wrd, _ in ner_tags:
            sents += wrd + ' '
        sent_tags = [
            nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(sents)
        ]
        cnt = 0
        final_tags = []
        for sent_ind in range(len(sent_tags)):
            sent_tag_list = []
            for wrd_ind in range(len(sent_tags[sent_ind])):
                try:
                    sent_tag_list.append(ner_tags[cnt])
                    cnt += 1
                except IndexError:
                    break
            final_tags.append(sent_tag_list)
        return final_tags
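# A minimal, hedged usage sketch for the extractor above; the sentence is made
# up and the exact output depends on the classifier shipped with your Stanford
# NER download.
if __name__ == '__main__':
    extractor = StanfordNERTaggerExtractor()
    text = 'Angela Merkel met Emmanuel Macron in Berlin. They discussed trade with Canada.'
    sent_tags = extractor.tag_text_multi(text)
    print(extractor.identify_NER_tags_multi(sent_tags, 'PERSON'))    # e.g. ['Angela Merkel', 'Emmanuel Macron']
    print(extractor.identify_NER_tags_multi(sent_tags, 'LOCATION'))  # e.g. ['Berlin', 'Canada']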