def tree2brackets(tree):
    """Render a chunk tree as a flat string with bracketed chunks.

    The tree is flattened with ``tree2conlltags``; each resulting item is
    indexed as ``item[0]`` (the word) and ``item[2]`` (the IOB tag, e.g.
    ``'B-NP'``, ``'I-NP'`` or ``'O'``).  A chunk is written as
    ``[word word TYPE]``; words outside any chunk are written as they are.

    Parameters
    ----------
    tree: chunk tree accepted by ``tree2conlltags``

    Returns
    -------
    str, the bracketed rendering with surrounding whitespace stripped
    """
    pieces = []    # output fragments, joined once at the end
    open_tag = ''  # type of the chunk currently open, '' when outside a chunk
    for item in tree2conlltags(tree):
        iob_prefix = item[2][0]
        # A 'B' or 'O' tag ends whatever chunk was open before this token.
        if iob_prefix in {'B', 'O'} and open_tag:
            pieces.append(open_tag + '] ')
            open_tag = ''
        if iob_prefix == 'B':
            open_tag = item[2].split('-')[1]
            pieces.append('[')
        pieces.append(item[0] + ' ')
    # Close a chunk that runs up to the last token.
    if open_tag:
        pieces.append(open_tag + '] ')
    return ''.join(pieces).strip()
def generate_candidate(texts, method='word', remove_punctuation=False):
    """
    Generate word candidate from given string

    Parameters
    ----------
    texts: str, input text string
    method: str, method to extract candidate words, either 'word' or 'phrase'
    remove_punctuation: bool, if True every match of ``punct_re`` in a
        sentence is replaced by a space before the sentence is tokenized

    Returns
    -------
    candidates: list, list of candidate words (or phrases for
        method='phrase'); empty when ``method`` is not recognised
    """
    # Reject an unknown method before doing the expensive tokenize/tag pass.
    if method not in ('word', 'phrase'):
        print("Use either 'word' or 'phrase' in method")
        return []

    # tokenize texts to list of sentences of lower-cased words
    tokenized = []
    for sentence in sent_tokenize(texts):
        if remove_punctuation:
            sentence = punct_re.sub(' ', sentence)  # remove punctuation
        tokenized.append([token.lower() for token in word_tokenize(sentence)])
    tagged_sents = pos_tag_sents(tokenized)  # POS tagging

    candidates = []
    if method == 'word':
        # Keep adjectives and nouns that are not stop words.
        wanted_tags = {'JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'}
        for word, tag in chain.from_iterable(tagged_sents):
            # Words are already lower-cased at this point.
            if tag in wanted_tags and word not in stop_words:
                candidates.append(word)
    else:
        grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
        chunker = RegexpParser(grammar)
        all_tag = chain.from_iterable(
            tree2conlltags(chunker.parse(sent)) for sent in tagged_sents
        )
        # Consecutive tokens whose chunk tag is not 'O' form one candidate.
        for in_chunk, group in groupby(all_tag, lambda item: item[2] != 'O'):
            if not in_chunk:
                continue
            candidate = ' '.join(word for (word, pos, chunk) in group)
            if candidate not in stop_words:
                candidates.append(candidate)
    return candidates
def extract_keyphrases(self, document):
    """
    Yield the lower-cased key phrases found in a document.

    ``document`` is iterated as groups of sentences.  Every sentence is
    passed through ``self.normalize``; sentences that come back empty are
    skipped.  The rest are parsed with ``self.chunker``, flattened with
    ``tree2conlltags``, and each run of consecutive tokens whose chunk tag
    is not 'O' is joined with spaces and yielded as one phrase.
    """
    def inside_chunk(term):
        return term[-1] != 'O'

    for sentence_group in document:
        for raw_sentence in sentence_group:
            normalized = self.normalize(raw_sentence)
            if normalized:
                tagged = tree2conlltags(self.chunker.parse(normalized))
                for is_chunk, members in groupby(tagged, inside_chunk):
                    if is_chunk:
                        words = [word for word, pos, chunk in members]
                        yield " ".join(words).lower()
def extract_candidate_chunks(sents, grammar=GRAMMAR, tagged=False, **kwargs):
    """
    Extracts key chunks based on a grammar for a list of tokenized sentences.
    If the sentences are already tokenized and tagged, pass in: tagged=True

    ``kwargs`` are forwarded to ``Normalizer``.
    """
    normalizer = Normalizer(**kwargs)
    chunker = RegexpParser(grammar)

    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.wordpunct_tokenize(sent))

        # Parse with the chunker if we have a tagged sentence
        if not sent:
            continue
        chunks = tree2conlltags(chunker.parse(sent))

        # Extract candidate phrases from our parsed chunks.  The key function
        # takes the whole (word, pos, chunk) triple: tuple parameters in a
        # lambda signature are not valid syntax on Python 3.
        chunks = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(chunks, lambda term: term[2] != 'O')
            if key
        ]
        # NOTE(review): nothing visible here consumes `chunks` or
        # `normalizer`; confirm whether the rest of the loop body is missing.
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import tree2conllstr, conllstr2tree, conlltags2tree, tree2conlltags
import nltk

# Demo: tokenize and POS-tag one sentence, run NLTK's named-entity chunker
# on it, and print the result first as a tree and then in the flattened
# form returned by tree2conlltags.
text = "Fly me from Seattle to Tampa"
tokens = word_tokenize(text)
tagged_tokens = pos_tag(tokens)
ner_tree = ne_chunk(tagged_tokens)
print(ner_tree)
iob_tagged = tree2conlltags(ner_tree)
print(iob_tagged)
def preprocess(sent):
    """Tokenize ``sent`` with NLTK and return the POS-tagged tokens."""
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent


# ex1, ex2, cp and nlp are defined outside this section.
sent1 = preprocess(ex1)
sent2 = preprocess(ex2)
# Chunk both tagged sentences with the parser `cp` and show the trees.
cs1 = cp.parse(sent1)
cs2 = cp.parse(sent2)
print(cs1)
print(cs2)
# Flattened form of each chunk tree.
iob_tagged1 = tree2conlltags(cs1)
pprint(iob_tagged1)
iob_tagged2 = tree2conlltags(cs2)
pprint(iob_tagged2)
# Same sentences through NLTK's named-entity chunker.
ne_tree1 = nltk.ne_chunk(pos_tag(word_tokenize(ex1)))
print(ne_tree1)
ne_tree2 = nltk.ne_chunk(pos_tag(word_tokenize(ex2)))
print(ne_tree2)
# Entities reported by `nlp` for the first sentence, as (text, label) pairs.
doc1 = nlp(ex1)
print('Named Entities for scentence1:')
pprint([(X.text, X.label_) for X in doc1.ents])
def chunkparser(self, pattern='NP: {<DT>?<JJ>*<NN>}'):
    """
    Chunk ``self.sent`` with a regexp grammar and keep the flattened result.

    A ``nltk.RegexpParser`` built from ``pattern`` parses ``self.sent``; the
    tree is converted with ``tree2conlltags`` and stored on
    ``self.iob_tagged``.  Nothing is returned.
    """
    parse_tree = nltk.RegexpParser(pattern).parse(self.sent)
    self.iob_tagged = tree2conlltags(parse_tree)
def fn_preprocess(art):
    """Tokenize ``art`` with NLTK and return the POS-tagged tokens."""
    art = nltk.word_tokenize(art)
    art = nltk.pos_tag(art)
    return art


# `article` is defined outside this section.
art_processed = fn_preprocess(article)
# NOTE(review): `results` is only referenced by the disabled code below.
results = ne_chunk(art_processed)
# for x in str(results).split('\n'):
#     if '/NN' in x:
#         print(x)

# Chunk the tagged article with a single NP rule.
pattern = 'NP: {<DT>?<JJ>*<NN>}'
cp = nltk.RegexpParser(pattern)
cs = cp.parse(art_processed)
# print(cs)
iob_tagged = tree2conlltags(cs)
# pprint(iob_tagged)

# Collect the third field (chunk tag) of every item.
# NOTE(review): no tag is filtered out here, so the length printed below is
# the number of items in iob_tagged, 'O' tags included - confirm that this
# is the intended "named entities" count.
namedEntities = []
for word, pos, ner in iob_tagged:
    namedEntities.append(ner)
    # print(word, pos, ner)
print('Named Entites in Document')
print(len(namedEntities))
# Rebuild sentences from the rows of `tsvin` and turn each finished sentence
# into a feature sequence (X_test_final) with its label list (y_test_final).
# Each row is split on tabs: field 0 is appended to the current sentence,
# field 1 to its labels.  A row whose first field is empty ends the sentence.
sent = ""
labels = []
# try:
for word in tsvin:
    word = word.split("\t")
    word = [w.replace("\n", "") for w in word]
    if word[0] == '':
        # End of sentence: split the accumulated text back into non-empty,
        # stripped tokens.
        splitted = sent.split(" ")
        splitted = [str.strip(w) for w in splitted]
        # splitted = [re.sub('[^A-Za-z0-9]+', '', w) for w in splitted]
        splitted = [w for w in splitted if len(w) >= 1]
        # print splitted
        X_test_final.append(
            sent2features((tree2conlltags(ne_chunk(pos_tag(splitted))))))
        y_test_final.append(labels)
        # Start a fresh sentence.
        sent = ""
        labels = []
    else:
        # if len(word[0].split(" ")) > 1:
        #     print word[0].split(" ")
        sent = sent + " " + str.strip(word[0])
        labels.append(word[1])
# NOTE(review): a sentence is only emitted when an empty-first-field row is
# seen; if the input does not end with such a row the last sentence is never
# appended - confirm against the data format.
# except:
#     print
# with open(CONST_WIKI_ALL,'rb') as tsvin, open('new.csv', 'wb') as csvout:
#     tsvin = csv.reader(tsvin, delimiter='\t')
#     for word in tsvin:
#         print word
def _strip_leading_determiners(phrase):
    """Drop a leading 'the ', 'this ', 'no ', 'an ', 'a ' (each tried once, in that order)."""
    for determiner in ("the ", "this ", "no ", "an ", "a "):
        if phrase.startswith(determiner):
            phrase = phrase[len(determiner):]
    return phrase


def find_elements(text, full=False, trim=True, low_trim_limit=2, high_trim_limit=2000):
    """
    Collect candidate elements (words, noun phrases, entities) from ``text``.

    Three sources feed the result, all lower-cased:

    * single tokens: every NN/NNS/NNP/NNPS/PRP token when ``full`` is true,
      only PRP tokens otherwise;
    * noun-phrase chunks matching ``NP: {<DT>?<JJ>*<NN>}``, with a leading
      determiner removed;
    * entities reported by ``nlp`` whose label is PERSON, ORG, PRODUCT, LOC
      or FAC, with newlines and a leading determiner removed.

    NOTE(review): chunk and entity extraction are run for both values of
    ``full``; confirm that they were not meant for the non-full mode only.

    Parameters
    ----------
    text: str, the text to analyse
    full: bool, include all noun tokens instead of pronouns only
    trim: bool, if True count each element in the lower-cased text with
        ``my_count`` and keep those with
        ``low_trim_limit < count <= high_trim_limit``
    low_trim_limit: int, exclusive lower bound on the count
    high_trim_limit: int, inclusive upper bound on the count

    Returns
    -------
    dict mapping element -> count (all counts are 0 when ``trim`` is False).
    The dict is also pretty-printed before it is returned.
    """
    sent = nltk.pos_tag(nltk.word_tokenize(text))
    elements = dict()

    # Single-token elements.
    token_tags = {"NN", "NNS", "NNP", "NNPS", "PRP"} if full else {"PRP"}
    for word, pos in sent:
        if pos in token_tags:
            elements[word.lower()] = 0

    # Noun-phrase chunks.
    cp = nltk.RegexpParser('NP: {<DT>?<JJ>*<NN>}')
    iob_tagged = tree2conlltags(cp.parse(sent))
    chunk_words = []
    for word, _pos, chunk_tag in iob_tagged:
        # Anything but 'I-NP' ends the chunk collected so far, so two
        # adjacent chunks are stored separately instead of being merged.
        if chunk_tag != 'I-NP' and chunk_words:
            phrase = _strip_leading_determiners(" ".join(chunk_words))
            elements[phrase.rstrip()] = 0
            chunk_words = []
        if chunk_tag in ('B-NP', 'I-NP'):
            chunk_words.append(word.lower())
    # A chunk that runs up to the last token has no following tag to close it.
    if chunk_words:
        phrase = _strip_leading_determiners(" ".join(chunk_words))
        elements[phrase.rstrip()] = 0

    # Named entities.  Only a leading determiner is removed: deleting 'a ',
    # 'an ', ... anywhere in the name would also eat the end of any word
    # ending in those letters (e.g. 'america online' -> 'americonline').
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ in ('PERSON', 'ORG', 'PRODUCT', 'LOC', 'FAC'):
            name = ent.text.lower().replace('\n', '')
            elements[_strip_leading_determiners(name)] = 0

    if trim:
        text = text.lower()
        for key in elements:
            elements[key] = my_count(text, key)
        elements = {
            k: v for k, v in elements.items()
            if low_trim_limit < v <= high_trim_limit
        }
    pprint(elements)
    return elements
def __init__(self, train_sents, *args, **kwargs):
    """
    Train the tagger from chunked sentences.

    Each sentence is flattened with ``tree2conlltags`` and its
    (word, tag, chunk) triples are regrouped as ((word, tag), chunk) pairs;
    the pairs, plus any extra arguments, go to
    ``ConsecutiveNPChunkTagger.train`` and the result is kept on
    ``self.tagger``.
    """
    flattened = [tree2conlltags(tree) for tree in train_sents]
    labelled_sents = []
    for triples in flattened:
        pairs = []
        for word, pos, chunk in triples:
            pairs.append(((word, pos), chunk))
        labelled_sents.append(pairs)
    self.tagger = ConsecutiveNPChunkTagger.train(labelled_sents, *args, **kwargs)
def tag_bio(self):
    """
    Print the flattened named-entity tagging of ``self._ts_abs_word_tokens``.

    The tokens are POS-tagged, chunked with ``ne_chunk`` and converted with
    ``tree2conlltags``; the result is printed, nothing is returned.
    """
    pos_tagged = pos_tag(self._ts_abs_word_tokens)
    entity_tree = ne_chunk(pos_tagged)
    print(tree2conlltags(entity_tree))
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import tree2conlltags

# Sample text for the named-entity demo.
sentence = ''' Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very close to the Manhattan Bridge which is visible from the window. '''

# Tokenize -> POS-tag -> named-entity chunk -> flatten, one step per line.
word_tokens = word_tokenize(sentence)
pos_tagged = pos_tag(word_tokens)
entity_tree = ne_chunk(pos_tagged)
print(tree2conlltags(entity_tree))
} return list(features.values()) if __name__ == "__main__": # transformed = [list(map(lambda x: ((x[0], x[1]), x[2]), s)) for s in chunked_sents] # random.shuffle(transformed) # train_sents = transformed[:int(len(transformed) * 0.9)] # test_sents = transformed[int(len(transformed) * 0.9 + 1):] # from nltk.stem.snowball import SnowballStemmer file_path = sys.argv[1] chunked_sents = [tree2conlltags(chunk.conllstr2tree(s)) for s in open(file_path).read().strip().split("\n\n")] random.shuffle(chunked_sents) train_sents = []#chunked_sents[:int(len(chunked_sents) * 0.7)] test_sents = chunked_sents[int(len(chunked_sents) * 0.7 + 1):] ### CRF Chunker chunker = CRFChunkParser(chunked_sents=train_sents, model_file="russian_chunker.crf") print(chunker.evaluate([conlltags2tree(s) for s in test_sents])) # from nltk.tag.crf import CRFTagger # chunker = CRFTagger(feature_func=feature_detector) # chunker.set_model_file("russian_chunker.crf") # chunker.train(train_sents, "russian_chunker.crf")
for x in list1: # check if exists in unique_list or not if x not in unique_list: unique_list.append(x) # print list for x in unique_list: print(x) ##### Processing on EBS Input Data file to extract dump of unique keywords iob_tagged = [] for i in range(0, len(df['PMHD_TA004_SYS_T_DES'])): #print(i) ne_tree = ne_chunk(pos_tag(word_tokenize(df['PMHD_TA004_SYS_T_DES'][i]))) iob_tagged.append(tree2conlltags(ne_tree)) s1 = [] for i in range(0, len(iob_tagged)): s1.append([i[0] for i in iob_tagged[i]]) s2 = [] for i in range(0, len(iob_tagged)): s2.append([i[1] for i in iob_tagged[i]]) s3 = [] for i in range(0, len(iob_tagged)): s3.append([i[2] for i in iob_tagged[i]]) s01 = [] for i in range(0, len(s1)):
def chunking(sentence):
    """
    Round-trip ``sentence`` through its CoNLL tag form.

    ``sentence`` is flattened with ``tree2conlltags`` and the tree rebuilt
    from those tags with ``conlltags2tree`` is returned.
    """
    from nltk.chunk import conlltags2tree, tree2conlltags

    return conlltags2tree(tree2conlltags(sentence))
def namedEntityRecognition(pos):
    """Run ``ne_chunk`` on ``pos`` and return the tree flattened by ``tree2conlltags``."""
    return tree2conlltags(ne_chunk(pos))
def rm_breaks(text, beta):
    """
    Clean ``text`` and keep only the sentences that pass a chunk-tag filter.

    The text is lower-cased and stripped of commas, DOIs, most numbers,
    links, colons and runs of dots, hyphenated line breaks and newlines.  It
    is then tagged via ``preprocess``, chunked with a single NP rule and
    split into sentences at '.' tokens.  A sentence is kept when it contains
    the token 'hypothesis', dropped when it contains 'jstor', and otherwise
    kept when its share of 'O'-tagged tokens is at least ``beta``.

    Returns the kept tokens joined by single spaces, or an empty list when
    nothing was kept.
    """
    #Convert to lower
    text = text.lower()
    #Remove commas
    text = text.replace(',', '')
    #Remove DOIs
    text = re.sub(r'\d+\.\d+/\w+', '', text)
    text = re.sub(r'doi:*', '', text)
    #Replace 'hypothesis 1' with 'h1'
    text = text.replace('hypotheses', 'hypothesis')
    text = re.sub(r'hypothesis (?=\d+)', 'h', text)
    #Remove numbers that dont have a character immediately before them (since H0 indicates hypothesis)
    text = re.sub(r'\W+\d+', '', text)
    text = re.sub(r'\d{2,4}', '', text)
    #Replace jstor link with 'jstor' placeholder, then delete
    text = re.sub(r'https?://.+', 'jstor.', text)
    # text = re.sub(r'\S+\.jstor\.\S+', 'jstor.', text))
    text = re.sub(r'\.{2,}|:', '', text)
    text = re.sub(r'this\scontent.+', '', text)
    #Delete jstor placeholder (only when that leaves some text behind)
    check = re.sub(r'.*jstor.*', '', text)
    if check != '':
        text = re.sub(r'.*jstor.*', '', text)
    #Remove word interruptions
    text = re.sub(r'-\s*\n\s*', '', text)
    #Remove line breaks
    text = re.sub(r'\n', '', text)
    #Do NER and remove sentences with too many named entities
    sent = preprocess(text)
    pattern = 'NP: {<DT>?<JJ>*<NN>}'
    cp = nltk.RegexpParser(pattern)
    cs = cp.parse(sent)
    iob_tagged = tree2conlltags(cs)
    Owords = 0      # tokens tagged 'O' counted for the current sentence
    wordCount = 0   # tokens seen in the current sentence
    maintext = []   # tokens of all kept sentences
    holder = []     # tokens of the sentence being read
    for i in iob_tagged:
        holder.append(i[0])
        wordCount += 1
        if i[0] == '.':
            # End of sentence: decide whether to keep it.
            score = Owords / wordCount
            if 'hypothesis' in holder:
                maintext += holder
                holder.clear()
            elif 'jstor' in holder:
                Owords = 0
                wordCount = 0
                holder.clear()
                # Skips the 'O' count below for this '.' token.
                continue
            elif score >= beta:
                maintext += holder
            # Reset the per-sentence state.
            Owords = 0
            wordCount = 0
            holder.clear()
        # Runs after the reset above, so the tag of a sentence-final '.' is
        # counted towards the following sentence.
        if i[2] == 'O':
            Owords += 1
    if maintext != []:
        maintext = functools.reduce(lambda a, b: a + ' ' + b, maintext)
    return maintext
def senten_tag(sentence):
    """
    Return the flattened named-entity tagging of ``sentence``.

    The sentence is tokenized, POS-tagged and chunked with ``ne_chunk``; the
    tree is converted with ``tree2conlltags`` and returned.
    """
    word_tokens = word_tokenize(sentence)
    pos_tagged = pos_tag(word_tokens)
    return tree2conlltags(ne_chunk(pos_tagged))
if(data is None or data['message'] is None or data['message'] == ""): continue msg = str(data['message']) msg = msg.strip() if(len(msg) == 0 or english_ch.search(msg) == None): continue tokens = nltk.word_tokenize(msg) tokens = remove_blanc(tokens) tokens = remove_special(tokens) flag = "FALSE" for t in tokens: if(d.check(t)): flag = "TRUE" break x = tree2conlltags(ne_chunk(pos_tag(word_tokenize(msg)))) nerf = "N" for i in x: if(len(i) > 2 and not ("B-" in i[2] or "I-" in i[2] )): nerf = "S" break if(flag == "TRUE" or nerf == "N"): if(date[0] not in transactions_date_wise): transactions_date_wise[date[0]] = 0 transactions_date_wise[date[0]] = transactions_date_wise[date[0]] + 1 textual_transactions = textual_transactions + 1 except: continue f.close() outputfile.write("DATE #TEXTUAL_TRANSACTIONS \n")
def IOB_Tagging(t):
    """Return ``t`` flattened by ``tree2conlltags``."""
    return tree2conlltags(t)
from nltk.chunk import conlltags2tree, tree2conlltags from nltk.corpus import stopwords import os import re filepath = "Enter file path" fin = open(filepath, 'r') fout = open('out.txt', 'w' ) text = fin.read() text = re.sub(r'[^\w\s]',' ',text) sentence=sent_tokenize(text) for x in sentence: words=word_tokenize(x) tagged_pos=pos_tag(words) namedEnt = nltk.ne_chunk(tagged_pos, binary=False) ne_tagged=(tree2conlltags(namedEnt)) for ne in ne_tagged: ner=(ne[-1]) ner1=str(ner) for tag in range(3): if tag == 0: gram = ("Nametag: {(<VBP>).*?(<JJ>?<NNP>+|<NNP>+)}") if tag == 1: gram = ("Datetag: {<CD><CD|JJ><CD>}") if tag == 2: gram = ("Qualificationtag: {<NNP>+<IN.*><NNP>} ") chunkParser = nltk.RegexpParser(gram) tree = chunkParser.parse(tagged_pos) iob_tagged=(tree2conlltags(tree)) for iob in iob_tagged:
# Sample inputs; only `ts` is used below.
ts = " Agent Name david member number 45678"
ts3 = "123467 is davids member no"
ts2 = " mark and john are working at Google"
# Pair the NLTK POS tags of `ts` with a string derived from the sign of
# model.encode(...): 'p' where the value is >= 0, 'n' otherwise.
# NOTE(review): " ".join(ts) puts a space between every character of `ts`
# (ts is a str, not a token list) - confirm this is intended.
test = (nltk.pos_tag(word_tokenize(ts)), np.where(model.encode([" ".join(ts)]) >= 0, 'p', 'n').astype('|S1').tostring().decode('utf-8'))
print("POS tags output by nltk")
print(test[0])
#test = [('member',) ,('number', ),('is',), ('9860300',)]
# Tag the sample with the CRF model.
X_test = extract_features(test)
ans = fcrf.predict_single(X_test)
print(ts)
print("NER tags recognized by CRF")
print(ans)
# compare ner tags output by stanford ner, nltk and spaCy
tokenized_text = word_tokenize(ts)
ner_st = nerst.tag(tokenized_text)
print("stanford ner tags")
print(ner_st)
pos_nltk = nltk.pos_tag(tokenized_text)
print("nltk tags")
print(pos_nltk)
print(tree2conlltags(ne_chunk(pos_nltk)))
nlp = spacy.load('en_core_web_sm')
doc = nlp(ts)
print("spacy ner tags")
print([(X.text, X.label_) for X in doc.ents])
# Stop here; nothing after this point runs.
exit()
# Sample texts.  `text` is assigned twice; only the last assignment is used.
text1 = 'While in France, Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'
text2="Please advise on the options the deceased clients wife has in relation to this pension" \
      " She wishes to exercise ARF option if available "
text="Hi I was trying to register online but I was n t recognised " \
     " My France number is 4824461 " \
     "Looking to register on Pension Planet Robert Manning" \
     " but Irish Ronnie Gardner website ca n t find my details " \
     "Richard Wade "
text = 'How can I pay my car renewal'
tokenized_text = word_tokenize(text)
# Tag the tokens with the `st` and `post` taggers (defined outside this
# section) and print both results.
ner_st = st.tag(tokenized_text)
print(ner_st)
pos_st = post.tag(tokenized_text)
print(pos_st)
# NOTE(review): this exit() makes everything below unreachable.
exit()
pos_nltk = nltk.pos_tag(tokenized_text)
print(pos_nltk)
blob = TextBlob(text)
print(blob.tags)
# Run ne_chunk on each set of POS tags and compare the outputs.
print("tree stanford\n")
print("type of chunk", type(ne_chunk(pos_st)))
print("type of tree", len(tree2conlltags(ne_chunk(pos_st))))
print("tree nltk\n")
print(tree2conlltags(ne_chunk(pos_nltk)))
print("tree blob\n")
print(ne_chunk(pos_nltk))
print(tree2conlltags(ne_chunk(blob.tags)))
exit()
# Prepare and print metrics for the normal metrics
OO = prepare_for_metrics(119, chunker, data_set=test_samples, print_output=True)
y_true, predicted = prepare_for_metrics(range(len(test_samples)), chunker)
print(metrics.classification_report(y_true, predicted))
# -

# An example of a user fed definition
chunked = chunker.parse(pos_tag(word_tokenize(Def[0])))
# First subtree of the parse, i.e. the first chunk that was found.
D =list(filter(lambda x: isinstance(x, nltk.tree.Tree), chunked))[0]
' '.join([d[0] for d in D])

# Classify every paragraph of one article and preview the predictions.
art = px.DefinitionsXML('tests/latexmled_files/1501.06563.xml')
p_lst = [px.recutext_xml(p) for p in art.tag_list(tag='para')]
p_vec = count_vect.transform(p_lst)
preds = clf.predict(p_vec)
for k,p in enumerate(p_lst):
    print(k,preds[k],p[:100])
    print('------')

# Show the word and chunk tag of each token of paragraph 63.
chunk = tree2conlltags(chunker.parse(pos_tag(word_tokenize(p_lst[63]))))
for tok in chunk:
    print('{:15} {:>10} '.format(tok[0], tok[2]))

with open('../PickleJar/chunker.pickle', 'wb') as chunker_f:
    pickle.dump(chunker, chunker_f)

with open('data/vectorizer.pickle', 'wb') as token_f:
    # NOTE(review): the object to pickle was missing here (a syntax error).
    # count_vect is the vectorizer used above - confirm it is the one meant.
    pickle.dump(count_vect, token_f)
def write(filename, predictor):
    """
    Append per-token feature rows for every sentence of ``filename``.

    For each sentence returned by ``read_sentence(filename)`` the tokens and
    labels are taken from ``process_sentence`` and combined with POS tags
    (NLTK), verb lemmas (WordNet), Lancaster stems and the four lists
    returned by ``parse_consituency_tree(sentence_list, predictor)``.  One
    row per token plus one blank row is appended, tab-separated and without
    header or index, to ``<filename without '.tsv'>_feature_v1.tsv``.

    Parameters
    ----------
    filename: str, path of the input file
    predictor: object passed through to ``parse_consituency_tree``
    """
    # Built once: none of these depends on the sentence being processed.
    wordnet_lemmatizer = WordNetLemmatizer()
    lancaster = LancasterStemmer()
    to_file1 = filename.split(".tsv")[0] + "_feature_v1" + ".tsv"

    for s in read_sentence(filename):
        sentence_list, label_list = process_sentence(s)
        # Only the disabled tokenizing variant below used the merged text;
        # the call is kept in case mergeWords has side effects.
        sen = mergeWords(sentence_list)

        # POS tags
        # truple = tree2conlltags(ne_chunk(pos_tag(word_tokenize(sen))))
        # each item of truple contains word, pos, ner-label
        truple = tree2conlltags(ne_chunk(pos_tag(sentence_list)))
        pos_list = [item[1] for item in truple]

        # lemma (as verb) and stem of every word
        lemma_list = [
            wordnet_lemmatizer.lemmatize(word, pos="v") for word in sentence_list
        ]
        stem_list = [lancaster.stem(word) for word in sentence_list]

        # constituency-tree features
        (pos_parent_list, right_sublings_list,
         chunk_position, left_sublings_list) = parse_consituency_tree(
            sentence_list, predictor)

        # Append one blank row after the sentence's tokens.
        for column in (sentence_list, label_list, pos_list, pos_parent_list,
                       right_sublings_list, chunk_position, lemma_list,
                       stem_list, left_sublings_list):
            column.append(" ")

        # Column order is the order of the keys below.
        # NOTE(review): "chunk" repeats the POS column; the third field of
        # `truple` is never used - confirm whether it was meant here.
        data = {
            "word": sentence_list,
            "label": label_list,
            "pos": pos_list,
            "chunk": pos_list,
            "pos_parent": pos_parent_list,
            "right_sublings_list": right_sublings_list,
            "chunk_position": chunk_position,
            "lemma_list": lemma_list,
            "stem_list": stem_list,
            "left_sublings_list": left_sublings_list,
        }
        df = pd.DataFrame(data)
        df.to_csv(to_file1, sep='\t', index=False, header=False,
                  encoding="utf8", mode='a')
# Sample sentences mixing person names and pronouns.
sentences = [
    "John is a man. He walks",
    "John and Mary are married. They have two kids",
    "In order for Ravi to be successful, he should follow John",
    "John met Mary in Barista. She asked him to order a Pizza"
]


def gender(word):
    """Classify ``word`` with ``classifier`` using the features from ``feature``."""
    return classifier.classify(feature(word))


# For every sentence, list in order: proper nouns tagged B-PERSON or O
# (paired with their predicted gender), conjunctions and pronouns.
for sent in sentences:
    chunks = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)), binary=False)
    stack = []
    print(sent)
    items = tree2conlltags(chunks)  #iob tagging
    for item in items:
        if item[1] == 'NNP' and (item[2] == 'B-PERSON' or item[2] == 'O'):
            stack.append((item[0], gender(item[0])))
        elif item[1] == 'CC':
            stack.append(item[0])
        elif item[1] == 'PRP':
            stack.append(item[0])
    print("\t {}".format(stack))

# Values left over from the last sentence; the bare name has no effect when
# this runs as a script.
items
print(chunks)
def generate_weather_answer(self, question):
    """
    Generate weather forecast for a selected city.

    At first try to extract city from user request. If not possible, then
    generate usual answer.  Connects to openweathermap and gets forecast,
    then generates plot of temperature in Celsius and Fahrenheit; also shows
    unique weather conditions.

    Side effects: every ``*.png`` file in the current working directory is
    deleted, and the new chart is saved there as ``plot.png``.

    Parameters
    ----------
    question: str, the user request

    Returns
    -------
    str: ``self.generate_usual_answer(question)`` when no proper noun in the
    question is accepted by the API as a city; otherwise
    ``'Possible weather in the next few days: <conditions>.;plot.png'``.
    """
    # remove previous image, as it isn't needed anymore
    for i in glob.glob(os.path.join(os.getcwd(), '*.png')):
        os.remove(i)

    good_symbols_re = re.compile('[^a-zA-Z -]')
    question_cleaned = good_symbols_re.sub('', question)
    # NOTE(review): question_cleaned is never used; the tagging below works
    # on the raw question - confirm which one was intended.

    # Extract entities: every proper noun is a city candidate.
    tagged = tree2conlltags(ne_chunk(pos_tag(word_tokenize(question.title()))))
    cities = [i[0] for i in tagged if i[1] == 'NNP']

    # NOTE(review): the API key is hard-coded in the URL.
    url_template = 'http://api.openweathermap.org/data/2.5/forecast?q={0}&appid=f00cf7123615727d162770891d4fd225'

    # The first candidate the API accepts wins.  Its response is kept and
    # reused instead of requesting the same URL a second time.
    city = ''
    forecast = None
    for c in cities:
        data = requests.get(url_template.format(c)).json()
        if data['cod'] == '200':
            city = c
            forecast = data
            break

    if city == '':
        return self.generate_usual_answer(question)
    if forecast.get('message') == 'city not found':
        return "I don't know this city!"

    # Generate temperature and date lists for plotting
    date_list = []
    temp_list_c = []
    temp_list_f = []
    for reading in forecast['list']:
        kelvin = reading['main']['temp']
        date_list.append(datetime.fromtimestamp(int(reading['dt'])))
        temp_list_c.append(kelvin - 273.15)
        temp_list_f.append(kelvin * 9 / 5 - 459.67)

    # make chart
    fig, ax = plt.subplots()
    ax.plot_date(date_list, temp_list_c, '-', label='Celsius')
    ax.plot_date(date_list, temp_list_f, '-', label='Fahrenheit')
    ax.grid(True)
    plt.xticks(rotation=30)
    # Ticks every 5 degrees, from below the lowest Celsius value to above
    # the highest Fahrenheit value.
    plt.yticks(range(int(min(temp_list_c)) - 1, int(max(temp_list_f) + 1), 5))
    dtFmt = mdates.DateFormatter('%m/%d')
    ax.xaxis.set_major_formatter(dtFmt)
    plt.title('Temperature in {0}'.format(city))
    plt.legend()

    # save image, so it can be sent to user
    plt.savefig('plot.png')
    # Release the figure; pyplot otherwise keeps one alive per call.
    plt.close(fig)

    # List of possible unique weather conditions
    weather = ', '.join({i['weather'][0]['description'] for i in forecast['list']})
    return 'Possible weather in the next few days: {0}.;{1}'.format(weather, 'plot.png')
def getSyntaxInfo(sentence):
    """
    Build one feature row per whitespace-separated token of ``sentence``.

    The tokens are POS-tagged and named-entity chunked with NLTK.  Each row
    is a list with, in order:

    * index of the POS tag in ``tagset``;
    * positive, negative and objective SentiWordNet scores of the token's
      first WordNet synset (``0, 0, 0`` when it has none);
    * first character of the entity IOB tag ('B', 'I' or 'O');
    * entity type (IOB tag without its two-character prefix, '' for 'O');
    * name of the last synset reached by repeatedly following the first
      hypernym, and name of the one before it (``None`` when missing);
    * two 0/1 flags telling whether each of those names equals the token's
      own synset name;
    * 0/1 flags: starts with A-Z, consists only of A-Z, parses as a float;
    * token length.

    Raises ``ValueError`` when a POS tag is not in ``tagset``.
    """
    tags = pos_tag(sentence.split())
    ne_tagged = tree2conlltags(ne_chunk(tags))
    syntax_info = []
    caps_range = set(range(ord('A'), ord('Z') + 1, 1))

    for (word, pos), ne_entry in zip(tags, ne_tagged):
        ne_tag = ne_entry[2]
        tag_no = tagset.index(pos)

        # wn.synsets treats pos=None as "any part of speech", so one call
        # covers both the mapped and the unmapped POS tag.
        candidates = wn.synsets(word, pos=getWordnetTag(pos))
        if candidates:
            synset = candidates[0]
            senti = swn.senti_synset(synset.name())
            sentiment_score = [
                senti.pos_score(),
                senti.neg_score(),
                senti.obj_score()
            ]
        else:
            synset = None
            sentiment_score = [0, 0, 0]

        start_caps = int(ord(word[0]) in caps_range)
        allcaps = int(all(ord(c) in caps_range for c in word))
        try:
            float(word)
        except ValueError:
            is_number = 0
        else:
            is_number = 1

        iob_tag = ne_tag[0]
        ne_type = '' if ne_tag == 'O' else ne_tag[2:]

        last_two_synsets = [None, None]
        same_synset = [0, 0]
        if synset is not None:
            # Walk up the hierarchy, always taking the first hypernym.
            chain = [synset]
            parents = synset.hypernyms()
            while parents:
                chain.append(parents[0])
                parents = parents[0].hypernyms()
            own_name = synset.name()
            last_two_synsets[0] = chain[-1].name()
            same_synset[0] = int(last_two_synsets[0] == own_name)
            if len(chain) > 1:
                last_two_synsets[1] = chain[-2].name()
                same_synset[1] = int(last_two_synsets[1] == own_name)

        syntax_info.append([tag_no] + sentiment_score + [iob_tag, ne_type] +
                           last_two_synsets + same_synset +
                           [start_caps, allcaps, is_number, len(word)])
    return syntax_info
import nltk
from nltk.chunk import conlltags2tree, tree2conlltags

sentence = 'Elon and Hawking met at SpaceX last Tuesday to discuss Artificial Intelligence'

# Tokenize, POS-tag and named-entity chunk the sentence, convert the tree to
# CoNLL tags and back, then print every top-level node of the rebuilt tree.
# Any exception raised along the way is printed instead of propagating.
try:
    tokenized_sentence = nltk.word_tokenize(sentence)
    tagged_sentence = nltk.pos_tag(tokenized_sentence)
    named_entity_tree = nltk.ne_chunk(tagged_sentence)
    iob_tagged = tree2conlltags(named_entity_tree)
    ne_tree = conlltags2tree(iob_tagged)
    for i in ne_tree:
        print(i)
except Exception as e:
    print(e)
def __init__(self, train_sentences):
    """
    Train a ``BigramTagger`` on (pos tag, chunk tag) sequences.

    Every training sentence is flattened with ``tree2conlltags`` and the
    word of each triple is dropped; the tagger is stored on ``self.tagger``.
    """
    tag_sequences = []
    for tree in train_sentences:
        pairs = [(pos, chunk) for _word, pos, chunk in tree2conlltags(tree)]
        tag_sequences.append(pairs)
    self.tagger = BigramTagger(tag_sequences)
# Read the whole input and split it into sentences.
text = file.read()
sentences = sentence_tokenizer.tokenize(text)

# One dict per entity category; nothing in the visible code fills them.
persons = {}
organizations = {}
locations = {}
geopolitical_entities = {}
groups = {}
facilities = {}

# Text of the entity currently being assembled from consecutive tokens.
multi_word = ''
for sentence in sentences:
    tags = tagger.tag(tokenizer.tokenize(sentence))
    ne_tree_multiclass = multiclass_ner.parse(tags)
    iob_tagged_multiclass = tree2conlltags(ne_tree_multiclass)
    # Walk the tokens pairwise so each token can be compared with its
    # successor.  The pairing stops one short of the end, so the last token
    # only ever appears as `next_value`.
    for current, next_value in zip(iob_tagged_multiclass, iob_tagged_multiclass[1:]):
        entity, category, next_entity, next_category = current[0], current[2], next_value[0], next_value[2]
        # Entity starts here and continues on the next token.
        if 'B-' in category and next_category != 'O':
            multi_word = entity
            continue
        # Entity continues through this token and beyond.
        if 'I-' in category and next_category != 'O':
            multi_word = multi_word + ' ' + entity
            continue
        # Last token of a multi-word entity.
        if 'I-' in category:
            multi_word = multi_word + ' ' + entity
def _conll(tokens):
    """
    Return (word, entity tag) pairs for ``tokens``.

    The tokens are POS-tagged with NLTK and chunked with ``ne_chunk``; the
    tree is flattened with ``tree2conlltags`` and the first and third field
    of every item are kept.
    """
    entity_tree = ne_chunk(nltk.pos_tag(tokens))
    pairs = []
    for item in tree2conlltags(entity_tree):
        pairs.append((item[0], item[2]))
    return pairs
def __init__(self):
    """
    Train a bigram chunk tagger, backed off to a unigram tagger.

    Training data are the NP-chunked sentences of the CoNLL-2000
    'train.txt' file, flattened with ``tree2conlltags`` and reduced to
    (pos tag, chunk tag) pairs.  The tagger is stored on ``self.tagger``.
    """
    chunked = nltk.corpus.conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    tag_sequences = []
    for tree in chunked:
        tag_sequences.append(
            [(pos, chunk) for _, pos, chunk in tree2conlltags(tree)]
        )
    fallback = nltk.UnigramTagger(tag_sequences)
    self.tagger = nltk.BigramTagger(tag_sequences, backoff=fallback)
from proper_nouns.funcs.utilities import parse_ner_counts
from proper_nouns.funcs.utilities import parse_census_counts
from proper_nouns.funcs.utilities import tokenize_string

download_required_nltk_packages()
all_census_names = get_all_census_names()

# Tags that mark a person in the reference corpus.
tags = ['B-PERSON', 'I-PERSON']
# Running totals, updated by parse_census_counts / parse_ner_counts.
census = {'truth_names': 0, 'difference': 0, 'no_names': 0}
tagged = {'truth_names': 0, 'test_names': 0, 'test_minus_tagged': 0,
          'tagged_minus_test': 0, 'no_names': 0}
n = 0

corpus = read_gmb_corpus('tags')
for tagged_tokens in corpus:
    # Rebuild the plain sentence from the first field of every item and
    # tag it again with ne_chunk.
    sentence = ' '.join([iob[0] for iob in tagged_tokens])
    test_the_tokens = tokenize_string(sentence)
    ne_tree = ne_chunk(test_the_tokens)
    test_tagged_tokens = tree2conlltags(ne_tree)
    # Compare the fresh tagging with the corpus tagging.
    ner_counts = is_person_tagged(tagged_tokens, test_tagged_tokens, tags)
    parse_ner_counts(ner_counts, tagged)
    # Compare the corpus tagging with the census name list.
    census_counts = people_in_census(tagged_tokens, all_census_names, tags)
    parse_census_counts(census_counts, census)
    n += 1
    # Progress report every 2000 sentences.
    if n % 2000 == 0:
        print_intermediate_results(n, tagged, census)
# (NP September/NNP) # ,/, # due/JJ # (PP for/IN) # (NP release/NN) # (NP tomorrow/NN) # ,/, # (VP fail/VB to/TO show/VB) # (NP a/DT substantial/JJ improvement/NN) # (PP from/IN) # (NP July/NNP and/CC August/NNP) # (NP 's/POS near-record/JJ deficits/NNS) # ./.) from nltk.chunk import tree2conlltags iob_tagged = tree2conlltags(chunked_sentence) print(iob_tagged) # [ # ('Confidence', 'NN', 'B-NP'), # ('in', 'IN', 'B-PP'), # ('the', 'DT', 'B-NP'), # ('pound', 'NN', 'I-NP'), # ('is', 'VBZ', 'B-VP'), # ('widely', 'RB', 'I-VP'), # ('expected', 'VBN', 'I-VP'), # ('to', 'TO', 'I-VP'), # ('take', 'VB', 'I-VP'), # ('another', 'DT', 'B-NP'), # ('sharp', 'JJ', 'I-NP'), # ('dive', 'NN', 'I-NP'),
def __init__(self, train_sents, *args, **kwargs):
    """
    Train the classifier-based tagger from chunked sentences.

    Each sentence is flattened with ``tree2conlltags``; its
    (word, tag, chunk) triples become ((word, tag), chunk) pairs, which are
    passed with any extra arguments to ``ClassifierTagger.train``.  The
    trained tagger is stored on ``self.tagger``.
    """
    def to_pairs(triples):
        return [((word, pos), chunk) for (word, pos, chunk) in triples]

    flattened = [tree2conlltags(tree) for tree in train_sents]
    training_pairs = [to_pairs(triples) for triples in flattened]
    self.tagger = ClassifierTagger.train(training_pairs, *args, **kwargs)
def __init__(self, trainingChunkedSents):
    """
    Train a ``TrigramTagger`` on (pos tag, chunk tag) sequences.

    Every chunked sentence is flattened with ``tree2conlltags`` and the
    word of each triple is dropped; the tagger is stored on ``self.tagger``.
    """
    tagSequences = []
    for chunkedTree in trainingChunkedSents:
        pairs = []
        for _word, pos, bio in tree2conlltags(chunkedTree):
            pairs.append((pos, bio))
        tagSequences.append(pairs)
    self.tagger = TrigramTagger(tagSequences)
__author__ = 'User'
""" conll2002 is in Duch and Spanish so its not woriking well with that """
from nltk.corpus import conll2002
from nltk.chunk import tree2conlltags
import pandas as pd
from evaluate import evaluate
from mit_ie.mitie_series_ner_extractror import mitie_extract_ner_series
from stanford_ner.stanford_series_ner_extractor import stanford_extract_ner_series

# Load the corpus as rows of three fields and put them in a DataFrame.
chunked_words = tree2conlltags(conll2002.chunked_words())
df = pd.DataFrame(chunked_words, columns=['word', 'tmp', 'real_tag'])
# remove tmp col
df = df.loc[:, ["word", "real_tag"]]
# strip first two chars - "B-..." and "I-..."
# NOTE(review): assigning the result of map() to a column relies on map()
# returning a list (Python 2); on Python 3 it is a lazy iterator - confirm
# the target interpreter.
df['real_tag'] = map(lambda x: x[2:] if len(x) > 2 else x, df['real_tag'])
# testing: keep only the first 5000 rows
df = df[:5000]
df.real_tag = list(df.real_tag)
# NOTE(review): `unicode` exists on Python 2 only.
df.word = map(unicode, df.word)
# df = add_dataframe_ner_tags(corpus_df=df, ner_extractor=mitie_extract_ner_series)
#
# print('###### MIT IE NER evaluation #####')