def main(feature_set, algorithm="IIS", train_sample_size=0):
    """Train and evaluate one NER-tagger model per feature in ``feature_set``.

    The main method for using the NER-tagger. Each model is trained (the
    training helper also pickles it) and then evaluated directly, skipping
    the unpickling step for efficiency. This is faster and easier than using
    EvaluateModels and BuildModels, but has fewer options.

    Keyword arguments:
    feature_set -- The list of features to train on. One model is trained
                   and evaluated per entry. Please note that every next
                   feature calls all the previous features as well; for
                   example, feature 3 also calls feature 2 and 1.
    algorithm -- The name of the algorithm to use. Must be one of
                 ["IIS", "GIS", "NaiveBayes"] (Default = IIS).
    train_sample_size -- The number of training samples to use, as an
                         integer. Only values strictly between 0 and the
                         length of the conll ned.train set shrink the
                         training data; any other value uses the full
                         ned.train set (Default = full ned.train).

    Command-line flags (parsed by the caller, not by this function):
    -h, -help -- Show this help message. Overrules any other flag.
    -a, -alg, -algorithm -- Sets ``algorithm``.
    -tss, -train_sample_size -- Sets ``train_sample_size``.
    -f, -feature -- Use only a specific feature. Overwrites
                    -test_all_features when passed later.
    -taf, -test_all_features -- Test all features one after another in
                    alphabetical order, which is useful for visualizing
                    improvement while implementing new features. Overwrites
                    the feature selection when passed later. Note that this
                    is not a parameter of this function; the caller is
                    expected to translate it into ``feature_set``.
    """
    train_data = conll.chunked_sents("ned.train")

    # Shrink the training set if a valid sample size was requested.
    if 0 < train_sample_size < len(train_data):
        # Slice the view we already have instead of asking the corpus
        # reader for the same sentences a second time.
        train_data = train_data[:train_sample_size]

    for feature in feature_set:
        # Train the model for this feature (the helper pickles it as well).
        model = Bm.train_model(feature=feature, train_data=train_data, alg=algorithm)

        # Evaluate the freshly trained model.
        Em.evaluate_model(model)
def conllned(trace=1):
    """
    Find the copula+'van' relation ('of') in the Dutch tagged training corpus
    from CoNLL 2002.

    :param trace: when truthy, every printed tuple is rendered with
        ``lcon=True, rcon=True``; otherwise both are ``False``.
    """
    from nltk.corpus import conll2002

    vnv = """
    (
    is/V|
    was/V|
    werd/V|
    wordt/V
    )
    .*
    van/Prep
    """
    VAN = re.compile(vnv, re.VERBOSE)

    # Single-argument print(...) calls produce the same output under the
    # Python 2 print statement and the Python 3 print function.
    print("")
    print("van(PER, ORG) -- raw rtuples with context:")
    print("=" * 45)

    # The context flags depend only on `trace`, so compute them once
    # instead of once per document.
    lcon = rcon = bool(trace)
    for doc in conll2002.chunked_sents('ned.train'):
        for rel in relextract('PER', 'ORG', doc, corpus='conll2002', pattern=VAN):
            print(show_raw_rtuple(rel, lcon=lcon, rcon=rcon))
def train_model(feature, train_data=conll.chunked_sents("ned.train"), alg="IIS", folder="pickles"):
    """Train a NER-tagger model and pickle it afterwards.

    Returns the trained model.

    Keyword arguments:
    feature -- Indexable pair whose first item is the feature name (used for
               reporting) and whose second item is the feature function
               handed to the chunker.
    train_data -- The chunked sentences to train on (Default = conll
                  "ned.train").
    alg -- The name of the algorithm to use. Must be one of
           ["IIS", "GIS", "NaiveBayes"] (Default = IIS). The flag to set
           this can be -a , -alg or -algorithm in the command line
    folder -- Which folder to save the pickled model(s) in
              (Default = "pickles")
    """
    timestamp_format = '%d-%m-%Y %H:%M:%S.%f'

    print()
    print("--------------------START TRAINING-----------------------")

    # Unpack the feature description: (name, function, ...).
    feature_name, feature_function = feature[0], feature[1]

    # Announce what is about to be trained and when training started.
    print("Training on", len(train_data), "samples, using", feature_name, " on algorithm", alg)
    started = dt.now()
    # %f yields microseconds; dropping the last three digits leaves milliseconds.
    print("Training start time:", started.strftime(timestamp_format)[:-3])

    model = ConsecutiveNPChunker(feature_function, train_data, algorithm=alg)

    # Report the end time and how long training took.
    finished = dt.now()
    duration = finished - started
    print("Training end time:", finished.strftime(timestamp_format)[:-3],
          "(Elapsed:", str(duration)[:-3] + ")")

    pickle_model(model=model, folder=folder)
    return model
def conllned(trace=1):
    """
    Find the copula+'van' relation ('of') in the Dutch tagged training corpus
    from CoNLL 2002.

    :param trace: when truthy, every printed tuple is rendered with
        ``lcon=True, rcon=True``; otherwise both are ``False``.
    """
    from nltk.corpus import conll2002

    vnv = """
    (
    is/V|    # 3rd sing present and
    was/V|   # past forms of the verb zijn ('be')
    werd/V|  # and also present
    wordt/V  # past of worden ('become)
    )
    .*       # followed by anything
    van/Prep # followed by van ('of')
    """
    VAN = re.compile(vnv, re.VERBOSE)

    print()
    print("Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:")
    print("=" * 45)

    # The context flags depend only on `trace`, so compute them once.
    lcon = rcon = bool(trace)
    for doc in conll2002.chunked_sents('ned.train'):
        for rel in extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN):
            # Pass the computed flags through; hard-coding True here made
            # the `trace` parameter a no-op.
            print(rtuple(rel, lcon=lcon, rcon=rcon))
def conllned(trace=1):
    """
    Find the copula+'van' relation ('of') in the Dutch tagged training corpus
    from CoNLL 2002.

    :param trace: when truthy, every printed tuple is rendered with
        ``lcon=True, rcon=True``; otherwise both are ``False``.
    """
    from nltk.corpus import conll2002

    vnv = """
    (
    is/V|    # 3rd sing present and
    was/V|   # past forms of the verb zijn ('be')
    werd/V|  # and also present
    wordt/V  # past of worden ('become)
    )
    .*       # followed by anything
    van/Prep # followed by van ('of')
    """
    VAN = re.compile(vnv, re.VERBOSE)

    # Single-argument print(...) calls produce the same output under the
    # Python 2 print statement and the Python 3 print function.
    print("")
    print("Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:")
    print("=" * 45)

    # The context flags depend only on `trace`, so compute them once.
    lcon = rcon = bool(trace)
    for doc in conll2002.chunked_sents('ned.train'):
        for rel in extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN, window=10):
            # Pass the computed flags through; hard-coding True here made
            # the `trace` parameter a no-op.
            print(show_raw_rtuple(rel, lcon=lcon, rcon=rcon))
def relation_extraction2():
    """Print every PER/ORG relation matching the copula+'van' pattern.

    Iterates over the chunked sentences of the Dutch CoNLL 2002 training
    file ("ned.train") and prints each extracted relation as a raw tuple
    with ``lcon=True, rcon=True``.
    """
    # needs POS as well as NE annotations (in Dutch)
    from nltk.corpus import conll2002

    vnv = """
    (
    is/V|    # 3rd sing present and
    was/V|   # past forms of the verm zijn (be)
    werd/V|  # and also present
    wordt/V  # past of worden (become)
    ).*      # followed by anything
    van/Prep # followed by van (of)
    """
    VAN = re.compile(vnv, re.VERBOSE)

    for doc in conll2002.chunked_sents("ned.train"):
        for r in nltk.sem.extract_rels("PER", "ORG", doc, corpus="conll2002", pattern=VAN):
            # Alternative rendering: nltk.sem.show_clause(r, relsym="VAN")
            # A single-argument print(...) behaves the same on Python 2 and 3.
            print(nltk.sem.show_raw_rtuple(r, lcon=True, rcon=True))
def conllesp():
    """
    Print the first ten de(ORG, LOC) clauses extracted from the chunked
    sentences of the Spanish CoNLL 2002 training file ("esp.train").
    """
    from nltk.corpus import conll2002

    de = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    DE = re.compile(de, re.VERBOSE)

    print()
    print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
    print("=" * 45)

    # Gather the relations of every sentence before taking the first ten.
    rels = []
    for doc in conll2002.chunked_sents('esp.train'):
        rels.extend(extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern=DE))

    for r in rels[:10]:
        print(clause(r, relsym='DE'))
    print()
def conllesp():
    """
    Show ten de(ORG, LOC) relations from the Spanish CoNLL 2002 corpus.

    All chunked sentences of "esp.train" are scanned for ORG/LOC pairs
    matching the 'de'/'del' pattern; the first ten results are printed as
    clauses with relation symbol 'DE'.
    """
    from nltk.corpus import conll2002

    pattern_source = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    de_pattern = re.compile(pattern_source, re.VERBOSE)

    heading = "Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:"
    print()
    print(heading)
    print("=" * 45)

    found = []
    for sentence in conll2002.chunked_sents('esp.train'):
        for relation in extract_rels('ORG', 'LOC', sentence, corpus='conll2002', pattern=de_pattern):
            found.append(relation)

    first_ten = found[:10]
    for relation in first_ten:
        print(clause(relation, relsym='DE'))
    print()
def relationExtraction():
    """Run the two relation-extraction examples and print the results.

    First prints every ORG/LOC relation matching the 'in' pattern in the
    IEER document set 'NYT_19980315' as raw tuples, then prints every
    PER/ORG relation matching the copula+'van' pattern in the Dutch
    CoNLL 2002 training file as clauses with relation symbol "VAN".
    """
    # Single-argument print(...) calls produce the same output under the
    # Python 2 print statement and the Python 3 print function.
    print("page 284 7.6 Relation Extraction")

    import re
    IN = re.compile(r'.*\bin\b(?!\b.+ing)')
    for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
        for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
            # Original author's note: failure on python 2.7
            print(nltk.sem.show_raw_rtuple(rel))

    from nltk.corpus import conll2002
    vnv = """
    (
    is/V|    # 3rd sing present and
    was/V|   # past forms of the verb zijn ('be')
    werd/V|  # and also present
    wordt/V  # past of worden ('become')
    )
    .*       # followed by anything
    van/Prep # followed by van ('of')
    """
    VAN = re.compile(vnv, re.VERBOSE)
    for doc in conll2002.chunked_sents('ned.train'):
        for r in nltk.sem.extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN):
            print(nltk.sem.show_clause(r, relsym="VAN"))
def conllesp():
    """
    Print the first ten de(ORG, LOC) clauses found in the chunked sentences
    of the Spanish CoNLL 2002 training file ("esp.train").
    """
    from nltk.corpus import conll2002

    de = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    DE = re.compile(de, re.VERBOSE)

    print()
    print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
    print("=" * 45)

    # Collect the relations of the whole corpus, sentence by sentence.
    collected = []
    for sentence in conll2002.chunked_sents("esp.train"):
        matches = extract_rels("ORG", "LOC", sentence, corpus="conll2002", pattern=DE)
        collected.extend(matches)

    preview = collected[:10]
    for relation in preview:
        print(show_clause(relation, relsym="DE"))
    print()
def evaluate_model(model, testdata=conll.chunked_sents("ned.testa")):
    """
    Evaluate a given model on test data, print the results and append them
    as one semicolon-separated row to Evaluation-output.txt (the file is
    created with a header row first if it does not exist yet).
    """
    time_format = '%d-%m-%Y %H:%M:%S.%f'
    results_path = "Evaluation-output.txt"

    print()
    print("-------------------START EVALUATING----------------------")

    # Tell the user when the evaluation started.
    began = dt.now()
    # %f yields microseconds; dropping the last three digits leaves milliseconds.
    start_time_formatted = began.strftime(time_format)[:-3]
    print("Evaluating on", len(testdata), "samples. Start time: ", start_time_formatted)

    # Score the model and show the result.
    score = model.evaluate(testdata)
    print(score)

    # Start the results file with its header row when it is missing.
    if not os.path.exists(results_path):
        with open(results_path, 'w') as handle:
            handle.write("Datetime;Algorithm;Feature_set;Accuracy;Precision;Recall;F_Measure \n")

    # Append this run as one row.
    with open(results_path, 'a') as handle:
        columns = [
            start_time_formatted,
            str(model._algorithm),
            str(model.tagger._featuremap.__name__),
            str(score.accuracy()),
            str(score.precision()),
            str(score.recall()),
            str(score.f_measure()),
        ]
        handle.write(";".join(columns) + "\n")

    # Tell the user when the evaluation ended and how long it took.
    finished = dt.now()
    duration = finished - began
    print("End time:", finished.strftime(time_format)[:-3],
          "(Elapsed:", str(duration)[:-3] + ")")
# `raw_input` only exists on Python 2; fall back to `input` on Python 3 so
# the run mode can be read on either version.
try:
    read_line = raw_input
except NameError:
    read_line = input
mode = read_line()

# Pick the corpus files for the chosen language: 'dev' mode evaluates on the
# "testa" file, every other mode on "testb". An unrecognised language
# leaves both names empty.
train_file, test_file = '', ''
if language == 'spanish':
    train_file = 'esp.train'
    test_file = 'esp.testa' if mode == 'dev' else 'esp.testb'
elif language == 'dutch':
    train_file = 'ned.train'
    test_file = 'ned.testa' if mode == 'dev' else 'ned.testb'

stemmer = SnowballStemmer(language)

chunked = [nltk.chunk.tree2conlltags(tree) for tree in conll2002.chunked_sents(fileids = train_file)]
# Single-argument print(...) behaves the same on Python 2 and 3.
print('Generating training set')
train_set = []
for chunk in chunked:
    for i, data in enumerate(chunk):
        # Neighbouring entries; an empty triple stands in at sentence edges.
        prev = chunk[i - 1] if i > 0 else ('', '', '')
        # Named `following` rather than `next` to avoid shadowing the builtin.
        following = chunk[i + 1] if i < len(chunk) - 1 else ('', '', '')
        feature = generate_feature(data, prev, following, i, len(chunk), stemmer)
        # The last field of the entry is used as the label.
        train_set.append((feature, data[-1]))

chunked = [nltk.chunk.tree2conlltags(tree) for tree in conll2002.chunked_sents(fileids = test_file)]
print('Generating test set')
import pickle
from nltk.corpus import conll2002 as conll
import custom_chunker  # presumably needed so pickle can resolve the chunker class -- verify

# Script that loads (unpickles) and evaluates models. Evaluation includes
# identification of the model/feature set, precision, recall, and F-measure
# for each model. Prints the evaluation, and exports it to
# Evaluation-output.txt, where notes can be added as well.

# NOTE: unpickling executes code embedded in the file; only load pickles
# that come from a trusted source.
with open("Bayestbigfulltest4", "rb") as model_file:
    ner = pickle.load(model_file)

chunksent = conll.chunked_sents("ned.testa")  # [1000:1500]
metrics = ner.evaluate(chunksent)

# stats
guess = metrics.guessed()
NEinData = metrics.correct()
truePos = [v[1] for v in metrics._tp]
falsePos = metrics.incorrect()

# fetch stats about algorithm etc.
with open("infofile", "r") as infofile:
    inhoud = infofile.read()

guessed = "Chunks guessed: " + str(len(guess))
AmountinData = "Amount of NE's in data: " + str(len(NEinData))
####关系抽取#### import re IN = re.compile(r'.*\bin\b(?!\b.+ing)') for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'): for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN): print(nltk.sem.relextract.rtuple(rel)) from nltk.corpus import conll2002 vnv = """ ( is/V| # 3rd sing present and was/V| # past forms of the verb zijn ('be') werd/V| # and also present wordt/V # past of worden ('become') ) .* # followed by anything van/Prep # followed by van ('of') """ VAN = re.compile(vnv, re.VERBOSE) for doc in conll2002.chunked_sents('ned.train'): for r in nltk.sem.extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN): print(nltk.sem.relextract.rtuple(r))
# -- coding: utf-8 --
"""
Created on Fri Jun 7 11:27:05 2019

@author: sepke
"""
from nltk.corpus import conll2002 as conll
from custom_chunker import ConsecutiveNPChunker
import pickle
import features

tiny_sample = 500
# NOTE: the full training set is loaded here; `tiny_sample` is not applied.
# For a short demo/debugging run use:
# training = conll.chunked_sents("ned.train")[:tiny_sample]
training = conll.chunked_sents("ned.train")
testing = conll.chunked_sents("ned.testa")

simple_nl_NER = ConsecutiveNPChunker(features.simple_features_2, training, 'GIS')

# The context manager closes the pickle file even if dumping fails.
with open("nl-GIS3.pickle", "wb") as output:
    pickle.dump(simple_nl_NER, output)

#simple_nl_NER2 = ConsecutiveNPChunker(features.simple_features_1, training, 'GIS')
#print(simple_nl_NER2.evaluate(testing))
print(simple_nl_NER.evaluate(testing))
#!/usr/bin/python # -*- coding: utf-8 -*- from nltk.corpus import conll2000, conll2002 print(conll2000.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE for tree in conll2000.chunked_sents()[:2]: print(tree) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE print(conll2002.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE for tree in conll2002.chunked_sents()[:2]: print(tree) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE # SEMCOR from nltk.corpus import semcor print(semcor.words()) print(semcor.chunks()) print(semcor.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE print(semcor.chunk_sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE list(map(str, semcor.tagged_chunks(tag='both')[:3])) [[str(c) for c in s] for s in semcor.tagged_sents(tag='both')[:2]] # IEER from nltk.corpus import ieer ieer.fileids() # doctest: +NORMALIZE_WHITESPACE docs = ieer.parsed_docs('APW_19980314') print(docs[0]) print(docs[0].docno) print(docs[0].doctype) print(docs[0].date_time)
from custom_chunker import ConsecutiveNPChunker
from features import features_simple_1
import pickle
from nltk.corpus import conll2002 as conll
import custom_chunker

tiny_sample = 150
# training = conll.chunked_sents("ned.train")  # Train with full dataset
training = conll.chunked_sents("ned.train")[:tiny_sample]  # SHORT DATASET: FOR DEMO/DEBUGGING ONLY!
testing = conll.chunked_sents("ned.testa")

simple_nl_NER = ConsecutiveNPChunker(features_simple_1, training)

# Bare look-ups of the first sentence of each set; the results are discarded.
conll.chunked_sents("ned.train")[0]
conll.chunked_sents("ned.testa")[0]

print(custom_chunker.alg,custom_chunker.featurelength,len(testing),len(training))

# Human-readable name of the algorithm the chunker module was configured with.
if custom_chunker.alg == "NaiveBayes":
    algo = "NaiveBayes Algorithm"
else:
    algo = "MaxEnt " + custom_chunker.alg + " Algorithm"

# Describe this run. The counts were previously attached to the wrong
# labels (test count reported as "train sentences" and vice versa).
filestring = (algo + "\n" + str(custom_chunker.featurelength) + " features, "
              + str(len(training)) + " train sentences, "
              + str(len(testing)) + " testing")
# The context manager closes the info file even if writing fails.
with open("infofile", "w") as infofile:
    infofile.write(filestring)

print(simple_nl_NER.evaluate(testing))
simple_nl_NER.show_most_informative_features(20)

#pickling: (don't forget to change the name if you do a diff test!!! (also in model_test))
# NOTE(review): the code that uses and closes this handle is presumably
# beyond this view -- the statement is left exactly as it was.
output = open("best.pickle", "wb")
from nltk.corpus import conll2002 # Language-independent named entity recognition print(conll2002.chunked_sents()[0]) from nltk.corpus import ieer # XML documents without POS tags print(ieer.raw('APW_19980424'))
#!/usr/bin/python # -*- coding: utf-8 -*- # NAMED ENTITIES from nltk.corpus import ieer docs = ieer.parsed_docs('NYT_19980315') tree = docs[1].text print(tree) # doctest: +ELLIPSIS from nltk.corpus import conll2002 for doc in conll2002.chunked_sents('ned.train')[27]: print(doc) from nltk.sem import relextract pairs = relextract.tree2semi_rel(tree) for s, tree in pairs[18:22]: print('("...%s", %s)' % (" ".join(s[-5:]), tree)) reldicts = relextract.semi_rel2reldict(pairs) for k, v in sorted(reldicts[0].items()): print(k, '=>', v) # doctest: +ELLIPSIS for r in reldicts[18:20]: print('=' * 20) print(r['subjtext']) print(r['filler']) print(r['objtext']) import re IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
import nltk
from nltk.corpus import conll2002
# Print each element of the chunked sentence at index 25 of the Dutch
# CoNLL 2002 training file.
for documents in conll2002.chunked_sents('ned.train')[25]:
    print(documents)
# DO NOT MODIFY import pickle ner = pickle.load(open("best.pickle", "rb")) from nltk.corpus import conll2002 as conll # Usage 1: parse a list of sentences (with POS tags) tagzinnen = conll.tagged_sents("ned.train")[1000:1050] result = ner.parse_sents(tagzinnen) # Usage 2: self-evaluate (on chunked sentences) chunkzinnen = conll.chunked_sents("ned.testa")[1000:1500] print(ner.evaluate(chunkzinnen))
# Alternative, stricter pattern:
#IN = re.compile(r'.*\bin\b(?!\b.+ing)')
IN = re.compile(r'.*\bin\b')

# Print every ORG/LOC relation matching the 'in' pattern in the IEER
# document set 'NYT_19980315', prefixed with the index of its document.
for i, doc in enumerate(nltk.corpus.ieer.parsed_docs('NYT_19980315')):
    for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
        # Formatting into one string keeps the "index<space>tuple" output
        # identical under Python 2 and Python 3.
        print("%s %s" % (i, nltk.sem.relextract.show_raw_rtuple(rel)))

# Print every PER/ORG relation matching the copula+'van' pattern in the
# Dutch CoNLL 2002 training file.
from nltk.corpus import conll2002
vnv = """
(
is/V|    # 3rd sing present and
was/V|   # past forms of the verb zijn ('be')
werd/V|  # and also present
wordt/V  # past of worden ('become)
)
.*       # followed by anything
van/Prep # followed by van ('of')
"""
VAN = re.compile(vnv, re.VERBOSE)
for doc in conll2002.chunked_sents('ned.train'):
    for r in nltk.sem.extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN):
        # Alternative rendering: nltk.sem.relextract.show_clause(r, relsym="VAN")
        # A single-argument print(...) behaves the same on Python 2 and 3.
        print(nltk.sem.relextract.show_raw_rtuple(r, lcon=True, rcon=True))
def sent2labels(sent):
    """Return the ``_label`` of every element of ``sent`` that is not a tuple.

    Elements that are plain tuples are skipped.
    """
    labels = []
    for i in range(len(sent)):
        # NOTE(review): non-tuple elements are presumably chunk subtrees -- confirm.
        if type(sent[i]) != tuple:
            label = sent[i]._label
            labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the first field of each (token, postag, label) triple in ``sent``."""
    return [token for token, postag, label in sent]

etr = conll2002.chunked_sents('esp.train') # In Spanish
eta = conll2002.chunked_sents('esp.testa') # In Spanish
etb = conll2002.chunked_sents('esp.testb') # In Spanish
dtr = conll2002.chunked_sents('ned.train') # In Dutch
dta = conll2002.chunked_sents('ned.testa') # In Dutch
dtb = conll2002.chunked_sents('ned.testb') # In Dutch

# Train on the Spanish training file and test on the Spanish "testb" file.
train_sents = etr
test_sents = etb

# Compute the features of every sentence, then flatten the per-sentence
# lists into one list of items.
X_train = [sent2features(s) for s in train_sents]
X_train = [item for sublist in X_train for item in sublist]
# normalizing the values of x:
for index in range(len(X_train[0])):
    # Mean of the value at position `index` across all rows.
    mean = np.mean(np.array([row[index] for row in X_train]))