Exemplo n.º 1
0
def main(feature_set, algorithm="IIS", train_sample_size=0):
    """The main method for using the NER-tagger.

    This method trains, pickles and evaluates the models, skipping the
    unpickling part for efficiency. This method is faster and easier than
    using EvaluateModels and BuildModels, but has fewer options.

    Use the flags -h or -help to get this help message.
    This flag will overrule any other flags.

    Keyword arguments:

    feature_set -- The list of features on which we are going to be training.
    One model is trained and evaluated per entry, in the given order.
    Please note that every next feature calls all the previous features as well.
    For example, feature 3 also calls feature 2 and 1.
    Use the flag -feature or -f to only use a specific feature.
    This flag will overwrite test_all_features when passed later in the command line.

    algorithm -- The name of the algorithm to use.
    Must be one of ["IIS", "GIS", "NaiveBayes"] (Default = IIS).
    The flag to set this can be -a or -alg or -algorithm in the command line

    train_sample_size -- The number of training samples to use as an integer.
    Must be between 0 and the length of the conll ned.train. (Default = full ned.train)
    Any value outside that open interval leaves the full training set in use.
    To set this, use flag -tss or -train_sample_size in the command line

    Command-line only (not a parameter of this function):

    test_all_features -- Boolean, whether to test all features after
    each other in alphabetical order.
    This argument can be useful when implementing multiple new features
    in between testing to visualize improvement.
    To set this, use flag -taf or -test_all_features in the command line.
    This flag will overwrite feature_set if passed later in the command line.
    """

    train_data = conll.chunked_sents("ned.train")

    # Shrink the training set if a usable sample size was requested.
    # Slice the view we already hold instead of reading the corpus again.
    if 0 < train_sample_size < len(train_data):
        train_data = train_data[:train_sample_size]

    for feature in feature_set:

        # Train model(s) and pickle them.
        model = Bm.train_model(feature=feature,
                               train_data=train_data,
                               alg=algorithm)

        # Evaluate the models
        Em.evaluate_model(model)
Exemplo n.º 2
0
def conllned(trace=1):
    """
    Find the copula+'van' relation ('of') in the Dutch tagged training corpus
    from CoNLL 2002.

    Every PER/ORG relation matched by the pattern is printed.

    trace -- when truthy (the default) the lcon and rcon options of
    show_raw_rtuple are switched on; when falsy they are switched off.
    """

    from nltk.corpus import conll2002

    vnv = """
    (
    is/V|
    was/V|
    werd/V|
    wordt/V
    )
    .*
    van/Prep
    """
    VAN = re.compile(vnv, re.VERBOSE)

    # print() calls instead of Python 2 print statements, matching the
    # Python 3 snippets in the rest of this file.
    print()
    print("van(PER, ORG) -- raw rtuples with context:")
    print("=" * 45)

    # The context switches depend only on `trace`, so set them once.
    lcon = rcon = bool(trace)
    for doc in conll2002.chunked_sents('ned.train'):
        for rel in relextract('PER', 'ORG', doc, corpus='conll2002', pattern=VAN):
            print(show_raw_rtuple(rel, lcon=lcon, rcon=rcon))
Exemplo n.º 3
0
def train_model(feature, train_data=None, alg="IIS", folder="pickles"):
    """Train a NER-tagger model and pickle it afterwards. Returns the trained model.

    Keyword arguments:

    feature -- A sequence whose first item is the feature name (used in the
    progress output) and whose second item is the feature function handed
    to ConsecutiveNPChunker.

    train_data -- The chunked sentences to train on
    (Default = conll.chunked_sents("ned.train"), loaded when the function
    is called rather than when the module is imported).

    alg -- The name of the algorithm to use.
    Must be one of ["IIS", "GIS", "NaiveBayes"] (Default = IIS).
    The flag to set this can be -a , -alg or -algorithm in the command line

    folder -- Which folder to save the pickled model(s) in (Default = "pickles")
    """

    # Resolve the default lazily so that importing this module does not
    # already require the corpus to be available.
    if train_data is None:
        train_data = conll.chunked_sents("ned.train")

    print()
    print("--------------------START TRAINING-----------------------")

    # Read all info of feature from the tuple
    feature_name = feature[0]
    feature_function = feature[1]

    # Train the model and inform the user on start time
    print("Training on", len(train_data), "samples, using",
          feature_name, " on algorithm", alg)
    start_time = dt.now()
    # [:-3] trims the microseconds printed by %f down to milliseconds
    print("Training start time:", start_time.strftime('%d-%m-%Y %H:%M:%S.%f')[:-3])
    model = ConsecutiveNPChunker(feature_function,
                                 train_data, algorithm=alg)

    # Inform the user on the elapsed and end times
    end_time = dt.now()
    elapsed = end_time - start_time
    print("Training end time:", end_time.strftime('%d-%m-%Y %H:%M:%S.%f')[:-3], "(Elapsed:", str(elapsed)[:-3] + ")")

    pickle_model(model=model, folder=folder)

    return model
Exemplo n.º 4
0
def conllned(trace=1):
    """
    Find the copula+'van' relation ('of') in the Dutch tagged training corpus
    from CoNLL 2002.

    Every PER/ORG relation matched by the pattern is printed.

    trace -- when truthy (the default) the lcon and rcon options of rtuple
    are switched on; when falsy they are switched off.
    """

    from nltk.corpus import conll2002

    vnv = """
    (
    is/V|    # 3rd sing present and
    was/V|   # past forms of the verb zijn ('be')
    werd/V|  # and also present
    wordt/V  # past of worden ('become)
    )
    .*       # followed by anything
    van/Prep # followed by van ('of')
    """
    VAN = re.compile(vnv, re.VERBOSE)

    print()
    print("Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:")
    print("=" * 45)

    # The switches depend only on `trace`, so compute them once and actually
    # pass them on (they used to be computed and then ignored).
    lcon = rcon = bool(trace)
    for doc in conll2002.chunked_sents('ned.train'):
        for rel in extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN):
            print(rtuple(rel, lcon=lcon, rcon=rcon))
def conllned(trace=1):
    """
    Find the copula+'van' relation ('of') in the Dutch tagged training corpus
    from CoNLL 2002.

    Every PER/ORG relation matched by the pattern (extracted with
    window=10) is printed.

    trace -- when truthy (the default) the lcon and rcon options of
    show_raw_rtuple are switched on; when falsy they are switched off.
    """

    from nltk.corpus import conll2002

    vnv = """
    (
    is/V|    # 3rd sing present and 
    was/V|   # past forms of the verb zijn ('be')
    werd/V|  # and also present 
    wordt/V  # past of worden ('become)
    )
    .*       # followed by anything
    van/Prep # followed by van ('of')
    """
    VAN = re.compile(vnv, re.VERBOSE)

    # print() calls instead of Python 2 print statements, matching the
    # Python 3 snippets in the rest of this file.
    print()
    print("Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:")
    print("=" * 45)

    # The switches depend only on `trace`, so compute them once and actually
    # pass them on (they used to be computed and then ignored).
    lcon = rcon = bool(trace)
    for doc in conll2002.chunked_sents('ned.train'):
        for rel in extract_rels('PER',
                                'ORG',
                                doc,
                                corpus='conll2002',
                                pattern=VAN,
                                window=10):
            print(show_raw_rtuple(rel, lcon=lcon, rcon=rcon))
Exemplo n.º 6
0
def relation_extraction2():
  """Print every PER/ORG relation in the Dutch CoNLL 2002 training file
  whose filler matches the copula + 'van' pattern.

  Takes no arguments and returns nothing; the output goes to stdout.
  """
  # needs POS as well as NE annotations (in Dutch)
  from nltk.corpus import conll2002
  vnv = """
(
is/V|       # 3rd sing present and
was/V|      # past forms of the verm zijn (be)
werd/V|     # and also present
wordt/V     # past of worden (become)
).*           # followed by anything
van/Prep      # followed by van (of)
  """
  VAN = re.compile(vnv, re.VERBOSE)
  for doc in conll2002.chunked_sents("ned.train"):
    for r in nltk.sem.extract_rels("PER", "ORG", doc,
        corpus="conll2002", pattern=VAN):
      # print() call instead of the Python 2 print statement.
      # NOTE(review): newer NLTK releases appear to expose this helper as
      # nltk.sem.rtuple -- confirm against the installed version.
      print(nltk.sem.show_raw_rtuple(r, lcon=True, rcon=True))
Exemplo n.º 7
0
def conllesp():
    """Print the first ten de(ORG, LOC) clauses found in the Spanish
    CoNLL 2002 training file.

    Relations are collected over the whole file before the first ten are
    shown. Takes no arguments and returns nothing.
    """
    from nltk.corpus import conll2002

    de_source = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    de_regex = re.compile(de_source, re.VERBOSE)

    print()
    print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
    print("=" * 45)

    # Gather the matches sentence by sentence.
    found = []
    for sentence in conll2002.chunked_sents('esp.train'):
        found.extend(
            extract_rels('ORG', 'LOC', sentence, corpus='conll2002', pattern=de_regex)
        )

    for relation in found[:10]:
        print(clause(relation, relsym='DE'))
    print()
Exemplo n.º 8
0
def conllesp():
    """Show the first ten 'de'/'del' relations between ORG and LOC entities
    in the Spanish CoNLL 2002 training file.

    All relations are extracted first; only then are ten of them printed
    as clauses with the relation symbol 'DE'.
    """
    from nltk.corpus import conll2002

    pattern_text = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    compiled = re.compile(pattern_text, re.VERBOSE)

    for line in ("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:",
                 "=" * 45):
        # The first pass through the loop also emits the leading blank line.
        if line.startswith("Spanish"):
            print()
        print(line)

    all_rels = []
    for doc in conll2002.chunked_sents('esp.train'):
        for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern=compiled):
            all_rels.append(rel)

    first_ten = all_rels[:10]
    for rel in first_ten:
        print(clause(rel, relsym='DE'))
    print()
Exemplo n.º 9
0
def relationExtraction():
    """Run two relation-extraction demos and print their results.

    1. ORG/LOC relations in the IEER file NYT_19980315 whose filler matches
       the 'in' pattern, printed with show_raw_rtuple.
    2. PER/ORG relations in the Dutch CoNLL 2002 training file whose filler
       matches the copula + 'van' pattern, printed with show_clause.

    Takes no arguments and returns nothing.
    """
    # print() calls instead of Python 2 print statements, matching the
    # Python 3 snippets in the rest of this file.
    print("page 284 7.6  Relation Extraction")
    import re
    IN = re.compile(r'.*\bin\b(?!\b.+ing)')
    for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
        for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
            # The original author noted: "failure on python 2.7".
            # NOTE(review): newer NLTK releases appear to expose this helper
            # as nltk.sem.rtuple -- confirm against the installed version.
            print(nltk.sem.show_raw_rtuple(rel))

    from nltk.corpus import conll2002
    vnv = """
        (
        is/V|    # 3rd sing present and
        was/V|   # past forms of the verb zijn ('be')
        werd/V|  # and also present
        wordt/V  # past of worden ('become')
        )
        .*       # followed by anything
        van/Prep # followed by van ('of')
        """
    VAN = re.compile(vnv, re.VERBOSE)
    for doc in conll2002.chunked_sents('ned.train'):
        for r in nltk.sem.extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN):
            print(nltk.sem.show_clause(r, relsym="VAN"))
Exemplo n.º 10
0
def conllesp():
    """Print ten sample de(ORG, LOC) clauses from the Spanish CoNLL 2002
    training file.

    The whole file is scanned for ORG/LOC relations matching the pattern;
    the first ten are rendered with show_clause and printed.
    """
    from nltk.corpus import conll2002

    spanish_de = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    spanish_de_re = re.compile(spanish_de, re.VERBOSE)

    print()
    print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
    print("=" * 45)

    collected = []
    for chunked_sentence in conll2002.chunked_sents("esp.train"):
        sentence_rels = extract_rels(
            "ORG", "LOC", chunked_sentence, corpus="conll2002", pattern=spanish_de_re
        )
        collected.extend(sentence_rels)

    for shown in collected[:10]:
        rendered = show_clause(shown, relsym="DE")
        print(rendered)
    print()
Exemplo n.º 11
0
def evaluate_model(model, testdata=None):
    """Evaluate a given model on test data and print the results.

    Keyword arguments:

    model -- The trained model. It must provide evaluate(testdata) as well as
    the attributes _algorithm and tagger._featuremap, which are logged.

    testdata -- The chunked sentences to evaluate on
    (Default = conll.chunked_sents("ned.testa"), loaded when the function
    is called rather than when the module is imported).

    Besides printing the score, one ';'-separated row (start time, algorithm,
    feature function name, accuracy, precision, recall, F-measure) is
    appended to Evaluation-output.txt in the current directory. The header
    row is written first when that file does not exist yet.
    """

    # Resolve the default lazily so that importing this module does not
    # already require the corpus to be available.
    if testdata is None:
        testdata = conll.chunked_sents("ned.testa")

    print()
    print("-------------------START EVALUATING----------------------")

    # Inform the user when the evaluation has started
    start_time = dt.now()
    # [:-3] trims the microseconds printed by %f down to milliseconds
    start_time_formatted = start_time.strftime('%d-%m-%Y %H:%M:%S.%f')[:-3]
    print("Evaluating on", len(testdata), "samples. Start time: ",
          start_time_formatted)

    # Evaluate the model and print the score
    score = model.evaluate(testdata)
    print(score)

    # Build the result row before touching the file
    row = ";".join([
        start_time_formatted,
        str(model._algorithm),
        str(model.tagger._featuremap.__name__),
        str(score.accuracy()),
        str(score.precision()),
        str(score.recall()),
        str(score.f_measure()),
    ]) + "\n"

    # Append the row, creating the file with its header if it is missing.
    # `log` is used instead of `file` to avoid shadowing a builtin name.
    output_path = "Evaluation-output.txt"
    needs_header = not os.path.exists(output_path)
    with open(output_path, 'a') as log:
        if needs_header:
            log.write(
                "Datetime;Algorithm;Feature_set;Accuracy;Precision;Recall;F_Measure \n"
            )
        log.write(row)

    # Inform the user of the elapsed and end times
    end_time = dt.now()
    elapsed = end_time - start_time
    print("End time:",
          end_time.strftime('%d-%m-%Y %H:%M:%S.%f')[:-3], "(Elapsed:",
          str(elapsed)[:-3] + ")")
Exemplo n.º 12
0
# Read the run mode from standard input: 'dev' selects the *.testa file,
# anything else the *.testb file. (input() replaces Python 2's raw_input().)
mode = input()
train_file, test_file = '', ''
if language == 'spanish':
    train_file = 'esp.train'
    test_file = 'esp.testa' if mode == 'dev' else 'esp.testb'
elif language == 'dutch':
    train_file = 'ned.train'
    test_file = 'ned.testa' if mode == 'dev' else 'ned.testb'
# For any other language both file names stay empty strings.
stemmer = SnowballStemmer(language)
chunked = [nltk.chunk.tree2conlltags(tree) for tree in conll2002.chunked_sents(fileids=train_file)]
print('Generating training set')
train_set = []
for chunk in chunked:
    for i, data in enumerate(chunk):
        # Neighbouring entries; an empty triple stands in at either end of
        # the sentence.
        # NOTE(review): `next` shadows the builtin. The name is kept because
        # the script continues beyond this excerpt and may rely on it.
        prev = chunk[i - 1] if i > 0 else ('', '', '')
        next = chunk[i + 1] if i < len(chunk) - 1 else ('', '', '')
        feature = generate_feature(data, prev, next, i, len(chunk), stemmer)
        # The last field of each entry is used as the training label
        train_set.append((feature, data[-1]))
chunked = [nltk.chunk.tree2conlltags(tree) for tree in conll2002.chunked_sents(fileids=test_file)]
print('Generating test set')
Exemplo n.º 13
0
import pickle
from nltk.corpus import conll2002 as conll
import custom_chunker

#
"""
Script that loads (unpickles) and evaluates models.
Evaluation includes identification of the model/feature set, precision, recall, and F-measure for each model.

Prints the evaluation, and exports it to Evaluation-output.txt, where notes can be added as well.
"""

# Load the pickled model. The context manager closes the file handle even
# when unpickling fails.
# NOTE(review): unpickling can execute arbitrary code -- only load model
# files that come from a trusted source.
with open("Bayestbigfulltest4", "rb") as model_file:
    ner = pickle.load(model_file)

chunksent = conll.chunked_sents("ned.testa")  # [1000:1500]

metrics = ner.evaluate(chunksent)

# stats
guess = metrics.guessed()
NEinData = metrics.correct()
# NOTE(review): reads the private _tp attribute of the score object and keeps
# the second item of each entry -- confirm it is stable across NLTK versions.
truePos = [v[1] for v in metrics._tp]
falsePos = metrics.incorrect()

# fetch stats about algorithm etc. ("inhoud" is Dutch for "contents")
with open("infofile", "r") as infofile:
    inhoud = infofile.read()

guessed = "Chunks guessed: " + str(len(guess))
AmountinData = "Amount of NE's in data: " + str(len(NEinData))
Exemplo n.º 14
0
#### Relation extraction ####
import re

# Part 1: ORG/LOC relations in the IEER file NYT_19980315 whose filler
# matches the 'in' pattern.
IN = re.compile(r'.*\bin\b(?!\b.+ing)')
ieer_docs = nltk.corpus.ieer.parsed_docs('NYT_19980315')
for doc in ieer_docs:
    ieer_rels = nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN)
    for rel in ieer_rels:
        print(nltk.sem.relextract.rtuple(rel))

# Part 2: PER/ORG relations in the Dutch CoNLL 2002 training file whose
# filler matches the copula + 'van' pattern.
from nltk.corpus import conll2002
vnv = """
(
is/V| # 3rd sing present and
was/V| # past forms of the verb zijn ('be')
werd/V| # and also present
wordt/V # past of worden ('become')
)
.* # followed by anything
van/Prep # followed by van ('of')
"""
VAN = re.compile(vnv, re.VERBOSE)
dutch_sentences = conll2002.chunked_sents('ned.train')
for doc in dutch_sentences:
    dutch_rels = nltk.sem.extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN)
    for r in dutch_rels:
        print(nltk.sem.relextract.rtuple(r))
Exemplo n.º 15
0
# -- coding: utf-8 --
"""
Created on Fri Jun  7 11:27:05 2019

@author: sepke
"""
from nltk.corpus import conll2002 as conll
from custom_chunker import ConsecutiveNPChunker
import pickle
import features

tiny_sample = 500
# Train with the full dataset. For a quick demo/debugging run, slice the
# training data instead: conll.chunked_sents("ned.train")[:tiny_sample]
training = conll.chunked_sents("ned.train")
testing = conll.chunked_sents("ned.testa")
simple_nl_NER = ConsecutiveNPChunker(features.simple_features_2, training,
                                     'GIS')

# Persist the trained model. The context manager closes the file even when
# pickling fails.
with open("nl-GIS3.pickle", "wb") as output:
    pickle.dump(simple_nl_NER, output)

print(simple_nl_NER.evaluate(testing))
Exemplo n.º 16
0
#!/usr/bin/python
# -*- coding: utf-8 -*-

from nltk.corpus import conll2000, conll2002

# CONLL: for CoNLL-2000 and then CoNLL-2002, print the sentence view followed
# by the first two chunked sentences.
for conll_corpus in (conll2000, conll2002):
    print(conll_corpus.sents())
    leading_trees = conll_corpus.chunked_sents()[:2]
    for tree in leading_trees:
        print(tree)


# SEMCOR

from nltk.corpus import semcor

semcor_words = semcor.words()
print(semcor_words)
semcor_chunks = semcor.chunks()
print(semcor_chunks)
semcor_sents = semcor.sents()
print(semcor_sents)
semcor_chunk_sents = semcor.chunk_sents()
print(semcor_chunk_sents)
# The next two results are built and then discarded (nothing is printed).
[str(chunk) for chunk in semcor.tagged_chunks(tag='both')[:3]]
[list(map(str, s)) for s in semcor.tagged_sents(tag='both')[:2]]

# IEER

from nltk.corpus import ieer

ieer.fileids()  # result is discarded
docs = ieer.parsed_docs('APW_19980314')
first_doc = docs[0]
print(first_doc)
print(first_doc.docno)
print(first_doc.doctype)
print(first_doc.date_time)
from custom_chunker import ConsecutiveNPChunker
from features import features_simple_1
import pickle
from nltk.corpus import conll2002 as conll
import custom_chunker


tiny_sample = 150
# training = conll.chunked_sents("ned.train")  # Train with full dataset
training = conll.chunked_sents("ned.train")[:tiny_sample]  # SHORT DATASET: FOR DEMO/DEBUGGING ONLY!
testing = conll.chunked_sents("ned.testa")

simple_nl_NER = ConsecutiveNPChunker(features_simple_1, training)

print(custom_chunker.alg, custom_chunker.featurelength, len(testing), len(training))

# Human-readable description of the algorithm for the info file
if custom_chunker.alg == "NaiveBayes":
    algo = "NaiveBayes Algorithm"
else:
    algo = "MaxEnt " + custom_chunker.alg + " Algorithm"

# Describe the run. The training count now carries the "train sentences"
# label and the test count the "testing" label (they used to be swapped).
filestring = (algo + "\n" + str(custom_chunker.featurelength) + " features, "
              + str(len(training)) + " train sentences, "
              + str(len(testing)) + " testing")
with open("infofile", "w") as infofile:
    infofile.write(filestring)

print(simple_nl_NER.evaluate(testing))
simple_nl_NER.show_most_informative_features(20)
#pickling: (don't forget to change the name if you do a diff test!!! (also in model_test))
output = open("best.pickle", "wb")
Exemplo n.º 18
0
from nltk.corpus import conll2002

# Language-independent named entity recognition: show the first chunked
# sentence of the corpus.
first_chunked_sentence = conll2002.chunked_sents()[0]
print(first_chunked_sentence)

from nltk.corpus import ieer

# XML documents without POS tags: show the raw text of one IEER file.
raw_ieer_text = ieer.raw('APW_19980424')
print(raw_ieer_text)


Exemplo n.º 19
0
#!/usr/bin/python
# -*- coding: utf-8 -*-

# NAMED ENTITIES
#
# Walkthrough of nltk.sem.relextract on one IEER document: print the document
# tree, convert it with tree2semi_rel / semi_rel2reldict, and print samples of
# each intermediate structure.

from nltk.corpus import ieer
# Parsed documents of the IEER file NYT_19980315; `tree` is the text of the
# second one (index 1).
docs = ieer.parsed_docs('NYT_19980315')
tree = docs[1].text
print(tree)  # doctest: +ELLIPSIS

from nltk.corpus import conll2002
# Prints each element of the chunked sentence at index 27 of the Dutch
# training file. Despite its name, `doc` is one element of that sentence.
for doc in conll2002.chunked_sents('ned.train')[27]:
    print(doc)

from nltk.sem import relextract
# Convert the IEER text tree; each entry of `pairs` is unpacked below as a
# two-item pair.
pairs = relextract.tree2semi_rel(tree)
# Show entries 18..21: the last five items of the first element joined by
# spaces, then the second element.
# NOTE: this loop rebinds the module-level name `tree`; `pairs` was already
# computed above, so the conversion itself is unaffected.
for s, tree in pairs[18:22]:
    print('("...%s", %s)' % (" ".join(s[-5:]), tree))

# Build the relation dictionaries and dump the first one, sorted by key
reldicts = relextract.semi_rel2reldict(pairs)
for k, v in sorted(reldicts[0].items()):
    print(k, '=>', v)  # doctest: +ELLIPSIS

# For entries 18 and 19 print the subject text, filler and object text,
# each group preceded by a separator line
for r in reldicts[18:20]:
    print('=' * 20)
    print(r['subjtext'])
    print(r['filler'])
    print(r['objtext'])

import re
# Pattern for the word 'in', with a negative lookahead that rejects a
# following stretch ending in 'ing' at a word boundary. Its use is not
# visible in this excerpt.
IN = re.compile(r'.*\bin\b(?!\b.+ing\b)')
Exemplo n.º 20
0
import nltk
from nltk.corpus import conll2002

# Print every element of the chunked sentence at index 25 of the Dutch
# CoNLL 2002 training file, one per line.
sentence_tree = conll2002.chunked_sents('ned.train')[25]
for documents in sentence_tree:
    print(documents)
Exemplo n.º 21
0
# DO NOT MODIFY
#
# Loads the pickled NER model from "best.pickle" and demonstrates the two
# ways it is used: parsing POS-tagged sentences and evaluating itself on
# chunked sentences.

import pickle
# NOTE(review): unpickling can execute arbitrary code -- "best.pickle" must
# come from a trusted source. The file handle is not closed explicitly.
ner = pickle.load(open("best.pickle", "rb"))

from nltk.corpus import conll2002 as conll

# Usage 1: parse a list of sentences (with POS tags)
# ("tagzinnen" is Dutch for "tagged sentences"; sentences 1000-1049 of the
# training file are used; `result` is not used further in this excerpt)
tagzinnen = conll.tagged_sents("ned.train")[1000:1050]
result = ner.parse_sents(tagzinnen)

# Usage 2: self-evaluate (on chunked sentences)
# ("chunkzinnen" is Dutch for "chunked sentences"; sentences 1000-1499 of
# the ned.testa file are used)
chunkzinnen = conll.chunked_sents("ned.testa")[1000:1500]
print(ner.evaluate(chunkzinnen))
Exemplo n.º 22
0
# Variant of the pattern with a negative lookahead for a following '...ing':
# IN = re.compile(r'.*\bin\b(?!\b.+ing)')

IN = re.compile(r'.*\bin\b')
# ORG/LOC relations in the IEER file NYT_19980315, each prefixed with the
# index of the document it was found in.
# print() calls replace the Python 2 print statements used originally.
# NOTE(review): newer NLTK releases appear to expose show_raw_rtuple as
# rtuple -- confirm against the installed version.
for i, doc in enumerate(nltk.corpus.ieer.parsed_docs('NYT_19980315')):
    for rel in nltk.sem.extract_rels('ORG', 'LOC', doc,
                                     corpus='ieer', pattern=IN):
        print(i, nltk.sem.relextract.show_raw_rtuple(rel))


from nltk.corpus import conll2002
vnv = """
(
is/V|           # 3rd sing present and
was/V|          # past forms of the verb zijn ('be')
werd/V|         # and also present
wordt/V         # past of worden ('become)
)
.*              # followed by anything
van/Prep        # followed by van ('of')
"""
VAN = re.compile(vnv, re.VERBOSE)
# PER/ORG relations in the Dutch training file, printed with lcon and rcon
# switched on
for doc in conll2002.chunked_sents('ned.train'):
    for r in nltk.sem.extract_rels('PER', 'ORG', doc,
                                   corpus='conll2002', pattern=VAN):
        print(nltk.sem.relextract.show_raw_rtuple(r, lcon=True, rcon=True))


def sent2labels(sent):
    """Return the `_label` of every non-tuple element of `sent`, in order.

    Elements that are tuples are skipped. This includes tuple subclasses
    such as namedtuples, which the former exact-type comparison treated as
    labelled nodes and then failed on. Every other element must have a
    `_label` attribute.
    """
    return [node._label for node in sent if not isinstance(node, tuple)]


def sent2tokens(sent):
    """Return the first field of every (token, postag, label) triple in `sent`.

    Each element of `sent` must unpack into exactly three values.
    """
    tokens = []
    for word, _postag, _label in sent:
        tokens.append(word)
    return tokens


# Spanish train / testa / testb files
etr, eta, etb = (
    conll2002.chunked_sents(fileid)
    for fileid in ('esp.train', 'esp.testa', 'esp.testb')
)

# Dutch train / testa / testb files
dtr, dta, dtb = (
    conll2002.chunked_sents(fileid)
    for fileid in ('ned.train', 'ned.testa', 'ned.testb')
)

# Dataset used below: Spanish training file and Spanish testb file
train_sents, test_sents = etr, etb

# Compute the features of every training sentence, then flatten the
# per-sentence results into one list.
X_train = [sent2features(s) for s in train_sents]
X_train = [item for sublist in X_train for item in sublist]
# normalizing the values of x:
for index in range(len(X_train[0])):
    mean = np.mean(np.array([row[index] for row in X_train]))