Example #1
File: tagger.py Project: dasolma/pyNLU
	def __init__(self,mwe=True):
		self.mwe = mwe
		# Train tagger if it's used for the first time.
		try:
			loadtagger('conll_unigram.tagger').tag(['estoy'])
			loadtagger('conll_bigram.tagger').tag(['estoy'])
		except IOError:
			print "*** First-time use of conll tagger ***"
			print "Training tagger ..."
			from nltk.corpus import conll2002 as conll

			conll_sents = conll.tagged_sents()
			traintag('conll',conll_sents)
			# Trains the tagger with no MWE.
			conll_nomwe = unchunk(conll.tagged_sents())
			tagged_conll_nomwe = batch_pos_tag(conll_nomwe)
			traintag('conll_nomwe',tagged_conll_nomwe)
			print
		# Load tagger.
		if self.mwe == True:
			self.uni = loadtagger('conll_unigram.tagger')
			self.bi = loadtagger('conll_bigram.tagger')
		elif self.mwe == False:
			self.uni = loadtagger('conll_nomwe_unigram.tagger')
			self.bi = loadtagger('conll_nomwe_bigram.tagger')
Example #2
def train_tagger():
    print "--- Train Tagger ---"
    #1. prepare data
    train_sentences = conll2002.tagged_sents('esp.train')
    test_sentences = conll2002.tagged_sents('esp.testa')
    #2. train brill tagger
    tagger = train(train_sentences)
    #3. test brill tagger
    print "- test score: %0.4f" % tagger.evaluate(test_sentences)
    #4. save tagger into a file
    print "- saving tagger"
    pickle.dump(tagger, open("./files/pos_tagger.p", "wb"))

    print "-- DONE."
Example #3
    def __init__(self):
        if not os.path.isfile('tagger.pickle') :
            print "Training tagger..."
#            train_sents = alpino.tagged_sents()
            train_sents = conll2002.tagged_sents('ned.train')
#            train_sents = conll2002.chunked_sents('ned.train')
            
            word_patterns = [ (r'\d+\.\d+\w?', 'Ref'),
                             (r'\d+\:\d+\w?', 'Ref'),
                             (r'\d+\w', 'Ref'),  
                             (r'\d+/\d+/eg', 'Ref'),
                             (r'^(18|19|20)\d\d$', 'Year'),
                             (r'(de|het|een)', 'Art'),
                             (r'(en|of)', 'EnOf'),
                             (r'^\d+', 'Index'),
                             (r'^\w+\.$', 'Index'),
                             (r'^\d+(\D|\S|\W)(\.)?$', 'DegIndex'),
                             (r'^\w$', 'Ref'),
                             (r'^;$', 'Punc'),
                             (r'^[a-zA-Z]+\d+$','Ref'),
                             (r'^\w\w+$', 'N') ]
            
            raubt_tagger = self.backoff_tagger(train_sents, [nltk.tag.AffixTagger,
            nltk.tag.UnigramTagger, nltk.tag.BigramTagger, nltk.tag.TrigramTagger],
            backoff=RegexpTagger(word_patterns))
         
            templates = [
                brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,1)),
                brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2,2)),
                brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,2)),
                brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,3)),
                brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,1)),
                brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2,2)),
                brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,2)),
                brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,3)),
                brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1,1)),
                brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1,1))
            ]
             
            trainer = brill.FastBrillTaggerTrainer(raubt_tagger, templates)
            braubt_tagger = trainer.train(train_sents, max_rules=100, min_score=3)
            
            self.trained_tagger = braubt_tagger
            
            pickle.dump(self.trained_tagger, open('tagger.pickle', 'wb'))
            print "Dumped tagger to file"
        else :
            self.trained_tagger = pickle.load(open('tagger.pickle', 'rb'))
            print "Loaded tagger from file"
Example #4
def select_sents(x):
    return {
        'brown_universal':
        brown.tagged_sents(tagset='universal'),  # Accuracy: 95.12%
        'brown': brown.tagged_sents(),  # Accuracy: 93.66%
        'conll2000_universal':
        conll2000.tagged_sents(tagset='universal'),  # Accuracy: 95.63%
        'conll2000': conll2000.tagged_sents(),  # Accuracy: 94.94%
        'conll2002': conll2002.tagged_sents(),  # Accuracy: 91.53%
        'alpino': alpino.tagged_sents(),  # Accuracy: 88.79%
        'dependency_treebank':
        dependency_treebank.tagged_sents(),  # Accuracy: 90.79%
        'treebank': treebank.tagged_sents(),  # Accuracy: 91.44%
        'indian': indian.tagged_sents(),  # Accuracy: 64.41%
        'else': []  # in case of an unavailable corpus
    }.get(x, 'else')
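
Since every entry above is a list of tagged sentences, a hedged usage sketch might look like the following (the corpus key, the 90/10 split and the UnigramTagger choice are assumptions for illustration):

from nltk import UnigramTagger

# Pick one corpus, train a simple tagger on 90% of it and score it on the rest.
sents = select_sents('conll2000_universal')
cutoff = int(len(sents) * 0.9)
tagger = UnigramTagger(sents[:cutoff])
print(tagger.evaluate(sents[cutoff:]))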
Example #5
 def get_tagged_sentences(self):
     return conll2002.tagged_sents(fileids=['esp.testa', 'esp.testb'])
Example #6
 def tagged_sents(self):
     if not hasattr(self, '_tagged_sents'):
         setattr(self, '_tagged_sents', conll2002.tagged_sents())
     return getattr(self, '_tagged_sents')
Example #7
import nltk

# Download the required corpora/models on first use; nltk.data.find raises
# LookupError when a resource is missing.
try:
    nltk.data.find('corpora/conll2002')
except LookupError:
    nltk.download('conll2002')
try:
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    nltk.download('averaged_perceptron_tagger')

from nltk.corpus import wordnet as wn
from nltk.corpus import treebank, conll2000, brown, conll2002
from nltk import DefaultTagger, UnigramTagger, BigramTagger

wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

# The code below trains bigram part of speech tagger from various datasets.
train_sents = treebank.tagged_sents() + brown.tagged_sents() + conll2000.tagged_sents() + conll2002.tagged_sents()
edited_train = []
for sent in train_sents:
    edited_train.append([(word.lower(),tag) for (word,tag) in sent])
t0 = DefaultTagger(None)
et1 = UnigramTagger(edited_train, backoff = t0)
et2 = BigramTagger(edited_train, backoff = et1)

# The function below converts bigram pos to wordnet pos for lemmatization
def penn_to_wn(tag):
    nltk_wn_pos = {'J':wn.ADJ,'V':wn.VERB,'N':wn.NOUN,'R':wn.ADV}
    try:
        return nltk_wn_pos[tag[0]]
    except:
        return None
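
The tagger and the penn_to_wn helper above can be chained for lemmatization; a minimal sketch (the sample sentence is made up):

# Tag a lowercased sentence, map each tag to a WordNet POS where possible,
# and lemmatize; words without a usable POS are kept as-is.
sentence = ["the", "children", "were", "running", "home"]
for word, tag in et2.tag(sentence):
    wn_pos = penn_to_wn(tag) if tag else None
    print(wordnet_lemmatizer.lemmatize(word, wn_pos) if wn_pos else word)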
Example #8
# DO NOT MODIFY

import pickle
ner = pickle.load(open("best.pickle", "rb"))

from nltk.corpus import conll2002 as conll

# Usage 1: parse a list of sentences (with POS tags)
tagzinnen = conll.tagged_sents("ned.train")[1000:1050]
result = ner.parse_sents(tagzinnen)

# Usage 2: self-evaluate (on chunked sentences)
chunkzinnen = conll.chunked_sents("ned.testa")[1000:1500]
print(ner.evaluate(chunkzinnen))
Example #9
 def get_tagged_sentences(self):
     return conll2002.tagged_sents(fileids=FILEIDS)
Example #10
                    print text
                    print tags
                    print tag
                    return True

        i += 1
    return False


DIFF_THRESHOLD = 0.75

conn = psycopg2.connect("dbname=%s user=%s password=%s" %
                        (postgres_db, postgres_user, postgres_pass))
cur = conn.cursor()

sents = conll2002.tagged_sents()
hmm_tagger = HiddenMarkovModelTagger.train(sents)

query_pool = []

sparql_query = '''select ?label (?menPopulation + ?womenPopulation) as ?sum where {?s a <http://dbpedia.org/ontology/Municipality> .
?s rdfs:label ?label .
?s <http://opendata.aragon.es/def/Aragopedia#menPopulation> ?menPopulation .
?s <http://opendata.aragon.es/def/Aragopedia#menPopulation> ?womenPopulation .
} ORDER BY DESC(?sum) LIMIT 400
'''
print sparql_query
payload = {'query': sparql_query, 'format': 'json'}
r = requests.get('http://opendata.aragon.es/sparql', params=payload)

query = ''
Example #11
File: tasks.py Project: alabarga/MORElab
from time import sleep
from celery.signals import celeryd_init
from multiprocessing import Pool

DIFF_THRESHOLD = 0.75
#twitter_stream = None
app = Celery('tasks', broker='redis://localhost:6379/0')



conn = psycopg2.connect("dbname=%s user=%s password=%s" % (postgres_db, postgres_user, postgres_pass))
cur = conn.cursor()

# redis = redis.StrictRedis(host='localhost', port=6379, db=0)

sents = conll2002.tagged_sents()
hmm_tagger = HiddenMarkovModelTagger.train(sents)

print 'Tagger ready'

def analyze(text, track_list):
    tokens = word_tokenize(text)
    tags = hmm_tagger.tag(tokens)
    for tag in tags:
        if tag[0] in track_list:
            if tag[1].startswith('N') and len(tag[1]) <= 2:
                print text
                print tag
                return True
    return False
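
A hedged call sketch for analyze (both arguments are illustrative):

# True when a tracked token is tagged with a short noun-like tag by the HMM tagger.
print(analyze("Hoy visitamos Zaragoza con amigos", ["Zaragoza", "Huesca"]))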
Example #12
def corpus_reader(corpus_path,tag='bio'):
    """ corpus relative path (str) -> list of iob sents """
    if tag=='bio':  # training dataset
        return reader.iob_sents(os.path.abspath(corpus_path))
    if tag=='pos':  # test dataset
        return reader.tagged_sents(os.path.abspath(corpus_path))
Example #13
def corpus_reader(corpus_path, tag='bio'):
    """ corpus relative path (str) -> list of iob sents """
    if tag == 'bio':  # training dataset
        return reader.iob_sents(os.path.abspath(corpus_path))
    if tag == 'pos':  # test dataset
        return reader.tagged_sents(os.path.abspath(corpus_path))