def __init__(self, mwe=True):
    self.mwe = mwe
    # Train the tagger if it is used for the first time.
    try:
        loadtagger('conll_unigram.tagger').tag(['estoy'])
        loadtagger('conll_bigram.tagger').tag(['estoy'])
    except IOError:
        print "*** First-time use of conll tagger ***"
        print "Training tagger ..."
        from nltk.corpus import conll2002 as conll
        conll_sents = conll.tagged_sents()
        traintag('conll', conll_sents)
        # Train the tagger with no MWE (multi-word expressions).
        conll_nomwe = unchunk(conll.tagged_sents())
        tagged_conll_nomwe = batch_pos_tag(conll_nomwe)
        traintag('conll_nomwe', tagged_conll_nomwe)
        print
    # Load the tagger.
    if self.mwe:
        self.uni = loadtagger('conll_unigram.tagger')
        self.bi = loadtagger('conll_bigram.tagger')
    else:
        self.uni = loadtagger('conll_nomwe_unigram.tagger')
        self.bi = loadtagger('conll_nomwe_bigram.tagger')
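# `loadtagger`, `traintag`, `unchunk`, and `batch_pos_tag` are defined
# elsewhere in the original module. A minimal sketch of the first two,
# assuming the taggers are pickled NLTK unigram/bigram taggers (so a missing
# file raises the IOError caught above):
import pickle

def loadtagger(taggerfilename):
    with open(taggerfilename, 'rb') as infile:
        return pickle.load(infile)

def traintag(corpusname, corpus):
    from nltk.tag import UnigramTagger, BigramTagger
    uni = UnigramTagger(corpus)
    with open(corpusname + '_unigram.tagger', 'wb') as outfile:
        pickle.dump(uni, outfile)
    bi = BigramTagger(corpus, backoff=uni)
    with open(corpusname + '_bigram.tagger', 'wb') as outfile:
        pickle.dump(bi, outfile)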
def train_tagger():
    print "--- Train Tagger ---"
    # 1. Prepare the data.
    train_sentences = conll2002.tagged_sents('esp.train')
    test_sentences = conll2002.tagged_sents('esp.testa')
    # 2. Train the Brill tagger.
    tagger = train(train_sentences)
    # 3. Test the Brill tagger.
    print "- test score: %0.4f" % tagger.evaluate(test_sentences)
    # 4. Save the tagger to a file.
    print "- saving tagger"
    pickle.dump(tagger, open("./files/pos_tagger.p", "wb"))
    print "-- DONE."
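# `train` is not defined in this snippet. A minimal sketch, assuming a plain
# unigram/bigram backoff chain stands in for the Brill training step:
def train(train_sentences):
    from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger
    t0 = DefaultTagger('NC')  # fallback tag; 'NC' (common noun) is an assumption
    t1 = UnigramTagger(train_sentences, backoff=t0)
    return BigramTagger(train_sentences, backoff=t1)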
def __init__(self):
    if not os.path.isfile('tagger.pickle'):
        print "Training tagger..."
        # train_sents = alpino.tagged_sents()
        train_sents = conll2002.tagged_sents('ned.train')
        # train_sents = conll2002.chunked_sents('ned.train')
        word_patterns = [
            (r'\d+\.\d+\w?', 'Ref'),
            (r'\d+\:\d+\w?', 'Ref'),
            (r'\d+\w', 'Ref'),
            (r'\d+/\d+/eg', 'Ref'),
            (r'^(18|19|20)\d\d$', 'Year'),
            (r'(de|het|een)', 'Art'),
            (r'(en|of)', 'EnOf'),
            (r'^\d+', 'Index'),
            (r'^\w+\.$', 'Index'),
            (r'^\d+(\D|\S|\W)(\.)?$', 'DegIndex'),
            (r'^\w$', 'Ref'),
            (r'^;$', 'Punc'),
            (r'^[a-zA-Z]+\d+$', 'Ref'),
            (r'^\w\w+$', 'N')
        ]
        raubt_tagger = self.backoff_tagger(
            train_sents,
            [nltk.tag.AffixTagger, nltk.tag.UnigramTagger,
             nltk.tag.BigramTagger, nltk.tag.TrigramTagger],
            backoff=RegexpTagger(word_patterns))
        templates = [
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 1)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1, 3)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 1)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 2)),
            brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1, 3)),
            brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1, 1)),
            brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1, 1))
        ]
        trainer = brill.FastBrillTaggerTrainer(raubt_tagger, templates)
        braubt_tagger = trainer.train(train_sents, max_rules=100, min_score=3)
        self.trained_tagger = braubt_tagger
        pickle.dump(self.trained_tagger, open('tagger.pickle', 'wb'))
        print "Dumped tagger to file"
    else:
        self.trained_tagger = pickle.load(open('tagger.pickle', 'rb'))
        print "Loaded tagger from file"
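# `backoff_tagger` is not shown above. A minimal sketch, assuming it simply
# chains the given tagger classes so each new tagger backs off to the
# previous one:
def backoff_tagger(self, train_sents, tagger_classes, backoff=None):
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff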
def select_sents(x):
    return {
        'brown_universal': brown.tagged_sents(tagset='universal'),          # Accuracy: 95.12%
        'brown': brown.tagged_sents(),                                      # Accuracy: 93.66%
        'conll2000_universal': conll2000.tagged_sents(tagset='universal'),  # Accuracy: 95.63%
        'conll2000': conll2000.tagged_sents(),                              # Accuracy: 94.94%
        'conll2002': conll2002.tagged_sents(),                              # Accuracy: 91.53%
        'alpino': alpino.tagged_sents(),                                    # Accuracy: 88.79%
        'dependency_treebank': dependency_treebank.tagged_sents(),          # Accuracy: 90.79%
        'treebank': treebank.tagged_sents(),                                # Accuracy: 91.44%
        'indian': indian.tagged_sents(),                                    # Accuracy: 64.41%
    }.get(x, [])  # empty list in case of an unavailable corpus
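# Usage sketch (not from the original): pick a training corpus by name;
# unknown names fall back to the empty list.
train_sents = select_sents('conll2002')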
def get_tagged_sentences(self):
    return conll2002.tagged_sents(fileids=['esp.testa', 'esp.testb'])
def tagged_sents(self):
    # Lazily load the corpus once and cache it on the instance.
    if not hasattr(self, '_tagged_sents'):
        setattr(self, '_tagged_sents', conll2002.tagged_sents())
    return getattr(self, '_tagged_sents')
import nltk

try:
    nltk.data.find('corpora/conll2002')
except LookupError:
    nltk.download('conll2002')
try:
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    nltk.download('averaged_perceptron_tagger')

from nltk.corpus import wordnet as wn
from nltk.corpus import treebank, conll2000, brown, conll2002
from nltk import DefaultTagger, UnigramTagger, BigramTagger

wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

# The code below trains a bigram part-of-speech tagger on several corpora.
train_sents = (treebank.tagged_sents() + brown.tagged_sents()
               + conll2000.tagged_sents() + conll2002.tagged_sents())
edited_train = []
for sent in train_sents:
    edited_train.append([(word.lower(), tag) for (word, tag) in sent])
t0 = DefaultTagger(None)
et1 = UnigramTagger(edited_train, backoff=t0)
et2 = BigramTagger(edited_train, backoff=et1)

# The function below converts a Penn Treebank POS tag to a WordNet POS tag
# for lemmatization.
def penn_to_wn(tag):
    nltk_wn_pos = {'J': wn.ADJ, 'V': wn.VERB, 'N': wn.NOUN, 'R': wn.ADV}
    try:
        return nltk_wn_pos[tag[0]]
    except (KeyError, TypeError):  # unknown tag, or tag is None
        return None
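# Usage sketch (not from the original): tag a lowercased sentence, then
# lemmatize each word with the mapped WordNet POS.
tokens = [w.lower() for w in "the dogs were running".split()]
for word, tag in et2.tag(tokens):
    wn_pos = penn_to_wn(tag)
    lemma = wordnet_lemmatizer.lemmatize(word, wn_pos) if wn_pos else word
    print word, tag, lemma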
# DO NOT MODIFY
import pickle
ner = pickle.load(open("best.pickle", "rb"))

from nltk.corpus import conll2002 as conll

# Usage 1: parse a list of sentences (with POS tags).
tagzinnen = conll.tagged_sents("ned.train")[1000:1050]
result = ner.parse_sents(tagzinnen)

# Usage 2: self-evaluate (on chunked sentences).
chunkzinnen = conll.chunked_sents("ned.testa")[1000:1500]
print(ner.evaluate(chunkzinnen))
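# Follow-up sketch (assumed, not part of the fixed block above): if
# parse_sents yields one chunk tree per sentence, the recognized entities
# can be inspected like this.
for tree in result:
    print(tree)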
def get_tagged_sentences(self):
    return conll2002.tagged_sents(fileids=FILEIDS)
# Fragment: the tail of an analyze()-style matcher; the function head is not
# shown in this snippet.
                print text
                print tags
                print tag
                return True
        i += 1
    return False

DIFF_THRESHOLD = 0.75

conn = psycopg2.connect("dbname=%s user=%s password=%s"
                        % (postgres_db, postgres_user, postgres_pass))
cur = conn.cursor()

sents = conll2002.tagged_sents()
hmm_tagger = HiddenMarkovModelTagger.train(sents)

query_pool = []
sparql_query = '''select ?label (?menPopulation + ?womenPopulation) as ?sum
where {
  ?s a <http://dbpedia.org/ontology/Municipality> .
  ?s rdfs:label ?label .
  ?s <http://opendata.aragon.es/def/Aragopedia#menPopulation> ?menPopulation .
  ?s <http://opendata.aragon.es/def/Aragopedia#womenPopulation> ?womenPopulation .
}
ORDER BY DESC(?sum)
LIMIT 400
'''
print sparql_query
payload = {'query': sparql_query, 'format': 'json'}
r = requests.get('http://opendata.aragon.es/sparql', params=payload)
query = ''
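# Follow-up sketch (assumed): read the municipality labels out of the
# standard SPARQL JSON results returned by the endpoint.
for binding in r.json()['results']['bindings']:
    print binding['label']['value'], binding['sum']['value']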
from time import sleep
from celery import Celery
from celery.signals import celeryd_init
from multiprocessing import Pool

import psycopg2
from nltk import word_tokenize
from nltk.corpus import conll2002
from nltk.tag import HiddenMarkovModelTagger

DIFF_THRESHOLD = 0.75

# twitter_stream = None
app = Celery('tasks', broker='redis://localhost:6379/0')
conn = psycopg2.connect("dbname=%s user=%s password=%s"
                        % (postgres_db, postgres_user, postgres_pass))
cur = conn.cursor()
# redis = redis.StrictRedis(host='localhost', port=6379, db=0)

sents = conll2002.tagged_sents()
hmm_tagger = HiddenMarkovModelTagger.train(sents)
print 'Tagger ready'

def analyze(text, track_list):
    tokens = word_tokenize(text)
    tags = hmm_tagger.tag(tokens)
    for tag in tags:
        if tag[0] in track_list:
            if tag[1].startswith('N') and len(tag[1]) <= 2:
                print text
                print tag
                return True
            break  # stop after the first tracked token
    return False
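# Usage sketch (not from the original): returns True when a tracked term is
# tagged as a noun by the CoNLL-2002-trained HMM tagger.
analyze('Zaragoza es una ciudad', ['Zaragoza', 'Huesca'])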
def corpus_reader(corpus_path, tag='bio'):
    """Corpus relative path (str) -> list of IOB sents."""
    if tag == 'bio':
        # Training dataset.
        return reader.iob_sents(os.path.abspath(corpus_path))
    if tag == 'pos':
        # Test dataset.
        return reader.tagged_sents(os.path.abspath(corpus_path))
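# Usage sketch (assumed): `reader` is expected to be a CoNLL-style corpus
# reader (e.g. nltk.corpus.reader.ConllChunkCorpusReader) exposing
# iob_sents() and tagged_sents().
train_data = corpus_reader('data/esp.train', tag='bio')
test_data = corpus_reader('data/esp.testa', tag='pos')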