def parse_sections(sections):
    for section_name, lines in sections.iteritems():
        backup_tagger = nltk.data.load(
            'taggers/maxent_treebank_pos_tagger/english.pickle')
        medicines = MedicineTagger("drugs.txt", backup_tagger)
        tagger = RegexpTagger(tokens, medicines)

        sents = sent_tokenize(" ".join(lines))
        sents = [tagger.tag(nltk.word_tokenize(s)) for s in sents]
        #prev = []
        #w = word_tokenize(s)
        #for i in range(0,len(w)):
        #    prev.append(tagger.choose_tag(w, i, prev))
        #print ",".join([j or "" for j in prev])

        p = nltk.RegexpParser(parse_rules)
        for i in range(0, len(sents)):
            #print sents[i]
            if len(sents[i]) == 0:
                continue
            result = p.parse(sents[i])
            if (len(sys.argv) > 2 and sys.argv[2] == 'debug'):
                print result
            for child in result:
                # The only trees will be the ones we created.
                #if isinstance(child, nltk.tree.Tree):
                anything_useful(section_name, child)
Exemplo n.º 2
0
def parse_sections(sections):
    """Tag and chunk each section's text, passing every chunk of each
    parse tree to anything_useful().

    :param sections: dict mapping section name -> list of text lines
    """
    for section_name, lines in sections.iteritems():
        # NOTE(review): the pickled backup tagger is re-loaded from disk
        # on every iteration although it is loop-invariant; it could be
        # hoisted above the loop.
        backup_tagger = nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle')
        medicines = MedicineTagger("drugs.txt", backup_tagger)
        tagger = RegexpTagger(tokens,medicines)

        # Split the section into sentences, then POS-tag each one.
        sents = sent_tokenize(" ".join(lines))
        sents = [ tagger.tag(nltk.word_tokenize(s)) for s in sents ]
        #prev = []
        #w = word_tokenize(s)
        #for i in range(0,len(w)):
        #    prev.append(tagger.choose_tag(w, i, prev))
        #print ",".join([j or "" for j in prev])

        # Chunk each tagged sentence with the project's grammar rules.
        p = nltk.RegexpParser(parse_rules)
        for i in range(0,len(sents)):
            #print sents[i]
            # Empty sentences have nothing to parse.
            if len(sents[i]) == 0:
                continue
            result = p.parse(sents[i])
            # Passing 'debug' as the second CLI argument dumps the tree.
            if (len(sys.argv) > 2 and sys.argv[2] == 'debug'):
                print result
            for child in result:
                # The only trees will be the ones we created.
                #if isinstance(child, nltk.tree.Tree):
                anything_useful(section_name, child)
Exemplo n.º 3
0
    def __init__(self, regexps=None, backoff=None):
        """Setup for RegexpLemmatizer().

        :param regexps: List of tuples of form (PATTERN, REPLACEMENT)
        :param backoff: Next lemmatizer in backoff chain.
        """
        # Multiple inheritance: initialise both bases explicitly, each
        # receiving the same backoff object. Order is significant.
        SequentialBackoffLemmatizer.__init__(self, backoff)
        RegexpTagger.__init__(self, regexps, backoff)
        # Keep the raw (PATTERN, REPLACEMENT) pairs for lemmatization.
        self._regexs = regexps
Exemplo n.º 4
0
    def __init__(self, regexps=None, backoff=None):
        """Setup for RegexpLemmatizer().

        :param regexps: List of tuples of form (PATTERN, REPLACEMENT)
        :param backoff: Next lemmatizer in backoff chain.
        """
        # Explicit base-class initialisation (multiple inheritance);
        # the backoff chain is installed by the first call.
        SequentialBackoffLemmatizer.__init__(self, backoff)
        RegexpTagger.__init__(self, regexps, backoff)
        # Store the uncompiled substitution rules for later use.
        self._regexs = regexps
Exemplo n.º 5
0
    def __init__(self, regexps=None, backoff=None):
        """Setup for RegexpLemmatizer().

        :param regexps: List of tuples of form (PATTERN, REPLACEMENT)
        :param backoff: Next lemmatizer in backoff chain.
        """
        SequentialBackoffLemmatizer.__init__(self, backoff)
        RegexpTagger.__init__(self, regexps, backoff)
        # Single alternation of all patterns, used as a fast
        # "does anything match?" pre-check.
        # NOTE(review): iterating regexps below means the default
        # regexps=None raises TypeError — confirm callers always pass
        # a list.
        self._check = re.compile('|'.join('(?:%s)' % r[0] for r in regexps))
        # Pre-compiled (pattern, replacement) pairs for substitution.
        self._regexs = [(re.compile(regexp), pattern,) for regexp, pattern in regexps]
Exemplo n.º 6
0
 def __init__(self: object, regexps=None, source=None, backoff=None, verbose: bool = False):
     """Setup for RegexpLemmatizer()
     :type regexps: list
     :param regexps: List of tuples of form (PATTERN, REPLACEMENT)
     :param backoff: Next lemmatizer in backoff chain.
     :param source: label for this lemmatizer, stored on the instance.
     :param verbose: flag forwarded to SequentialBackoffLemmatizer.
     """
     # NOTE(review): backoff=None is hard-coded here, so the *backoff*
     # parameter is never forwarded to SequentialBackoffLemmatizer;
     # it is only passed to RegexpTagger below — confirm intentional.
     SequentialBackoffLemmatizer.__init__(self, backoff=None, verbose=verbose)
     RegexpTagger.__init__(self, regexps, backoff)
     # Raw (PATTERN, REPLACEMENT) pairs retained for lemmatization.
     self._regexs = regexps
     self.source = source
Exemplo n.º 7
0
    def __init__(self, regexps=None, backoff=None):
        """Setup for RegexpLemmatizer().

        :param regexps: List of tuples of form (PATTERN, REPLACEMENT)
        :param backoff: Next lemmatizer in backoff chain.
        """
        SequentialBackoffLemmatizer.__init__(self, backoff)
        RegexpTagger.__init__(self, regexps, backoff)
        # One combined alternation regex: cheap membership test before
        # trying the individual patterns.
        # NOTE(review): the default regexps=None would make the
        # iterations below raise TypeError — verify callers pass a list.
        self._check = re.compile('|'.join('(?:%s)' % r[0] for r in regexps))
        # (compiled pattern, replacement) pairs used when lemmatizing.
        self._regexs = [(re.compile(regexp), pattern,) for regexp, pattern in regexps]
Exemplo n.º 8
0
 def __init__(self: object, regexps=None, backoff=None, source: str = None, verbose: bool = False):
     """Setup for RegexpLemmatizer()
     :param regexps: List of tuples of form (PATTERN, REPLACEMENT)
     :param backoff: Next lemmatizer in backoff chain
     :param source: String for labelling lemmatizer in repr; used by verbose mode
     :param verbose: Flag to include which lemmatizer assigned in a given tag in the return tuple
     """
     # NOTE(review): backoff=None is hard-coded for the ensemble base
     # class, so the *backoff* parameter only reaches RegexpTagger —
     # confirm that this is intentional.
     SequentialEnsembleLemmatizer.__init__(self, backoff=None, verbose=verbose)
     RegexpTagger.__init__(self, regexps, backoff)
     # Raw (PATTERN, REPLACEMENT) pairs retained for lemmatization.
     self._regexs = regexps
     self.source = source
Exemplo n.º 9
0
    def tag(self, sent, tagregex=True, deftag='XX', verbose=False):
        """Tokenize *sent* and POS-tag it with a trigram -> bigram ->
        unigram (-> regexp) -> default backoff chain trained on
        self.reader_train.

        :param sent: sentence to tag (encoded to UTF-8 below)
        :param tagregex: if True, insert a regex tagger into the chain
        :param deftag: tag assigned by the final DefaultTagger fallback
        :param verbose: if True, also compute and print tagger accuracy
        :return: the tagged tokens produced by the trigram tagger
        """
        kalimat = sent.encode('utf-8')

        text = self.regexTokenizer(kalimat.lower().strip())

        ## :> --___<<IMPORTANT>>___--
        ##      For some purposes the default tag must be left as 'XX'
        ##      so that entities can be identified afterwards.
        backoff_tagger = DefaultTagger(deftag)

        if tagregex:
           regexp_tagger = RegexpTagger(patterns.regex_patterns,backoff=backoff_tagger)
           unigram_tagger = UnigramTagger(self.reader_train, backoff=regexp_tagger)
        else:
           unigram_tagger = UnigramTagger(self.reader_train, backoff=backoff_tagger)
           
        bigram_tagger = BigramTagger(self.reader_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(self.reader_train, backoff=bigram_tagger)
        
        # Translation of the note below: using the PAN Localization
        # Indonesian dataset "UI-1M-tagged.txt", the tagging pipeline
        # above reaches ~77% accuracy with the regex tagger and >90%
        # without it.
        """
        # Menggunakan dataset pan localization bahasa indonesia "UI-1M-tagged.txt"
        # kombinasi proses tagging diatas menghasilkan tingkat akurasi:
        #      dengan regextagger: 77%
        #      tanpa regextagger : > 90%
        """
        if verbose:
           # The larger the document, the longer the accuracy
           # computation takes; recommended for testing only.
           print ("Calculating Tagger Accuracy...")
           self.tagAccuracy = trigram_tagger.evaluate(self.test_sent)
           print ("Accuracy is: %s" % (self.tagAccuracy))
        
        return trigram_tagger.tag(text)
Exemplo n.º 10
0
# Hold out the last 10% of the Brown tagged sentences for evaluation.
train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)]
test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]

# Backoff chain: each n-gram tagger falls back to the previous one.
unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
print(unigram_tagger.evaluate(test_data))
# 0.835841722316356

bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
print(bigram_tagger.evaluate(test_data))
# 0.8454101465164956

trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
print(trigram_tagger.evaluate(test_data))
# 0.8427190272102063

# Standalone regex tagger: the first pattern that matches wins.
# The decimal point is escaped ('\.') — the original unescaped '.'
# matched ANY character, so e.g. '1x2' was wrongly tagged as CD.
regexp_tagger = RegexpTagger([
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'(The|the|A|a|An|an)$', 'AT'),   # articles
    (r'.*able$', 'JJ'),                # adjectives
    (r'.*ness$', 'NN'),                # nouns formed from adjectives
    (r'.*ly$', 'RB'),                  # adverbs
    (r'.*s$', 'NNS'),                  # plural nouns
    (r'.*ing$', 'VBG'),                # gerunds
    (r'.*ed$', 'VBD'),                 # past tense verbs
    (r'.*', 'NN'),                     # nouns (default)
])

print(regexp_tagger.evaluate(test_data))
# 0.31306687929831556
# Pasted REPL transcript ('>>>' prompts, Python 2 print syntax) — it
# repeats the n-gram backoff-chain construction interactively and is
# not runnable as-is.
>>>unigram_tagger = UnigramTagger(train_data,backoff=default_tagger)
>>>print unigram_tagger.evaluate(test_data)
>>>bigram_tagger= BigramTagger(train_data, backoff=unigram_tagger)
>>>print bigram_tagger.evaluate(test_data)
>>>trigram_tagger=TrigramTagger(train_data,backoff=bigram_tagger)
>>>print trigram_tagger.evaluate(test_data)

# Regex tagger 

# NOTE(review): the '.' in the cardinal-number pattern below is
# unescaped, so it matches any character; '\.' was presumably intended.
>>>from nltk.tag.sequential import RegexpTagger
>>>regexp_tagger = RegexpTagger(
         [( r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
          ( r'(The|the|A|a|An|an)$', 'AT'),   # articles
          ( r'.*able$', 'JJ'),                # adjectives
          ( r'.*ness$', 'NN'),         # nouns formed from adj
          ( r'.*ly$', 'RB'),           # adverbs
          ( r'.*s$', 'NNS'),           # plural nouns
          ( r'.*ing$', 'VBG'),         # gerunds
          (r'.*ed$', 'VBD'),           # past tense verbs
          (r'.*', 'NN')                # nouns (default)
          ])
>>>print regexp_tagger.evaluate(test_data)



# NER tagger 
# ne_chunk runs NLTK's named-entity chunker over POS-tagged tokens.
>>>import nltk
>>>from nltk import ne_chunk
>>>from nltk import word_tokenize
>>>sent = "Mark is studying at Stanford University in California"
>>>print(ne_chunk(nltk.pos_tag(word_tokenize(sent)), binary=False))
Exemplo n.º 12
0
    def train(self, templates=None, verbose=True):
        """Train a new Brill tagger."""
        if templates is None:
            templates = brill.nltkdemo18()

        random.seed(len(self.tagged_data_list))
        random.shuffle(self.tagged_data_list)
        cutoff = int(self.dev_size * self.train_size)

        training_data = self.tagged_data_list[:cutoff]
        test_data = self.tagged_data_list[cutoff:self.dev_size]

        # very simple regular expression tagger
        regex_tagger = RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'PUNCT'),
                                     (r'.*', 'N')])
        if verbose == True:
            print "Regular expression tagger accuracy:\n{}\n".format(
                regex_tagger.evaluate(test_data))

        # unigram tagger
        unigram_tagger = UnigramTagger(train=training_data,
                                       backoff=regex_tagger)
        if verbose == True:
            print "Unigram tagger accuracy:\n{}\n".format(
                unigram_tagger.evaluate(test_data))

        # bigram tagger
        bigram_tagger = BigramTagger(train=training_data,
                                     backoff=unigram_tagger)
        if verbose == True:
            print "Bigram tagger accuracy:\n{}\n".format(
                bigram_tagger.evaluate(test_data))

        # trigram tagger
        trigram_tagger = TrigramTagger(train=training_data,
                                       backoff=bigram_tagger)
        if verbose == True:
            print "Trigram tagger accuracy:\n{}\n".format(
                trigram_tagger.evaluate(test_data))

        # first iteration
        trainer = BrillTaggerTrainer(initial_tagger=trigram_tagger,
                                     templates=templates)
        brill_tagger = trainer.train(train_sents=training_data,
                                     max_rules=self.max_rules,
                                     min_score=self.min_score)
        if verbose == True:
            print "Initial Brill tagger accuracy:\n{}\n".format(
                brill_tagger.evaluate(test_data))

        # folding
        for i in range(0, self.num_groups):
            # random splitting
            random.seed(len(self.tagged_data_list))
            random.shuffle(self.tagged_data_list)
            cutoff = int(self.dev_size * self.train_size)

            training_data = self.tagged_data_list[:cutoff]
            test_data = self.tagged_data_list[cutoff:self.dev_size]

            # note that .train method returns a BrillTagger() object
            brill_tagger = trainer.train(train_sents=training_data,
                                         max_rules=self.max_rules,
                                         min_score=self.min_score)

            if verbose == True:
                print "Brill tagger accuracy, fold {}:\n{}\n".format(
                    i + 1, brill_tagger.evaluate(test_data))
Exemplo n.º 13
0
    def train(self, templates=None, verbose=True):
        """Train a new Brill tagger.

        Builds a regexp -> unigram -> bigram -> trigram backoff chain,
        trains a Brill tagger on top of it, then re-trains over
        self.num_groups folds, printing accuracies when *verbose*.

        :param templates: Brill rule templates; defaults to
            brill.nltkdemo18().
        :param verbose: print each tagger's accuracy when True.
        """
        if templates is None:
            templates = brill.nltkdemo18()

        # Deterministic shuffle keyed on the dataset size.
        random.seed(len(self.tagged_data_list))
        random.shuffle(self.tagged_data_list)
        cutoff = int(self.dev_size * self.train_size)

        training_data = self.tagged_data_list[:cutoff]
        test_data = self.tagged_data_list[cutoff:self.dev_size]

        # very simple regular expression tagger
        # NOTE(review): the '.' in the number pattern is unescaped and
        # matches any character (e.g. '1x2' tags as PUNCT); '\.' was
        # presumably intended.
        regex_tagger = RegexpTagger([
            (r'^-?[0-9]+(.[0-9]+)?$', 'PUNCT'),
            (r'.*', 'N')
            ])
        if verbose == True:
            print "Regular expression tagger accuracy:\n{}\n".format(
                    regex_tagger.evaluate(test_data))

        # unigram tagger (falls back to the regex tagger)
        unigram_tagger = UnigramTagger(train=training_data,
                                       backoff=regex_tagger)
        if verbose == True:
            print "Unigram tagger accuracy:\n{}\n".format(
                    unigram_tagger.evaluate(test_data))

        # bigram tagger (falls back to the unigram tagger)
        bigram_tagger = BigramTagger(train=training_data,
                                     backoff=unigram_tagger)
        if verbose == True:
            print "Bigram tagger accuracy:\n{}\n".format(
                    bigram_tagger.evaluate(test_data))

        # trigram tagger (falls back to the bigram tagger)
        trigram_tagger = TrigramTagger(train=training_data,
                                       backoff=bigram_tagger)
        if verbose == True:
            print "Trigram tagger accuracy:\n{}\n".format(
                    trigram_tagger.evaluate(test_data))

        # first iteration
        trainer = BrillTaggerTrainer(initial_tagger=trigram_tagger,
                                     templates=templates)
        brill_tagger = trainer.train(train_sents=training_data,
                                     max_rules=self.max_rules,
                                     min_score=self.min_score)
        if verbose == True:
            print "Initial Brill tagger accuracy:\n{}\n".format(
                    brill_tagger.evaluate(test_data))

        # folding
        for i in range(0, self.num_groups):
            # random splitting
            # NOTE(review): re-seeding with the same constant repeats
            # the identical shuffle, so each fold sees the same split —
            # confirm whether the seed should vary per fold.
            random.seed(len(self.tagged_data_list))
            random.shuffle(self.tagged_data_list)
            cutoff = int(self.dev_size * self.train_size)

            training_data = self.tagged_data_list[:cutoff]
            test_data = self.tagged_data_list[cutoff:self.dev_size]

            # note that .train method returns a BrillTagger() object
            brill_tagger = trainer.train(train_sents=training_data,
                                         max_rules=self.max_rules,
                                         min_score=self.min_score)

            if verbose == True:
                print "Brill tagger accuracy, fold {}:\n{}\n".format(
                        i+1, brill_tagger.evaluate(test_data))