Example #1
import pickle

from nltk.corpus import treebank
from nltk.tag import DefaultTagger
from nltk.tag.sequential import ClassifierBasedPOSTagger


def train_tagger():
    """Train a classifier-based POS tagger and pickle it to disk."""
    print("Training POS tagger...")
    # https://github.com/japerk/nltk3-cookbook/blob/master/chapter4.py

    tagged_sentences = treebank.tagged_sents()
    size = int(len(tagged_sentences) * 0.9)
    train_sents = tagged_sentences[:size]
    test_sents = tagged_sentences[size:]  # originally [3000:], which overlapped the training slice

    default = DefaultTagger("NN")
    tagger = ClassifierBasedPOSTagger(
        train=train_sents, backoff=default, cutoff_prob=0.3
    )
    print(tagger.evaluate(test_sents))  # the original snippet reported 0.9613641269156055

    # save the model to a pickle file as binary; MODEL_PATH is defined elsewhere in the project
    file_name = MODEL_PATH + "tag_model.pkl"
    with open(file_name, "wb") as fout:
        pickle.dump(tagger, fout)

    print("model written to: " + file_name)
    print("")

    return tagger
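Since train_tagger() both returns the tagger and pickles it, a matching load step might look like this minimal sketch (it reuses the MODEL_PATH constant assumed above):

import pickle

# load the previously pickled tagger and try it on a sample sentence
with open(MODEL_PATH + "tag_model.pkl", "rb") as fin:
    tagger = pickle.load(fin)

print(tagger.tag("The quick brown fox jumps over the lazy dog".split()))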
Example #2
import itertools

from nltk.corpus.reader import TaggedCorpusReader
from nltk.tag.sequential import ClassifierBasedPOSTagger


def NER_HINDINBC():
    reader = TaggedCorpusReader('/python27/POS_9/', r'.*\.pos')
    f1 = reader.fileids()
    print("The Files of Corpus are:", f1)
    sents = reader.tagged_sents()
    sentn = reader.sents()
    ls = len(sents)
    print("Length of Corpus Is:", ls)
    size1 = int(ls * 0.3)
    test_sents = sents[:size1]
    train_sents = sents[size1:]
    nbc_tagger = ClassifierBasedPOSTagger(train=train_sents)
    test = nbc_tagger.evaluate(test_sents)
    print("The Test Result is:", test)
    # the given input (a Python 3 str is already unicode, so no .decode('utf-8') is needed)
    given_sent = ("नीतीश कुमार द्वारा भाजपा के साथ हाथ मिलाने से वहां का पूरा राजनीतिक "
                  "परिदृश्‍य ही बदल गया है मगर शरद यादव इससे खुश नहीं हैं")
    gsw = given_sent.split()
    tag_gs = nbc_tagger.tag(gsw)
    print("GIVEN SENT TAG:", tag_gs)
    ftag_gs = " ".join(itertools.chain(*tag_gs))
    print("And its flattened Version is:", ftag_gs)
Example #3
from nltk.corpus import brown
from nltk.tag.sequential import ClassifierBasedPOSTagger


def nbc_tagger():
    news_text = brown.tagged_sents(categories='news')
    train_sents = news_text[:3230]
    test_sents = news_text[3230:4600]
    nbc_tagger = ClassifierBasedPOSTagger(train=train_sents)
    test = nbc_tagger.evaluate(test_sents)
    print("The Test Result Is:", test)
    sent3 = "Narendra Modi won Lok Sabha election with massive majority after long years"
    sent_w = sent3.lower().split()
    print(sent_w)
    tag = nbc_tagger.tag(sent_w)
    print("The Tag Is:", tag)
Example #4
    def load_tagger(self, name='backup/tagger.pickle'):
        try:
            with open(name, "rb") as f:
                return pickle.load(f)
        except IOError as e:
            print("I/O error: {0}".format(e))
        tagger = ClassifierBasedPOSTagger(train=self.__train_sents,
                                          backoff=self.__default,
                                          cutoff_prob=0.3)
        print('Tagger accuracy : {}'.format(tagger.evaluate(self.__test_sents)))
        with open(name, 'wb') as f:
            pickle.dump(tagger, f)
        return tagger
Example #5
    def __init__(self, limit=300, debug=True):
        '''Instantiate the TrainingSetAnalyzer

        Keyword arguments:
        @param limit: number of tweets to analyze (default 300)
        @param debug: flag for the development process
        '''
        self.__debug = debug
        self.__limit = limit
        self.__speller = SpellChecker()
        self.__splitter = Splitter("rtw")
        self.__replacer = RegexpReplacer()
        self.__ngramHandler = NgramHandler()

        train_sents = treebank.tagged_sents()[:3000]
        self.__tagger = ClassifierBasedPOSTagger(train=train_sents)
Example #6
File: chunker.py Project: pratheeksh/NLP
def parse():
    tagger_classes = ([nltk.UnigramTagger, nltk.BigramTagger])
    trained_sents, tagged_sents = trainer("WSJ_02-21.pos-chunk", "WSJ_23.pos")
    #tagger = nltk.UnigramTagger(trained_sents)
    print(len(trained_sents))
    tagger = ClassifierBasedPOSTagger(
        train=trained_sents[:10000],
        classifier_builder=lambda train_feats: MaxentClassifier.train(
            train_feats, trace=0, max_iter=10))
    f = open("WSJ_23.chunk", 'w')
    #print sents
    for sents in tagged_sents:
        (words, tags) = sents[0], sents[1]
        chunks = tagger.tag(tags)
        #print words, chunks
        wtc = zip(words, chunks)

        for tup in wtc:
            f.write("%s\t%s\n" % (tup[0], tup[1][1]))

        f.write("\n")
Example #7
def nltk_classifier_based_pos_tagger(input_dict):
    """
    A sequential tagger that uses a classifier to choose the tag for
    each token in a sentence.  The featureset input for the classifier
    is generated by a feature detector function::

        feature_detector(tokens, index, history) -> featureset

    Where tokens is the list of unlabeled tokens in the sentence;
    index is the index of the token for which feature detection
    should be performed; and history is list of the tags for all
    tokens before index.

    Construct a new classifier-based sequential tagger.

    :param training_corpus: A tagged corpus consisting of a list of tagged
        sentences, where each sentence is a list of (word, tag) tuples.
    :param backoff_tagger: A backoff tagger, to be used by the new tagger
        if it encounters an unknown context.

    TODO: decide which of the following two descriptions to keep:

    :param classifier_builder: A function used to train a new
        classifier based on the data in *train*.  It should take
        one argument, a list of labeled featuresets (i.e.,
        (featureset, label) tuples).
    :param classifier: The classifier that should be used by the
        tagger.  This is only useful if you want to manually
        construct the classifier; normally, you would use *train*
        instead.
    :param backoff_tagger: A backoff tagger, used if this tagger is
        unable to determine a tag for a given token.
    :param cutoff_prob: If specified, then this tagger will fall
        back on its backoff tagger if the probability of the most
        likely tag is less than *cutoff_prob*.

    :returns pos_tagger: A python dictionary containing the POS tagger
        object and its arguments.
    """
    chunk = input_dict['training_corpus']['chunk']
    corpus = input_dict['training_corpus']['corpus']
    training_corpus = corpus_reader(corpus, chunk)
    backoff_tagger = input_dict['backoff_tagger']['object'] \
        if input_dict['backoff_tagger'] else DefaultTagger('-None-')
    classifier = None  # (input_dict['classifier'])
    # cutoff_prob is a probability threshold, so parse it as a float rather than an int
    cutoff_prob = float(input_dict['cutoff_prob']) if input_dict['cutoff_prob'] else None

    tagger_object = ClassifierBasedPOSTagger(train=training_corpus,
                                             classifier=classifier,
                                             backoff=backoff_tagger,
                                             cutoff_prob=cutoff_prob)
    return {'pos_tagger': {
                'function': 'tag_sents',
                'object': tagger_object
            }
    }
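To make the feature_detector contract described in the docstring concrete, here is a minimal sketch of a custom detector (the name suffix_detector and the feature choices are illustrative; ClassifierBasedTagger is the general-purpose NLTK class that accepts a feature_detector argument):

from nltk.corpus import treebank
from nltk.tag.sequential import ClassifierBasedTagger


def suffix_detector(tokens, index, history):
    # tokens: the unlabeled words; index: position being tagged;
    # history: tags already assigned to tokens[:index]
    word = tokens[index]
    return {
        'word': word.lower(),
        'suffix2': word[-2:],
        'prev_tag': history[index - 1] if index > 0 else '<START>',
    }


train_sents = treebank.tagged_sents()[:3000]
custom_tagger = ClassifierBasedTagger(train=train_sents,
                                      feature_detector=suffix_detector)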
Example #8
def myParse(sentence):
    print("ClassifierBasedPOSTagger tag:")
    brown_tagged_sents = brown.tagged_sents(categories='news')
    train_sents = brown_tagged_sents[:500000]  # far more than the ~4,600 news sentences, so this takes them all
    tagger = ClassifierBasedPOSTagger(
        train=train_sents)  # , classifier_builder=MaxentClassifier.train)
    mytagger = SQLPosTagger(tagger)

    words = nltk.word_tokenize(sentence)
    result = mytagger.tag(words)
    print(result)
Example #9
def parse():
    tagger_classes = ([nltk.UnigramTagger, nltk.BigramTagger])
    trained_sents, tagged_sents = trainer("WSJ_02-21.pos-chunk", "WSJ_23.pos")
    #tagger = nltk.UnigramTagger(trained_sents)
    print(len(trained_sents))
    tagger = ClassifierBasedPOSTagger(
        train=trained_sents[:10000],
        classifier_builder=lambda train_feats: MaxentClassifier.train(
            train_feats, trace=0, max_iter=10))
    f = open("WSJ_23.chunk", 'w')
    #print sents
    for sents in tagged_sents:
        (words, tags) = sents[0], sents[1]
        chunks = tagger.tag(tags)
        #print words, chunks
        wtc = zip(words, chunks)

        for tup in wtc:
            f.write("%s\t%s\n" % (tup[0], tup[1][1]))

        f.write("\n")
Example #10
def get_chunks(text_string):
    # tokenization
    print('Tokenising text...')
    sentences = sent_tokenize(text_string)
    tokenized_sentences = []
    for s in sentences:
        tokenized_sentences.append(word_tokenize(s))
    # PoS tagging
    train_sents = treebank.tagged_sents()
    print('Training PoS tagger...')
    tagger = ClassifierBasedPOSTagger(train=train_sents)
    tagged_sentences = []
    print('Tagging sentences...')
    for s in tokenized_sentences:
        tagged_sentences.append(tagger.tag(s))
    # chunking
    print('Getting trained chunk classifier...')
    chunk_classifier = get_trained_classifier()
    chunked_sentences = []
    print('Chunking sentences...')
    for s in tagged_sentences:
        chunked_sentences.append(chunk_classifier.parse(s))
    return chunked_sentences
Example #11
    def wordTagger(self, wordlist, number):
        train_sents = treebank.tagged_sents()[:3000]
        if number == 1:
            taglist = nltk.pos_tag(wordlist)
        elif number == 2:
            tagger = DefaultTagger('NN')
            taglist = tagger.tag(wordlist)
        elif number == 3:
            tagger = UnigramTagger(train_sents)
            taglist = tagger.tag(wordlist)
        elif number == 4:
            tnt_tagger = tnt.TnT()
            tnt_tagger.train(train_sents)
            taglist = tnt_tagger.tag(wordlist)
        elif number == 5:
            tagger = ClassifierBasedPOSTagger(train=train_sents)
            taglist = tagger.tag(wordlist)
        else:
            raise ValueError("number must be between 1 and 5")
        return taglist
Example #12
#####
#
# 3 classified tagger
#
#####
from nltk.tag.sequential import ClassifierBasedPOSTagger
print("started classified")
class_tagger = None
try:
    with open('test_pickles/class.pickle', 'rb') as fa:
        class_tagger = pickle.load(fa)
except FileNotFoundError as a:
    # training data
    print("dumping class")
    class_tagger = ClassifierBasedPOSTagger(train=train)

    with open('test_pickles/class.pickle', 'wb') as fb:
        pickle.dump(class_tagger, fb)
#print(class_tagger.evaluate(test))
print(class_tagger.tag(tokenized_words))
####
#
# 4 TnT
#
####
print("started tnt")
from nltk.tag import tnt
tnt_tagger = None

try:
Example #13
import nltk

from nltk.corpus import treebank

from nltk.tag import DefaultTagger

from nltk.tag.sequential import ClassifierBasedPOSTagger

default = DefaultTagger('NN')

train_sents = treebank.tagged_sents()[:3000]

test_sents = treebank.tagged_sents()[3000:]

tagger = ClassifierBasedPOSTagger(train=train_sents,
                                  backoff=default,
                                  cutoff_prob=0.3)

print(tagger.evaluate(test_sents))

#token = nltk.word_tokenize(title)  #title string tokenized

#removing all the punctuation  marks

#punctuation = re.compile(r'[-.?,":;()`~!@#$%^*()_=+{}]')

#tword = [punctuation.sub("", word) for word in token]

#print(tword) #without punctuation

#removing all the MS smart quotes
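The commented-out lines above only sketch the cleanup step; a working version (assuming a title string, as the comments suggest) might be:

import re

import nltk

token = nltk.word_tokenize(title)  # title string tokenized

# strip punctuation marks from each token
punctuation = re.compile(r'[-.?,":;()`~!@#$%^*()_=+{}]')
tword = [punctuation.sub("", word) for word in token]

# drop MS smart quotes and any tokens emptied by the substitution
smart_quotes = {'\u2018', '\u2019', '\u201c', '\u201d'}
tword = [w for w in tword if w and w not in smart_quotes]

print(tword)  # without punctuation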
Example #14
#adding the tagger

from nltk.corpus import treebank

from nltk.tag import DefaultTagger

from nltk.tag.sequential import ClassifierBasedPOSTagger

default = DefaultTagger('NN')

train_sents = treebank.tagged_sents()[:3000]

test_sents = treebank.tagged_sents()[3000:]

tagger = ClassifierBasedPOSTagger(train=train_sents, backoff=default, cutoff_prob=0.3)



#implementing it on the url names

ntag = tagger.tag(ntoken)


#extracting all the noun phrases from URL string

nlist = []

for word, tag in ntag:
    if tag == 'NN':
        nlist.append(word)
Example #15
#adding the tagger

from nltk.corpus import treebank

from nltk.tag import DefaultTagger

from nltk.tag.sequential import ClassifierBasedPOSTagger


default = DefaultTagger('NN')

train_sents = treebank.tagged_sents()[:3000]

test_sents = treebank.tagged_sents()[3000:]

tagger = ClassifierBasedPOSTagger(train=train_sents, backoff=default, cutoff_prob=0.3)


#tagger.evaluate(test_sents)


#applying the tagger

htag = tagger.tag(hd_tokens)

print(htag)


#extracting all the noun phrases from raw string

nlist = []
Example #16
# train
tic()
tnt_tagger = tnt.TnT()
tnt_tagger.train(train_sents)
tnt_eval['train_time'] = toc()
# test
tic()
tnt_eval['test_accuracy'] = tnt_tagger.evaluate(val_sents)
tnt_eval['test_time'] = toc()
# display results
display_training_metrics(tnt_eval)
""" 2. Naive Bayes classifier tagger """
nb_eval = dict()
# train
tic()
nb_tagger = ClassifierBasedPOSTagger(train=train_sents)
nb_eval['train_time'] = toc()
# test
tic()
nb_eval['test_accuracy'] = nb_tagger.evaluate(val_sents)
nb_eval['test_time'] = toc()
# display results
display_training_metrics(nb_eval)
""" 3. Naive Bayes classifier tagger with features """
nb_eval = dict()
# train
tic()
nb_tagger = ClassifierBasedTagger(train=train_sents,
                                  feature_detector=add_features)
nb_eval['train_time'] = toc()
# test
Example #17
from nltk.tag.sequential import ClassifierBasedPOSTagger
from tag_util import train_sents, test_sents

tagger = ClassifierBasedPOSTagger(train=train_sents)
print(tagger.evaluate(test_sents))
Example #18
# print( 'Training TnT...' )
# tnt_tagger = tnt.TnT()
# tnt_tagger.train(train_corpus)
# print( 'Testing...' )
# acc = tnt_tagger.evaluate(test_corpus)
# print( 'TnT accuracy={0}\n'.format(acc) )
#
# # ----------------------------------------------------------------------
#
# print( 'Training UnigramTagger...' )
# unigram_tagger = UnigramTagger(train_corpus)
# with open( 'unigram.pos_tagger.pickle', 'wb' ) as f:
#     pickle.dump( unigram_tagger, f )
#
# print( 'Testing...' )
# acc = unigram_tagger.evaluate(test_corpus)
# print( 'UnigramTagger accuracy={0}\n'.format(acc) )

# ----------------------------------------------------------------------

print('Training ClassifierBasedPOSTagger...')
cbt = ClassifierBasedPOSTagger(train=train_corpus)
print('Testing...')
acc = cbt.evaluate(test_corpus)
print('accuracy={0}\n'.format(acc))

print('Storing...')
with open(os.path.join(model_folder, 'ClassifierBasedPOSTagger.pickle'),
          'wb') as f:
    pickle.dump(cbt, f)
Example #19
class TrainingSetAnalyzer():
    '''
    This class handles the setting of the training set data
    and provides support for feature extraction given a text'''
    def __init__(self, limit=300, debug=True):
        '''Instantiate the TrainingSetAnalyzer

        Keyword arguments:
        @param limit: number of tweets to analyze (default 300)
        @param debug: flag for the development process
        '''
        self.__debug = debug
        self.__limit = limit
        self.__speller = SpellChecker()
        self.__splitter = Splitter("rtw")
        self.__replacer = RegexpReplacer()
        self.__ngramHandler = NgramHandler()

        train_sents = treebank.tagged_sents()[:3000]
        self.__tagger = ClassifierBasedPOSTagger(train=train_sents)

    def __analyzeSingleTweet(self, tweet):
        '''
        Helper function to get unigrams, emoticons, ngrams given a text

        Keyword arguments:
        @param: tweet the tweet to be analyzed
        '''

        chunks = self.__splitter.split(u'' + tweet)
        raw_feature_list_neg = []
        emot_list = []
        ngrams = []
        for subTweet in chunks:
            try:
                preprocessed_tweet = self.__replacer.preprocess(subTweet)
                acr_expanded, tmp_emot_list = self.__replacer \
                    .acr_emot_exctractor(preprocessed_tweet)
                emot_list += tmp_emot_list
                enanched_txt = self.__speller.check_and_replace(acr_expanded)
                tagged_sent = self.__tagger.tag(enanched_txt)
                raw_feature_list_neg += self.__replacer \
                    .filter_raw_feature_list(
                        acr_expanded)
                ngrams += self.__ngramHandler.exctract_ngrams(tagged_sent)
            except Exception as e:
                print("Sorry, something went wrong: %s txt: %s"
                      % (str(e), tweet))

        return raw_feature_list_neg, emot_list, ngrams

    def analyze(self):
        '''Analyze a set of tweets'''
        print("Found %i elements for training" % self.__limit)
        n = 0
        while n < 20:
            qs = get_tweets_for_analyzing(skip=n)
            for tweet in qs:
                raw_feature_list_neg, emot, ngrams = self.__analyzeSingleTweet(
                    tweet.text)
                if not self.__debug:
                    print "saving...."
                    tweet.set_features(raw_feature_list_neg, emot, ngrams)
            n += 1
        return

    def extract_features_for_classification(self, text):
        '''Helper function to extract features given a text

        Keyword arguments:
        @param text: the text whose features will be extracted
        '''
        raw_feature_list_neg, emot_list, ngrams = self.__analyzeSingleTweet(
            text)
        return raw_feature_list_neg, emot_list, ngrams, dict([
            (word, True) for word in raw_feature_list_neg + emot_list + ngrams
        ])

    def purge_useless_features(self):
        '''Helper function to prune less frequent unigram features'''

        tweets = get_tweets_for_pruning()
        print "Pruning process for %i tweets" % tweets.count()
        mrt = tweets.map_reduce(mapfunc_filter, reducefunc, "cn")
        mrt = filter(lambda status: status.value > PURGE_TRESHOLD, mrt)
        purged_qs = [item.key for item in mrt]
        for tweet in tweets:
            try:
                tweet.features.filtered_unigram = [
                    item for item in purged_qs
                    if item in tweet.features.raw_feature_list_neg
                ]
                tweet.save()
            except Exception as e:
                print(e)
        print("Done!")
Example #20
class WordNetTagger(SequentialBackoffTagger):
    def __init__(self, *args, **kwargs):
        SequentialBackoffTagger.__init__(self, *args, **kwargs)
        # map WordNet POS letters onto Penn Treebank tags
        self.wordnet_tag_map = {'n': 'NN', 's': 'JJ', 'a': 'JJ',
                                'r': 'RB', 'v': 'VB'}
        self.fd = FreqDist(treebank.words())

    def choose_tag(self, tokens, index, history):
        """
        Chooses a POS tag based on the WordNet tag
        """

        word = tokens[index]
        for synset in wordnet.synsets(word):
            self.fd[synset.pos()] += 1
        return self.wordnet_tag_map.get(self.fd.max())


# Using the wordnet tagger
wn_tagger = WordNetTagger()
accuracy = wn_tagger.evaluate(test_sents)
print(f"Accuracy of the wordnet tagger: {accuracy}\n")

# Classifier tagging
cl_tagger = ClassifierBasedPOSTagger(train=train_sents)
accuracy = cl_tagger.evaluate(test_sents)
print(f"Accuracy of the classifier tagger: {accuracy}\n")

# Saving pickle - Heavy one
with open('pickles/pos-taggers/classifier_tagger.pickle', 'wb') as file:
    pickle.dump(cl_tagger, file)
Example #21

#%%
# combined tagger with a list of taggers and use a backoff tagger
def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff


ct = combined_tagger(train_data=train_data,
                     taggers=[UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=rt)

# evaluating the new combined tagger with backoff taggers
print(ct.evaluate(test_data))
print(ct.tag(nltk.word_tokenize(sentence)))

#%%
## Training using Supervised classification algorithm

from nltk.classify import NaiveBayesClassifier, MaxentClassifier
from nltk.tag.sequential import ClassifierBasedPOSTagger

nbt = ClassifierBasedPOSTagger(train=train_data,
                               classifier_builder=NaiveBayesClassifier.train)

# evaluate tagger on test data and sample sentences
print(nbt.evaluate(test_data))
print(nbt.tag(nltk.word_tokenize(sentence)))
Example #22
#adding the tagger

from nltk.corpus import treebank

from nltk.tag import DefaultTagger

from nltk.tag.sequential import ClassifierBasedPOSTagger

default = DefaultTagger('NN')

train_sents = treebank.tagged_sents()[:3000]

test_sents = treebank.tagged_sents()[3000:]

tagger = ClassifierBasedPOSTagger(train=train_sents,
                                  backoff=default,
                                  cutoff_prob=0.3)

#applying the tagger

rawtag = tagger.tag(clean)

print(rawtag)

#extracting all the noun phrases from raw string

nlist = []

for word, tag in rawtag:
    if tag == 'NN':
        nlist.append(word)
Example #23
# adding the tagger

from nltk.corpus import treebank

from nltk.tag import DefaultTagger

from nltk.tag.sequential import ClassifierBasedPOSTagger


default = DefaultTagger("NN")

train_sents = treebank.tagged_sents()[:3000]

test_sents = treebank.tagged_sents()[3000:]

tagger = ClassifierBasedPOSTagger(train=train_sents, backoff=default, cutoff_prob=0.3)


# tagger.evaluate(test_sents)


# applying the tagger

htag = tagger.tag(hd_tokens)

print(htag)


# extracting all the noun phrases from raw string

nlist = []
Example #24
import nltk
from nltk.corpus import treebank
from nltk.tag import DefaultTagger

from nltk.classify import NaiveBayesClassifier, MaxentClassifier
from nltk.tag.sequential import ClassifierBasedPOSTagger

data = treebank.tagged_sents()
train_data = data[:3500]
test_data = data[3500:]
#print(train_data[0])

dt = DefaultTagger('NN')
print(dt.evaluate(test_data))

nt = ClassifierBasedPOSTagger(train=train_data,
                              classifier_builder=NaiveBayesClassifier.train)
print(nt.evaluate(test_data))
Example #25
def classify_based_tag_train():
    train_sents = treebank.tagged_sents()[:5000]
    #train_sents = brown.tagged_sents(categories='learned', tagset='universal')
    bigram_tagger = BigramTagger(train_sents)
    cbtagger = ClassifierBasedPOSTagger(train=train_sents, backoff=bigram_tagger)
    pickle.dump(cbtagger, open('my_tagger.pkl', 'wb'))
Example #26
#tagger

from nltk.corpus import treebank

from nltk.tag import DefaultTagger

from nltk.tag.sequential import ClassifierBasedPOSTagger


default = DefaultTagger('NN')

train_sents = treebank.tagged_sents()[:3000]

test_sents = treebank.tagged_sents()[3000:]

tagger = ClassifierBasedPOSTagger(train=train_sents, backoff=default, cutoff_prob=0.3)


#tagger.evaluate(test_sents)


#applying the tagger

rtag = tagger.tag(r)

print(rtag)

#extracting all the noun phrases from raw string

nlist = []
Example #27
  #Regexp - best to treat numbers? 
  regexp_tagger = RegexpTagger(patterns, backoff=nt)
  treebank_tagger = UnigramTagger(model=model,backoff=regexp_tagger)

  #skipping affix
  
  #skipping brill
  
  #TnT
  #Tried on 9/24; it took a long time evaluating accuracy
  #tagger = tnt.TnT(unk=backoff,Trained=True)
  #tagger.train(train_sents)

  #Used the classifier tagger because of its accuracy. Could play around with the
  #cutoff probability for falling back to the backoff tagger.
  tagger = ClassifierBasedPOSTagger(train=train_sents, backoff=regexp_tagger, cutoff_prob=0.3)

  print("Writing new tagger.pickle")
  f = open('tagger.pickle','w')
  pickle.dump(tagger,f)
  f.close()
else:
  print("Opening existing tagger.pickle")
  f = open('tagger.pickle','r')
  tagger = pickle.load(f)

#Chunker
train_new_chunker = True
if train_new_chunker:
  train_chunks = treebank_chunk.chunked_sents()[:3000]
  conll_train = conll2000.chunked_sents('train.txt')
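Picking up the comment above about experimenting with the cutoff probability: a quick parameter sweep might look like the following sketch (it assumes the script's train_sents and regexp_tagger, plus a held-out test_sents that this fragment does not show):

# hypothetical sweep over cutoff_prob; higher values defer to the backoff tagger more often
for p in (0.1, 0.3, 0.5):
    t = ClassifierBasedPOSTagger(train=train_sents, backoff=regexp_tagger, cutoff_prob=p)
    print(p, t.evaluate(test_sents))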
Example #28
#adding the tagger

from nltk.corpus import treebank

from nltk.tag import DefaultTagger

from nltk.tag.sequential import ClassifierBasedPOSTagger

default = DefaultTagger('NN')

train_sents = treebank.tagged_sents()[:3000]

test_sents = treebank.tagged_sents()[3000:]

tagger = ClassifierBasedPOSTagger(train=train_sents, backoff=default, cutoff_prob=0.3)


#applying the tagger

rawtag = tagger.tag(clean)

print(rawtag)

#extracting all the noun phrases from raw string

nlist = []

for word, tag in rawtag:
    if tag == 'NN':
        nlist.append(word)
Example #29
from nltk.tag.sequential import ClassifierBasedPOSTagger
import pickle

datas = open('Indonesian_Manually_Tagged_Corpus.tsv', 'r').read()
datas = datas.split('\n\n')

train_sents = []

for data in datas:
    train_sents.append(list(tuple(i.split('\t')) for i in data.split('\n')))

tagger = ClassifierBasedPOSTagger(train=train_sents)
tagger_files = open("indonesian_classifier_pos_tag.pickle", "wb")
pickle.dump(tagger, tagger_files)
tagger_files.close()
Example #30
print(tt.evaluate(test_data))
print(tt.tag(tokens))

def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff

ct = combined_tagger(train_data=train_data, 
                     taggers=[UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=rt)

print(ct.evaluate(test_data))
print(ct.tag(tokens))

from nltk.classify import NaiveBayesClassifier, MaxentClassifier
from nltk.tag.sequential import ClassifierBasedPOSTagger

nbt = ClassifierBasedPOSTagger(train=train_data,
                               classifier_builder=NaiveBayesClassifier.train)

print(nbt.evaluate(test_data))
print(nbt.tag(tokens))


# try this out for fun!
met = ClassifierBasedPOSTagger(train=train_data,
                               classifier_builder=MaxentClassifier.train)
print(met.evaluate(test_data))
print(met.tag(tokens))
Example #31
import nltk

from nltk.corpus import treebank

from nltk.tag import DefaultTagger

from nltk.tag.sequential import ClassifierBasedPOSTagger

default = DefaultTagger('NN')

train_sents = treebank.tagged_sents()[:3000]

test_sents = treebank.tagged_sents()[3000:]

tagger = ClassifierBasedPOSTagger(train=train_sents, backoff=default, cutoff_prob=0.3)


print(tagger.evaluate(test_sents))

#token = nltk.word_tokenize(title)  #title string tokenized

#removing all the punctuation  marks

#punctuation = re.compile(r'[-.?,":;()`~!@#$%^*()_=+{}]')

#tword = [punctuation.sub("", word) for word in token]

#print(tword) #without punctuation

#removing all the MS smart quotes