Example #1
def question3():
    tagger = CRFTagger(feature_func=feature_func)

    tagger.train(train_sentences, 'model.crf.tagger')

    print(tagger.evaluate(test_sentences))
    return
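Here feature_func, train_sentences, and test_sentences are defined elsewhere in the source project (the same setup reappears in Example #4). As a hedged, minimal sketch of a compatible feature function: CRFTagger calls it as feature_func(tokens, idx) and expects back a list of feature strings. The feature names below (WORD_, SUFFIX_, CAPS, PREV_) are illustrative, not taken from the original project.

def feature_func(tokens, idx):
    # Build string features for the token at position idx.
    token = tokens[idx]
    features = ['WORD_' + token, 'SUFFIX_' + token[-2:]]
    if token[0].isupper():
        features.append('CAPS')
    if idx > 0:
        features.append('PREV_' + tokens[idx - 1])  # window of size 1
    return features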
Example #2
File: crf_baseline.py Project: qdtn/SCL
def run_crf(trainfile, testfile, model_file=None):

    maxlen = 100
    sents_train, tags_train, unique_words_train, unique_tags_train = \
        P.retrieve_sentences_tags(trainfile, maxlen=maxlen)
    sents_test, tags_test, unique_word_test, unique_tags_test = \
        P.retrieve_sentences_tags(testfile, maxlen=maxlen, allowedtags=unique_tags_train)

    # Pair each token with its tag. The original Python 2 code decoded
    # byte strings via unicode(..., "utf-8"); in Python 3, decode with
    # bytes.decode("utf-8") only if the helper returns bytes.
    train_data = []
    for sent, tags in zip(sents_train, tags_train):
        train_data.append(list(zip(sent, tags)))

    crf = CRFTagger()
    if model_file is None:
        crf.train(train_data, model_file='data/crf.mdl')
    else:
        crf.set_model_file(model_file)

    test_data = []
    for sent, tags in zip(sents_test, tags_test):
        test_data.append(list(zip(sent, tags)))

    print(crf.evaluate(test_data))
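The helper P.retrieve_sentences_tags is specific to this project, but the shape it must feed into CRFTagger.train() is fixed: a list of sentences, each a list of (token, tag) string pairs, as the loops above build. A minimal self-contained illustration of that shape (the tags here are invented):

from nltk.tag import CRFTagger

# Each sentence is a list of (token, tag) pairs; both must be strings.
train_data = [[('the', 'DT'), ('dog', 'NN'), ('barks', 'VBZ')],
              [('a', 'DT'), ('cat', 'NN'), ('sleeps', 'VBZ')]]

crf = CRFTagger()
crf.train(train_data, 'crf.mdl')        # writes the model to crf.mdl
print(crf.tag(['the', 'cat', 'barks']))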
Example #3
def cltk_pos_cv(full_training_set, local_dir_rel, counter):
    local_dir = os.path.expanduser(local_dir_rel)
    
    stdout_old = sys.stdout
    
    sys.stdout = open(os.path.join(local_dir, 'test_%d.out'%counter), 'w')  
    
    # read POS corpora
    print("local_dir", local_dir)
    train_reader = TaggedCorpusReader(local_dir, 'train_%d.pos'%counter)
    train_sents = train_reader.tagged_sents()

    test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos'%counter)
    test_sents = test_reader.tagged_sents()
    
    print('Loop #' + str(counter))
    
    sys.stdout.flush()
    
    # make crf tagger
    crf_tagger = CRFTagger()
    crf_tagger.train(train_sents, 'model.crf.tagger')

    #crf_tagger = UnigramTagger(train_sents)
    
    # evaluate crf tagger
    crf_accuracy = crf_tagger.evaluate(test_sents)
    print('crf:', crf_accuracy)
    
    sys.stdout = stdout_old
Example #4
def question3():

    tagger = CRFTagger(feature_func=feature_func)
    tagger.train(train_sentences, 'model_windows_size_1.crf.tagger')

    #tagger = CRFTagger(feature_func=feature_func)
    #tagger.set_model_file('model_windows_size_1.crf.tagger')

    print(tagger.evaluate(test_sentences))
    return
Example #5
def main():
    import pickle
    from nltk.tag import CRFTagger
    infolist = pickle.load(open('infolist.pickle', 'rb'))

    ct = CRFTagger()

    train_data = [[(x, z)
                   for [x, y, z] in infolist[:round(0.9 * len(infolist))]]]

    ct.train(train_data, 'model.crf.tagger')
    ners = ct.tag_sents(
        [[x for [x, y, z] in infolist[round(0.9 * len(infolist)):]]])
    print(ners)

    gold_sentences = [[(x, z)
                       for [x, y, z] in infolist[round(0.9 * len(infolist)):]]]

    print(ct.evaluate(gold_sentences))
Example #6
def crf_tag():
    news_text = brown.tagged_sents(categories='news')
    train_sents = news_text[:3230]
    test_sents = news_text[3230:4600]
    ct = CRFTagger()
    ct.train(train_sents, 'model.crf.tagger')  # train() returns None; the model is saved to the file
    test = ct.evaluate(test_sents)
    print(test)
    sent3 = "Narendra Modi won Lok Sabha election with massive majority after long years"
    sent_w = sent3.lower().split()
    print(sent_w)
    tag = ct.tag(sent_w)
    print("The Tag Is:", tag)
Example #7
def ontweetdata():
    tweetinfolist = pickle.load(open('tweetinfolist.pickle',
                                     'rb'))  #data from tweets
    counter = 0
    for item in tweetinfolist:
        if item[1] == "O":
            counter = counter + 1
    print("BASELINE: ", (counter) / len(tweetinfolist))

    ct = CRFTagger()
    train_data = [[
        (x.lower(), y.lower())
        for [x, y] in tweetinfolist[round(0.9 * len(tweetinfolist)):]
    ]]
    # set_model_file() before train() is redundant: train() overwrites
    # the model file and loads it when done.
    ct.train(train_data, 'model.crf.tagger')

    gold_sentences = [[
        (x.lower(), y.lower())
        for [x, y] in tweetinfolist[:round(0.9 * len(tweetinfolist))]
    ]]

    print(ct.evaluate(gold_sentences))
Example #8
def oninfolist():
    """NU DOEN: KIJK NAAR FORMAT VAN GEGEVEN INFORMATIE OP INTERNET IN VOORBEELD, CHECK ALLE LIJSTEN DIE IK GEMAAKT HEB OF ZE OVEREENKOMEN MET DE VORM"""

    #SEE: http://www.nltk.org/_modules/nltk/tag/crf.html
    infolist = pickle.load(open('sentencelist.pickle', 'rb'))
    limit = round(len(infolist) * 0.4)
    train_data = infolist[0:limit]
    #print("train_data = ", train_data[0:10])

    ct = CRFTagger()
    # The original never trained or loaded a model before tagging, which
    # would raise an error at tag_sents()/evaluate(); train on the 40%
    # split built above.
    ct.train(train_data, 'model.crf.tagger')
    #print(infolist[0:10])

    realsentences = []
    realsentence = ""
    """
	for sentence in infolist[limit:]:
		for (word,nertag) in sentence:
			realsentence = realsentence +" "+ word
		realsentences.append(realsentence)
		realsentence = ""
	pickle.dump(realsentences,open("realsentences.pickle","wb"))
	print("pickle-bestand gemaakt")
	"""
    realsentences = pickle.load(open("realsentences.pickle", "rb"))
    print("REALSENTENCES:", realsentences[0:10])
    splitsentences = []  #[['dog','is','good'],['cat','eat','meat']]
    for r in realsentences:
        splitsentence = r.split()
        splitsentences.append(splitsentence)

    #print("train_data:", infolist[0:10])
    #print("sentences for tag_sents:", splitsentences[0:10])
    # splitsentences was built from realsentences, which already cover
    # infolist[limit:], so slicing again with [limit:] would skip data.
    ct.tag_sents(splitsentences)  # result unused; evaluate() retags internally
    gold_sentences = infolist[limit:]
    print("GOLD SENTENCES:", gold_sentences[0:10])
    print(ct.evaluate(gold_sentences))
Example #9
from nltk.tag import CRFTagger
ct = CRFTagger()

train_data = [[('Universiteit', 'Noun'), ('is', 'Verb'), ('een', 'Det'),
               ('goed', 'Adj'), ('goede', 'Adj'), ('plek', 'Noun'),
               ('hond', 'Noun'), ('eet', 'Verb'), ('vlees', 'Noun')]]

ct.train(train_data, 'model.crf.tagger')
ct.tag_sents([['hond', 'is', 'goed'], ['kat', 'eet', 'vlees']])

gold_sentences = [[('hond', 'Noun'), ('is', 'Verb'), ('goed', 'Adj')],
                  [('kat', 'Noun'), ('eet', 'Verb'), ('vlees', 'Noun')]]
ct.evaluate(gold_sentences)

ct = CRFTagger()
ct.set_model_file('model.crf.tagger')
print(ct.evaluate(gold_sentences))
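This is the canonical usage pattern from the NLTK documentation: train, save, tag, evaluate, then reload the model file elsewhere. Two hedged notes: CRFTagger wraps the external python-crfsuite package, which must be installed separately (pip install python-crfsuite), and recent NLTK releases (3.6+) deprecate evaluate() in favour of accuracy():

# Assumption: NLTK >= 3.6, where TaggerI.accuracy() replaces the
# deprecated TaggerI.evaluate(); python-crfsuite must be installed.
ct = CRFTagger()
ct.set_model_file('model.crf.tagger')
print(ct.accuracy(gold_sentences))  # same score evaluate() would return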
Example #10
class DataAdapter(object):
    def __init__(self, data=()):  # immutable default instead of a mutable []
        self.tagger = CRFTagger()
        self.tagger.set_model_file('model.crf.tagger')

        # Initialise both attributes so tag_sents()/evaluate() cannot hit
        # an AttributeError when no data is supplied.
        self.data_tagging = None
        self.data_testing = None
        # Assumption: data may be a pymongo cursor, where count(True) is
        # Cursor.count(with_limit_and_skip=True); for a plain sequence it
        # simply counts True elements.
        if data.count(True) > 0:
            self.data_tagging, self.data_testing = self.for_tagging_testing(
                data)
            # print('TAGGING', self.data_tagging)
            # print('TESTING', self.data_testing)

    def tokenize_tag(self, text):
        text = text.replace('\r', ' | ').replace('\n', ' | ')
        tokens = word_tokenize(text, preserve_line=True)
        labels = []
        for label in self.tag(tokens):
            labels.append(label[1])
        return tokens, labels

    def for_tagging_testing(self, data):
        # self.data = data
        array_tagging = []
        array_testing = []
        for d in data:
            all_tags = []
            all_test = []
            for index, t in enumerate(d['text']):
                one_tag = [t, d['label'][index]]
                all_test.append(one_tag)
                all_tags.append(t)
            array_tagging.append(all_tags)
            array_testing.append(all_test)
            # print(all_tags)
        return array_tagging, array_testing

    def for_testing(self, data):
        # self.data = data
        array = []
        # print('TEST', data.count())
        for d in data:
            all_tags = []
            for index, t in enumerate(d['text']):
                # one_tag = [t, (d['label'][index] if is_ascii(d['label'][index]) else 'O')]
                one_tag = [t, d['label'][index]]
                all_tags.append(one_tag)
            array.append(all_tags)
            # print(all_tags)
        return array

    def for_tagging(self, data):
        # self.data = data
        array = []
        for d in data:
            all_tags = []
            for t in d['text']:
                all_tags.append(t)
            array.append(all_tags)
            # print(all_tags)
        return array

    def tag_sents(self):
        if self.data_tagging is not None:
            return self.tagger.tag_sents(self.data_tagging)
        else:
            return 'NoData'

    def tag(self, data):
        return self.tagger.tag(data)

    def evaluate(self):
        if self.data_testing is not None:
            return self.tagger.evaluate(self.data_testing)
        else:
            return 'NoData'

    def train(self, data):
        data = self.for_testing(data)
        self.tagger.train(data, 'model.crf.tagger')
        print('ACCURACY:', self.tagger.evaluate(data))
Example #11
File: own_model.py Project: Elixeus/NLP
import os

from nltk.corpus import treebank
from nltk.tag import tnt, CRFTagger


# split training data from test data
train_data = treebank.tagged_sents()[:3000]
test_data = treebank.tagged_sents()[3000:]

# train a trigram N tagger (TnT)
tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train(train_data)
print(tnt_pos_tagger.evaluate(test_data))

# train a CRF tagger ('~' is not expanded automatically, so expand it)
crf_tagger = CRFTagger()
crf_tagger.train(train_data,
                 os.path.expanduser('~/Documents/NLP/NLP/crf_model.txt'))
print(crf_tagger.evaluate(test_data))
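Both taggers above train on NLTK's bundled Penn Treebank sample; if that corpus has not been fetched yet, a one-time download is needed first (a hedged setup step, assuming default NLTK data paths):

import nltk
nltk.download('treebank')  # one-time fetch of the corpus used above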
Example #12
# -*- coding: utf-8 -*-
from nltk.tag import CRFTagger
import re
import codecs
def get_data():
    with codecs.open('th_pud-ud-test.conllu', 'r', encoding='utf8') as f:
        lines = f.read()
    # split off the "# sent_id" / "# text" comment pairs between sentences
    return re.split(r"#(.*)+[\r\n]#(.*)+[\r\n]", lines)

data = get_data()
data_all = []
for block in data:
    data_list = []
    for r in block.split('\n'):
        t = [x for x in r.split('\t') if x != '']
        # CoNLL-U columns: ID, FORM, LEMMA, UPOS, ...; guard against
        # stray comment fragments with fewer than four columns
        if len(t) > 3:
            data_list.append((t[1], t[3]))
    data_all.append(data_list)
gold_data = [x for x in data_all if x != []]  # gold sentences for scoring

ct = CRFTagger()
ct.set_model_file('model.crf.tagger')
print(ct.evaluate(gold_data))
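This snippet only loads and scores a pre-existing 'model.crf.tagger'. A hedged sketch of producing that file from the same parsed sentences, with an illustrative 50/50 split rather than evaluating on the data the model was trained on:

# Hedged sketch (assumption: no pre-trained model exists yet).
half = len(gold_data) // 2
trainer = CRFTagger()
trainer.train(gold_data[:half], 'model.crf.tagger')  # writes the model file
print(trainer.evaluate(gold_data[half:]))            # score the held-out half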
Example #13

# In[ ]:


tagger = BigramTagger(train_reducido[:1000])
tagger.evaluate(test_reducido[:1000])
entrenar_bill(tagger,"BigramTagger")


# In[ ]:


ct = CRFTagger()
ct.train(train_reducido[:1000], 'model.crf.tagger')
evaluacion = ct.evaluate(test_reducido[:1000])
xlabels.append("CRF Tagger")
accuracys.append(evaluacion)


# In[ ]:


tagger = PerceptronTagger(load=False)
tagger.train(train_reducido[:1000])
evaluacion = tagger.evaluate(test_reducido[:1000])
xlabels.append("Perceptron Tagger")
accuracys.append(evaluacion)


# In[ ]:
Example #14
    language = page.find('language').text
    pos = page.find('pos_tags').text
    splitText = text.split(" ")[1:-1]
    posText = pos.split(" ")[1:-1]
    for i in range(len(splitText)):
        l.append((splitText[i], posText[i]))
    data.append(l)
    count = count + 1
shuffle(data)
print(len(data))
# Divide data into train (90%) and test (10%) sets
split_point = count * 0.9
training_set = data[0:int(split_point)]
test_set = data[int(split_point):]
print(training_set)
# Train
ct = CRFTagger()
train_data = training_set
ct.train(train_data, 'model.crf.tagger')

# Accuracy
gold_sentences = test_set
print(ct.evaluate(gold_sentences))

print("Give a sentence...")
# Test
test_sent = input().split(' ')
# print(test_sent)
print(ct.tag_sents([test_sent]))
Example #15
print "\nReading training corpus...."
ListOfSentences_Training = corpusRead(Training_Data)
print "Reading test corpus...."  
ListOfSentences_Test = corpusRead(Test_Data)




#CRF Training
ct = CRFTagger()
print "CRF Training starts..."
ct.train(ListOfSentences_Training,'model.crf.tagger')
print "CRF Training is done."

print "Testing starts"
print "Accuracy of CRF is = ",ct.evaluate(ListOfSentences_Test) * 100
#Tagging by CRF Tagger
ch = 'y'
while (ch != 'n'):
    text = raw_input("Enter the text to be tagged : \n")
    text = converter(text)
    print ct.tag_sents(text)
    print "\nDo you want to continue ?"
    ch = raw_input()
 



#HMM Training

print "HMM Training using HiddenMarkovModelTrainer() starts.."
Example #16
from nltk.tag import CRFTagger
from nltk.corpus import brown

tagged_sents = brown.tagged_sents()
train = tagged_sents[:50000]
test = tagged_sents[50000:]

crf = CRFTagger()
crf.train(train, 'crf_tagger.model')
a = crf.evaluate(test)
print(a)
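A trained model persists in the file passed to train(), so later runs can skip retraining; a minimal sketch reusing 'crf_tagger.model' from above:

crf2 = CRFTagger()
crf2.set_model_file('crf_tagger.model')  # load instead of retrain
print(crf2.tag('The jury said the election was fair'.split()))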
Example #17
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    crf_accuracies = []
    
    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list

    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(sentence_count / 10)

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i+n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in enumerate(ten_parts):
        # map test list to part of given loop
        test_set = part
        
        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not part]
        
        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]
            
        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()

        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()
        
        print('Loop #' + str(counter))
        # make crf tagger
        crf_tagger = CRFTagger()
        crf_tagger.train(train_sents, 'model.crf.tagger')
        
        # evaluate crf tagger
        crf_accuracy = crf_tagger.evaluate(test_sents)
        crf_accuracies.append(crf_accuracy)
        print('crf:', crf_accuracy)

        #if counter> 0: break
        
    final_accuracies_list = []
    mean_accuracy_crf = mean(crf_accuracies)
    standard_deviation_crf = stdev(crf_accuracies)
    uni = {'crf': {'mean': mean_accuracy_crf, 'sd': standard_deviation_crf}}
    final_accuracies_list.append(uni)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)
    
    return final_dict
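The function assumes module-level imports from its source project; a hedged reconstruction of what it needs (mean and stdev most plausibly come from the standard-library statistics module):

import math
import os
import random
from statistics import mean, stdev

from nltk.corpus.reader import TaggedCorpusReader
from nltk.tag import CRFTagger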
Example #18
        line = f.readline()
    f.close()

res = ct.tag_sents(test_sentences)
tagged_result = []
tagged_actual = []
for i in range(len(res)):
    for j in range(len(res[i])):
        tagged_result.append(res[i][j][1])
        tagged_actual.append(test_actual[i][j][1])
print(res[0])
print(test_actual[0])
# print(tagged_result[0])
# print(tagged_actual[0])

gold_sentences = test_actual
accuracy = ct.evaluate(gold_sentences)
print("accuracy:" + str(accuracy))

# recall = nltk.metrics.scores.recall(test_actual, res)
# Note: sklearn's precision/recall/F1 default to binary averaging;
# multi-class tag sets need e.g. average='micro'.
precision = precision_score(tagged_actual, tagged_result)
print("precision:" + str(precision))

recall = recall_score(tagged_actual, tagged_result)
print("recall:" + str(recall))


f1 = f1_score(tagged_actual, tagged_result)
print("F1_score:" + str(f1))
Example #19
File: crftagger.py Project: s2589036/ltp
def onsentencelist():
    ct = CRFTagger()

    """sentencelist contains NER-tagged sentences"""
    sentencelist = pickle.load(open('sentencelist.pickle', 'rb'))

    """training size as a fraction"""
    trainingsize = 0.9

    """calculate where to split the data"""
    limit = round(trainingsize * len(sentencelist))

    """wordsentencelist contains the same sentences, not NER-tagged"""
    wordsentencelist = pickle.load(open("wordsentencelist.pickle", "rb"))

    """train the data / choose one of the 2 blocks"""
    #train_data = sentencelist[:limit]
    #ct.train(train_data, 'model.crf.tagger')
    ct.set_model_file('tweetmodel.crf.tagger')

    """Test data and evaluate"""
    test_data = wordsentencelist[limit:]
    ct.tag_sents(test_data)  # tagging sentences
    gold_sentences = sentencelist[limit:]
    print("\nAccuracy:", ct.evaluate(gold_sentences))

    """TURN TRAINED TAGGED LIST AND TEST LIST INTO ONE LIST CONTAINING
    ONLY THE TRUE AND PRED TAGS"""
    pred_nerlist = []
    for sentence in wordsentencelist[:limit]:
        for (word, nertag) in ct.tag(sentence):
            #pred_nerlist.append((word, nertag))
            pred_nerlist.append(nertag.lower())

    true_nerlist = []
    #ct_true = gold_sentences
    for sentence in sentencelist[:limit]:
        for (word, nertag) in sentence:
            #true_nerlist.append((word, nertag))
            true_nerlist.append(nertag.lower())

    """Print baseline"""
    #print("\nBaseline = 0.9048987094135446 (everything tagged O)")

    """Print F-score and confusion matrix"""
    #print(len(pred_nerlist))
    #print(len(true_nerlist))

    print("\nF-score (micro):", f1_score(true_nerlist, pred_nerlist, average='micro'))
    print("\nF-score (macro):", f1_score(true_nerlist, pred_nerlist, average='macro'))
    print("\nF-score (weighted):", f1_score(true_nerlist, pred_nerlist, average='weighted'))
    print("\nF-score (None):", f1_score(true_nerlist, pred_nerlist, average=None,
          labels=["o", "b-per", "i-per", "b-loc", "i-loc", "b-org", "i-org", "b-misc", "i-misc"]))

    print("\nConfusion matrix:\n")
    for item in ["O", "B-per", "I-per", "B-loc", "I-loc", "B-org", "I-org", "B-misc", "I-misc"]:
        print("  ", item, end="")
    print("\n", confusion_matrix(true_nerlist, pred_nerlist,
          labels=["o", "b-per", "i-per", "b-loc", "i-loc", "b-org", "i-org", "b-misc", "i-misc"]))
    train_data.append(lists_e)

print('Training the CRF++ model started')
ct.train(train_data,
         'model.crf.tagger')  # training the crf model using the training set
print('Training the CRF++ model completed')

test_number_lines = 0
print('Reading Unseen Data')
for lines in test_set:
    test_number_lines = test_number_lines + 1
    print('Processing file', test_number_lines)
    xx = lines.split()
    test_data.append(xx)

tagged_sent = ct.tag_sents(test_data)  # tagging the the unseen data

for tag_set in tagged_sent:
    tagged_text.write(str(tag_set))
    tagged_text.write('\n')

tagged_text.close()
print('Tagging the Unseen data Completed')

for vz in gold_line:
    vz = "[" + vz + "]"
    gold_list = ast.literal_eval(vz)
    gold_data.append(gold_list)

print('The Total Accuracy of the System is:' + str(ct.evaluate(gold_data)))