Example No. 1
import re

from nltk.tag import CRFTagger


def train_pos_tag(dataset_dir, output_path):
    jumSample = 500000
    namaFile = dataset_dir
    with open(namaFile, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    pasangan = []
    allPasangan = []

    for line in lines[:min(jumSample, len(lines))]:
        # Remove Wiki Tags
        line = re.sub('<[^>]*>', '', line)
        if line == '':
            if len(pasangan) != 0:
                allPasangan.append(pasangan)
            pasangan = []
        else:
            kata, tag = line.split('\t')
            p = (kata, tag)
            pasangan.append(p)

    ct = CRFTagger()
    print("Training Tagger...")
    ct.train(allPasangan, output_path)
    print("Training Complete")
Example No. 2
def cltk_pos_cv(full_training_set, local_dir_rel, counter):
    local_dir = os.path.expanduser(local_dir_rel)
    
    stdout_old = sys.stdout
    
    sys.stdout = open(os.path.join(local_dir, 'test_%d.out'%counter), 'w')  
    
    # read POS corpora
    print("local_dir", local_dir)
    train_reader = TaggedCorpusReader(local_dir, 'train_%d.pos'%counter)
    train_sents = train_reader.tagged_sents()

    test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos'%counter)
    test_sents = test_reader.tagged_sents()
    
    print('Loop #' + str(counter))
    
    sys.stdout.flush()
    
    # make crf tagger
    crf_tagger = CRFTagger()
    crf_tagger.train(train_sents, 'model.crf.tagger')

    #crf_tagger = UnigramTagger(train_sents)
    
    # evaluate crf tagger
    crf_accuracy = crf_tagger.evaluate(test_sents)
    print('crf:', crf_accuracy)
    
    sys.stdout.close()
    sys.stdout = stdout_old
Example No. 3
class NamedEntityChunker(ChunkParserI):
  def __init__(self, train_sents, **kwargs):
    assert isinstance(train_sents, Iterable)
 
    self.feature_detector = features
    self.tagger = CRFTagger(
      feature_func=features
    )
    self.tagger.train(train_sents, 'model.crf.tagger')

    # self.tagger = ClassifierBasedTagger(
    #   train=train_sents,
    #   feature_detector=features,
    #   **kwargs)
 
  def parse(self, tagged_sent):
    chunks = self.tagger.tag(tagged_sent)
 
    # Transform the result from [((w1, t1), iob1), ...] 
    # to the preferred list of triplets format [(w1, t1, iob1), ...]
    iob_triplets = [(w, t, c) for ((w, t), c) in chunks]
    # iob_triplets = [(w, t, 'O') for ((w, t), c) in chunks]
 
    # Transform the list of triplets to nltk.Tree format
    return conlltags2tree(iob_triplets)
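A hedged usage sketch for the chunker above, assuming `train_sents` holds sentences of ((word, pos), iob) pairs and the `features` function is defined elsewhere:

# Hypothetical usage; the sentence is illustrative.
import nltk

chunker = NamedEntityChunker(train_sents)
tagged = nltk.pos_tag(nltk.word_tokenize("Mark Pedersen works at Alphabet in London."))
print(chunker.parse(tagged))   # an nltk.Tree with the predicted NE chunks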
Example No. 4
def question3():
    tagger = CRFTagger(feature_func=feature_func)

    tagger.train(train_sentences, 'model.crf.tagger')

    print(tagger.evaluate(test_sentences))
    return
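The `feature_func` passed to CRFTagger above is not shown. A hedged sketch of what such a function might look like; CRFTagger calls it with the sentence's token list and an index and expects a list of feature strings:

# Hypothetical feature function; the one actually used in question3 may differ.
def feature_func(tokens, idx):
    word = tokens[idx]
    feats = ['WORD_' + word, 'SUFFIX3_' + word[-3:], 'LEN_' + str(len(word))]
    if word[0].isupper():
        feats.append('CAPITALIZED')
    if any(ch.isdigit() for ch in word):
        feats.append('HAS_DIGIT')
    if idx > 0:
        feats.append('PREV_' + tokens[idx - 1])
    return feats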
Example No. 5
def cltk_pos_cv(full_training_set, local_dir_rel, counter):
    local_dir = os.path.expanduser(local_dir_rel)

    stdout_old = sys.stdout

    sys.stdout = open(os.path.join(local_dir, 'test_%d.out' % counter), 'w')

    # read POS corpora
    print("local_dir", local_dir)
    train_reader = TaggedCorpusReader(local_dir, 'train_%d.pos' % counter)
    train_sents = train_reader.tagged_sents()

    test_reader = TaggedCorpusReader(local_dir, 'test_%d.pos' % counter)
    test_sents = test_reader.tagged_sents()

    print('Loop #' + str(counter))

    sys.stdout.flush()

    # make crf tagger
    crf_tagger = CRFTagger()
    crf_tagger.train(train_sents, 'model.crf.tagger')

    #crf_tagger = UnigramTagger(train_sents)

    # evaluate crf tagger
    crf_accuracy = crf_tagger.evaluate(test_sents)
    print('crf:', crf_accuracy)

    sys.stdout.close()
    sys.stdout = stdout_old
Example No. 6
def run_crf(trainfile, testfile, model_file=None):

    maxlen = 100
    sents_train, tags_train, unique_words_train, unique_tags_train = \
        P.retrieve_sentences_tags(trainfile, maxlen=maxlen)
    sents_test, tags_test, unique_word_test, unique_tags_test = \
        P.retrieve_sentences_tags(testfile, maxlen=maxlen, allowedtags=unique_tags_train)

    train_data = []
    for n, st in enumerate(sents_train):
        s = []
        for m, _ in enumerate(st):
            s.append((unicode(sents_train[n][m], "utf-8")
                      , unicode(tags_train[n][m], "utf-8")))
        train_data.append(s)

    crf = CRFTagger()
    if model_file is None:
        crf.train(train_data, model_file='data/crf.mdl')
    else:
        crf.set_model_file(model_file)

    test_data = []
    for n, st in enumerate(sents_test):
        s = []
        for m, _ in enumerate(st):
            s.append((unicode(sents_test[n][m], "utf-8")
                      , unicode(tags_test[n][m], "utf-8")))
        test_data.append(s)

    print(crf.evaluate(test_data))
Example No. 7
def tagSentences(path, training_list=[], testing_list=[]):
    ct = CRFTagger()
    train_list = getTrainList(training_list)
    ct.train(train_list, 'model.crf.tagger')
    sentences = getSentences(path, testing_list)
    tagged_sentences = ct.tag_sents(sentences)
    return tagged_sentences
Example No. 8
def train_taggers():
    train_sents = load_pkl('train_sents')

    # instantiate taggers
    unigram_tagger = nltk.UnigramTagger(train_sents)
    tnt_tagger = tnt.TnT()
    perceptron_tagger = perceptron.PerceptronTagger(load=False)
    # limit the number of iterations as the training takes too long
    crf_tagger = CRFTagger(training_opt={'max_iterations': 100})

    print('Unigram tagger has already been trained')
    save_pkl(unigram_tagger, 'unigram-tagger')

    print('training TnT tagger ...', end='', flush=True)
    tnt_tagger.train(train_sents)
    print('Done')
    save_pkl(tnt_tagger, 'tnt-tagger')

    print('training Perceptron tagger ...', end='', flush=True)
    perceptron_tagger.train(train_sents)
    print('Done')
    save_pkl(perceptron_tagger, 'perceptron-tagger')

    print('training CRF tagger ...', end='', flush=True)
    crf_tagger.train(train_sents, 'crf-tagger.model')
    print('Done')
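Unlike the pickled taggers, the CRF model is persisted by CRFTagger/python-crfsuite itself, so it is reloaded from its model file rather than unpickled. A hedged sketch:

# Reload the model written by train_taggers(); the sample sentence is illustrative.
from nltk.tag import CRFTagger

crf_tagger = CRFTagger()
crf_tagger.set_model_file('crf-tagger.model')
print(crf_tagger.tag(['This', 'is', 'a', 'sample', 'sentence']))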
Example No. 9
class CRF:
    def __init__(self):
        self.__model = None  # replaced by a trained CRFTagger in train()

    def train(self, X_training_data):
        self.__model = CRFTagger()
        self.__model.train(X_training_data, 'crf.model')

    def test(self, X_test_data):

        total = 0
        correct = 0
        for kalimat in X_test_data:
            temp = []
            for word in kalimat:
                temp.append(word[0])

            if len(temp) != 0:
                predicted_y = self.__model.tag(temp)
                for i in range(len(predicted_y)):
                    total += 1
                    if predicted_y[i][1] == kalimat[i][1]:
                        correct += 1

        print(correct, total)
        print(correct / total)
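A hedged usage sketch for the wrapper above; both arguments are assumed to be lists of sentences, each a list of (word, tag) pairs, and the tag values shown are illustrative:

X_train = [[('saya', 'PRP'), ('makan', 'VB')], [('dia', 'PRP'), ('tidur', 'VB')]]
X_test = [[('saya', 'PRP'), ('tidur', 'VB')]]

model = CRF()
model.train(X_train)   # writes crf.model to the working directory
model.test(X_test)     # prints correct, total and the token accuracy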
Example No. 10
class SlotTaggingModel(object):

    def __init__(self, **argparams):
        self.train_data = argparams['train_data']
        if self.train_data is not None:
            assert isinstance(self.train_data, DataSetCSVagentActPred)
        self.model_folder = argparams['model_folder']
        self.model_fname = '{}/slotTagging.model'.format(self.model_folder)

    def train(self, verbose=True):
        assert self.train_data is not None, 'train_data is required.'
        print('\ttraining ...')
        # transform data
        instance_list = self._transform_data(self.train_data)
        userUtterTag_train_fname = '{}/userUtterTag_train.txt'.format(self.model_folder)
        writeUtterTag(instance_list, userUtterTag_train_fname)
        print('\ttrain_data={}'.format(userUtterTag_train_fname))
        # train model
        self.model = CRFTagger(verbose=verbose)
        self.model.train(instance_list, self.model_fname)
        print('\tmodel_fname={}'.format(self.model_fname))
        print('\tsaving model ...')

    def _transform_data(self, data):
        ''' convert textual utterances and their tags into a list of instances,
        each a list of (word, tag) pairs
        '''
        userUtter_txt = data.userUtter_txt
        userTag_txt = data.userTag_txt
        instance_list = list()
        for words, tags in zip(userUtter_txt, userTag_txt):
            instance = [(word.strip(), tag.strip()) for word, tag in zip(words.decode('utf-8').strip().split(), tags.decode('utf-8').strip().split())]
            instance_list.append(instance)
        return instance_list

    def predict(self, test_data):
        '''return a list of lists, [[(w1, tag1), (w2, tag2), (w3, tag3)], [...], [...]]
        '''
        assert test_data is not None, 'test_data is required.'
        assert isinstance(test_data, DataSetCSVagentActPred)
        print('\tpredicting Slot Tags ...')
        # transform data
        instance_list = self._transform_data(test_data)
        userUtterTag_test_fname = '{}/userUtterTag_test.target'.format(self.model_folder)
        writeUtterTag(instance_list, userUtterTag_test_fname)
        print('\ttag_target={}'.format(userUtterTag_test_fname))
        instance_utter_list = getUtterList(instance_list)
        # testing
        results = self.model.tag_sents(instance_utter_list)
        self.result_fname = '{}/userUtterTag_test.pred'.format(self.model_folder)
        print('\ttag_pred={}'.format(self.result_fname))
        writeUtterTag(results, self.result_fname)
        precision, recall, fscore, accuracy_frame = eval_tagPredBaseline(instance_list, results, test_data.userTag2id, test_data.userTag_vocab_size)
        print('\tprecision={:.4f}, recall={:.4f}, fscore={:.4f}, accuracy_frame={:.4f}'.format(precision, recall, fscore, accuracy_frame))
        return results

    def load_model(self, verbose=True):
        print('\tloading model ...')
        self.model = CRFTagger(verbose=verbose)
        self.model.set_model_file(self.model_fname)
Example No. 11
def question3():

    tagger = CRFTagger(feature_func=feature_func)
    tagger.train(train_sentences, 'model_windows_size_1.crf.tagger')

    #tagger = CRFTagger(feature_func=feature_func)
    #tagger.set_model_file('model_windows_size_1.crf.tagger')

    print(tagger.evaluate(test_sentences))
    return
Example No. 12
def main(no_stopwords, use_manual_train_set):

	print "MAINTAIN COMMON WORDS: " + str(not no_stopwords)
	print "USING HAND LABELED TRAIN DATA: " + str(use_manual_train_set)

	full_set = get_domain_set(no_stopwords)
	if not no_stopwords:
		full_set.extend(get_other_set())

	train_set, test_set_auto = divide_sets(full_set, 0.75)
	set_manual = get_manual_set(no_stopwords)

	train_set_manual = []
	test_set_manual = []
	if use_manual_train_set:
		train_set_manual, test_set_manual = divide_sets(set_manual, 0.28)
		train_set.extend(train_set_manual)
	else:
		test_set_manual = set_manual

	tagger = CRFTagger(feature_func=feature_extraction)
	try:
		tagger.train(train_set, 'laptop.crf.tagger')
	except ValueError:
		fi = open('DEBUG', 'w')
		for li in DEBUG:
			fi.write(str(li.encode('utf-8')) + '\n')
		fi.close()

	print "AUTOMATIC LABELED TEST"
	tagged_sents_auto = tagger.tag_sents(map_test_set(test_set_auto, word=True))
	predicted_auto = create_vector_of_predicted_labels(tagged_sents_auto)
	golden_auto = create_vector_of_predicted_labels(test_set_auto)

	print calculate_micro_accuracy(predicted_auto, golden_auto, no_stopwords)

	print "MANUAL LABELED TEST"
	tagged_sents_manual = tagger.tag_sents(map_test_set(test_set_manual, word=True))
	predicted_manual = create_vector_of_predicted_labels(tagged_sents_manual)
	golden_manual = create_vector_of_predicted_labels(test_set_manual)
	
	print calculate_micro_accuracy(predicted_manual, golden_manual, no_stopwords)
	print ""
Example No. 13
def main():
    import pickle
    from nltk.tag import CRFTagger
    infolist = pickle.load(open('infolist.pickle', 'rb'))

    ct = CRFTagger()

    train_data = [[(x, z)
                   for [x, y, z] in infolist[:round(0.9 * len(infolist))]]]

    ct.train(train_data, 'model.crf.tagger')
    ners = ct.tag_sents(
        [[x for [x, y, z] in infolist[round(0.9 * len(infolist)):]]])
    print(ners)

    gold_sentences = [[(x, z)
                       for [x, y, z] in infolist[round(0.9 * len(infolist)):]]]

    print(ct.evaluate(gold_sentences))
Example No. 14
def load(training, testing):
    ct = CRFTagger()
    # split the training into sentences
    t = "\n".join(training)
    sents = t.split("###/###")
    # split the sentences into tokens
    train = []
    for sent in sents:
        if sent:
            new = []
            words = sent.split("\n")
            for word in words:
                if word:
                    # split the tokens into word and tag
                    new.append(tuple(word.split("/")))
            train.append(new)
    # remove any blank sentences that have been added
    train = [t for t in train if t]
    ct.train(train, 'model.crf.tagger')
    # test on the testing data
    s = "\n".join(testing)
    s_sents = s.split("###/###")
    test = []
    sent_tags = []
    for t in s_sents:
        if t:
            new = []
            right_tags = []
            words = t.split("\n")
            for word in words:
                if word:
                    # split the tokens into just words
                    new.append(word.split("/")[0])
                    # save the tags in a list to be used later
                    right_tags.append(word.split("/")[1])
            sent_tags.append(right_tags)
            test.append(new)
    tags = ct.tag_sents(test)
    return tags, sent_tags
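A hedged follow-up showing how the two lists returned by load() could be turned into a token-level accuracy; `training` and `testing` are the same line lists passed to load():

tags, sent_tags = load(training, testing)
correct = total = 0
for pred_sent, gold_tags in zip(tags, sent_tags):
    for (word, pred_tag), gold_tag in zip(pred_sent, gold_tags):
        total += 1
        if pred_tag == gold_tag:
            correct += 1
print(correct / total)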
Example No. 15
    def fit(self, data):
        """
        Fits a tagging model to object's data based on object's tagger name
        :return: a tagger object
        """
        tagger = None
        self.X = data

        if self.tagger_type == 'hmm':
            # Setup a trainer with default(None) values
            # And train with the data
            trainer = hmm.HiddenMarkovModelTrainer()
            tagger = trainer.train_supervised(data)

        elif self.tagger_type == 'crf':
            trainer = CRFTagger()
            # train on the data passed to fit(), mirroring the HMM branch above
            trainer.train(data, 'model.crf.tagger')
            tagger = trainer

        self.tagger = tagger

        return tagger
Example No. 16
def crf_tag():
    news_text = brown.tagged_sents(categories='news')
    train_sents = news_text[:3230]
    test_sents = news_text[3230:4600]
    ct = CRFTagger()
    ct.train(train_sents, 'model.crf.tagger')  # train() stores the model on ct and returns None
    test = ct.evaluate(test_sents)
    print test
    sent3 = "Narendra Modi won Lok Sabha election with massive majority after long years".decode(
        'utf-8')
    sent_w = sent3.lower().split()
    print sent_w
    tag = ct.tag(sent_w)
    print "The Tag Is:", tag
Example No. 17
def ontweetdata():
    tweetinfolist = pickle.load(open('tweetinfolist.pickle',
                                     'rb'))  #data from tweets
    counter = 0
    for item in tweetinfolist:
        if item[1] == "O":
            counter = counter + 1
    print("BASELINE: ", (counter) / len(tweetinfolist))

    ct = CRFTagger()
    train_data = [[
        (x.lower(), y.lower())
        for [x, y] in tweetinfolist[round(0.9 * len(tweetinfolist)):]
    ]]
    ct.train(train_data, 'model.crf.tagger')

    gold_sentences = [[
        (x.lower(), y.lower())
        for [x, y] in tweetinfolist[:round(0.9 * len(tweetinfolist))]
    ]]

    print(ct.evaluate(gold_sentences))
Example No. 18
entrenar_bill(tagger,"UnigramTagger")


# In[ ]:


tagger = BigramTagger(train_reducido[:1000])
tagger.evaluate(test_reducido[:1000])
entrenar_bill(tagger,"BigramTagger")


# In[ ]:


ct = CRFTagger()
ct.train(train_reducido[:1000],'model.crf.tagger')
evaluacion = ct.evaluate(test_reducido[:1000])
xlabels.append("CRF Tagger")
accuracys.append(evaluacion)


# In[ ]:


tagger = PerceptronTagger(load=False)
tagger.train(train_reducido[:1000])
evaluacion = tagger.evaluate(test_reducido[:1000])
xlabels.append("Perceptron Tagger")
accuracys.append(evaluacion)

Example No. 19
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    crf_accuracies = []
    
    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list

    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i+n]

    # a list of 10 lists with ~347 sentences each
    ten_parts = list(chunks(pos_set, tenth))

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part
        
        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]
        
        # next, concatenate the sublists into one list ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]
            
        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()

        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()
        
        print('Loop #' + str(counter))
        # make crf tagger
        crf_tagger = CRFTagger()
        crf_tagger.train(train_sents, 'model.crf.tagger')
        
        # evaluate crf tagger
        crf_accuracy = None
        crf_accuracy = crf_tagger.evaluate(test_sents)
        crf_accuracies.append(crf_accuracy)
        print('crf:', crf_accuracy)

        #if counter> 0: break
        
    final_accuracies_list = []
    mean_accuracy_crf = mean(crf_accuracies)
    standard_deviation_crf = stdev(crf_accuracies)
    uni = {'crf': {'mean': mean_accuracy_crf, 'sd': standard_deviation_crf}}
    final_accuracies_list.append(uni)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)
    
    return final_dict
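A hedged call sketch for the cross-validation function above; the training-set path is illustrative, while the user-data directory matches the commented default inside the function:

scores = cltk_pos_cv('training_set.pos', '~/cltk_data/user_data')
print(scores)   # e.g. {'crf': {'mean': ..., 'sd': ...}}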
Example No. 20
from nltk.tag import CRFTagger

_model_file = 'model.crf.tagger_2'


def get_data(ifile='vtb.txt'):
    data = []
    with open(ifile, encoding='utf-8') as vtb:
        for _ in range(10383):
            temp = []
            a = vtb.readline().split()
            for i in a:
                if i == '/':
                    i = ['/', '/']
                    temp.append(tuple(i))
                else:
                    temp.append(tuple(i.split('/')))
            data.append(temp)
    return data


data = get_data()

# with open('vtb3.txt','w',encoding='utf-8') as vtb3:
#     vtb3.write(str(data))

ct = CRFTagger()
ct.train(data, _model_file)
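A hedged sketch of reloading the model trained above and tagging a pre-tokenized sentence (the example words are illustrative):

ct2 = CRFTagger()
ct2.set_model_file(_model_file)
print(ct2.tag_sents([['Tôi', 'là', 'sinh_viên', '.']]))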
Example No. 21
def create_trainingModel(train_data,ModelPath):
    if os.path.isfile(ModelPath):
        os.remove(ModelPath)
    ct = CRFTagger()
    ct.train(train_data,ModelPath)
Example No. 22
    #print "\nrequiredFormat = ",requiredFormat
    return requiredFormat


print "\nReading training corpus...."
ListOfSentences_Training = corpusRead(Training_Data)
print "Reading test corpus...."  
ListOfSentences_Test = corpusRead(Test_Data)




#CRF Training
ct = CRFTagger()
print "CRF Training starts..."
ct.train(ListOfSentences_Training,'model.crf.tagger')
print "CRF Training is done."

print "Testing starts"
print "Accuracy of CRF is = ",ct.evaluate(ListOfSentences_Test) * 100
#Tagging by CRF Tagger
ch = 'y'
while (ch != 'n'):
    text = raw_input("Enter the text to be tagged : \n")
    text = converter(text)
    print ct.tag_sents(text)
    print "\nDo you want to continue ?"
    ch = raw_input()
 

Example No. 23
def train_crf_tagger(train_corpus: Corpus, model_file='crfmodel'):
    train_sents = gen_tagged_sents(train_corpus)
    crf_tagger = CRFTagger()
    crf_tagger.train(train_sents, model_file)
    return crf_tagger
Example No. 24
from nltk.tag import CRFTagger
from nltk.corpus import brown

ct = CRFTagger()

brown_tagged_sents = brown.tagged_sents(tagset='universal')
size = int(len(brown_tagged_sents) * 0.7)

train_sents = brown_tagged_sents[:size]
ct.train(train_sents, 'model.crf.tagger')

#brown_sents = brown.sents()
#test_sents = brown_sents[size:]

#print(ct.tag(test_sents))
Example No. 25
from nltk.tag import CRFTagger
from nltk.corpus import brown
import pickle
#from tag_utils import *

tagged_sents = brown.tagged_sents()
train = tagged_sents[:50000]
test = tagged_sents[50000:]
crf = CRFTagger()

'''
############# Train #############
crf.train(train, 'crf_brown.tagger')
print crf.evaluate(test) # 0.954383534534
'''

############# Test #############
crf.set_model_file('crf_brown.tagger')
tokens = []
for i in test:
	for j in i:
		tokens.append(j[0])
 
test_tagged = crf.tag(tokens)

'''
f = open("test_tagged_obj.pickle", 'w')
pickle.dump(test_tagged, f)
f.close()
'''
Example No. 26
class DataAdapter(object):
    def __init__(self, data=[]):
        self.tagger = CRFTagger()
        self.tagger.set_model_file('model.crf.tagger')

        if data.count(True) > 0:
            self.data_tagging, self.data_testing = self.for_tagging_testing(
                data)
            # print('TAGGING', self.data_tagging)
            # print('TESTING', self.data_testing)

    def tokenize_tag(self, text):
        text = text.replace('\r', ' | ').replace('\n', ' | ')
        tokens = word_tokenize(text, preserve_line=True)
        labels = []
        for label in self.tag(tokens):
            labels.append(label[1])
        return tokens, labels

    def for_tagging_testing(self, data):
        # self.data = data
        array_tagging = []
        array_testing = []
        for d in data:
            all_tags = []
            all_test = []
            for index, t in enumerate(d['text']):
                one_tag = [t, d['label'][index]]
                all_test.append(one_tag)
                all_tags.append(t)
            array_tagging.append(all_tags)
            array_testing.append(all_test)
            # print(all_tags)
        return array_tagging, array_testing

    def for_testing(self, data):
        # self.data = data
        array = []
        # print('TEST', data.count())
        for d in data:
            all_tags = []
            for index, t in enumerate(d['text']):
                # one_tag = [t, (d['label'][index] if is_ascii(d['label'][index]) else 'O')]
                one_tag = [t, d['label'][index]]
                all_tags.append(one_tag)
            array.append(all_tags)
            # print(all_tags)
        return array

    def for_tagging(self, data):
        # self.data = data
        array = []
        for d in data:
            all_tags = []
            for t in d['text']:
                all_tags.append(t)
            array.append(all_tags)
            # print(all_tags)
        return array

    def tag_sents(self):
        if self.data_tagging is not None:
            return self.tagger.tag_sents(self.data_tagging)
        else:
            return 'NoData'

    def tag(self, data):
        return self.tagger.tag(data)

    def evaluate(self):
        if self.data_testing is not None:
            return self.tagger.evaluate(self.data_testing)
        else:
            return 'NoData'

    def train(self, data):
        data = self.for_testing(data)
        self.tagger.train(data, 'model.crf.tagger')
        print('ACCURACY:', self.tagger.evaluate(data))
Example No. 27
import numpy as np
import nltk
from nltk import TnT
from nltk.tag import hmm
from nltk.tag.perceptron import PerceptronTagger
from nltk.tag import CRFTagger

comb_results = np.zeros((5, 4))
ind_results = np.zeros((5, 4))
for ki in range(data_batch):
    perc_tagger = PerceptronTagger(load=False)
    tnt_tagger = TnT()
    crf_tagger = CRFTagger()

    tnt_tagger.train(training_data[ki])
    hmm_tagger = nltk.HiddenMarkovModelTagger.train(training_data[ki])
    perc_tagger.train(training_data[ki])
    crf_tagger.train(training_data[ki], 'model.crf.tagger')

    # t.tagdata(test_data[800:])

    perc_pred = []
    hmm_pred = []

    for i in testing_data[ki]:
        perc_pred.append(perc_tagger.tag(i))
        hmm_pred.append(hmm_tagger.tag(i))
    crf_pred = crf_tagger.tag_sents(testing_data[ki])
    tnt_pred = tnt_tagger.tagdata(testing_data[ki])
    pred = {'p': perc_pred, 'h': hmm_pred, 'c': crf_pred, 't': tnt_pred}

    def most_frequent(List):
        return max(set(List), key=List.count)
Example No. 28
    line_list=[]
    while line:
        #print(line)
        words=line.replace("\r","").replace("\n","").split("\t")
        #print(words)
        if(len(words)<2):
            train_data.append(line_list)
            line_list=[]
        else:
            tup1=(words[0],words[1])
            line_list.append(tup1)
        line=f.readline()
    f.close()
ct = CRFTagger()

ct.train(train_data,'model.crf.tagger')


test_actual=[]
test_sentences=[]
#with codecs.open("nepali-english-demo-20%training-data.txt","r","utf-8") as f:
with codecs.open("/Users/Preethi/nlp_project/EMNLP/spanish_english/training/spanish-english-training-20%.txt","r","utf-8") as f:
#with codecs.open("/Users/Preethi/nlp_project/EMNLP/mandarin_english/training/mandarin-english-testing-answers.txt","r","utf-8") as f:
    line=f.readline()
    test=[]
    sentence=[]
    while line:
        words=line.replace("\r","").replace("\n","").split("\t")
        #print(words)
        if(len(words)<2):
            test_actual.append(test)
Example No. 29
class SimpleSLU:
    def __init__(self):
        self.__semantic_instance_list = []
        self.__speech_act_instance_list = []

        self.__semantic_model = None
        self.__speech_act_model = None

        self.__speech_act_lb = None

    def load_model(self, modelfile):
        with open('%s.act.model' % modelfile, 'r') as f:
            self.__speech_act_model, self.__speech_act_lb = pickle.load(f)

        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.set_model_file('%s.semantic.model' % modelfile)

        return True

    def add_instance(self, utter, speech_act, semantic_tagged):
        tokenized = self.__tokenize(utter, semantic_tagged)
        if tokenized is None:
            return False

        semantic_instance = []
        for word, (bio, tag, attrs) in tokenized:
            if bio is None:
                sem_label = 'O'
            else:
                cat = None
                for attr, val in attrs:
                    if attr == 'cat':
                        cat = val
                sem_label = '%s-%s_%s' % (bio, tag, cat)
            semantic_instance.append((unicode(word.lower()), unicode(sem_label)))
        self.__semantic_instance_list.append(semantic_instance)

        sa_label_list = []
        for sa in speech_act:
            sa_labels = ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list += sa_labels

        sa_label_list = sorted(set(sa_label_list))

        word_feats = ' '.join([word.lower() for word, _ in tokenized])
        self.__speech_act_instance_list.append((word_feats, sa_label_list))

        return True

    def train(self, modelfile):
        sa_feats = [x for x, _ in self.__speech_act_instance_list]
        sa_labels = [y for _, y in self.__speech_act_instance_list]

        self.__speech_act_lb = preprocessing.MultiLabelBinarizer()
        sa_labels = self.__speech_act_lb.fit_transform(sa_labels)

        self.__speech_act_model = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', OneVsRestClassifier(LinearSVC(verbose=True)))])

        self.__speech_act_model.fit(sa_feats, sa_labels)

        with open('%s.act.model' % modelfile, 'wb') as f:
            pickle.dump((self.__speech_act_model, self.__speech_act_lb), f)

        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.train(self.__semantic_instance_list, '%s.semantic.model' % modelfile)

    def pred(self, utter):
        tokenized = self.__tokenize(utter)
        word_feats = ' '.join([word.lower() for word, _ in tokenized])

        pred_act = self.__speech_act_lb.inverse_transform(self.__speech_act_model.predict([word_feats]))
        pred_semantic = self.__semantic_model.tag([word.lower() for word, _ in tokenized])

        return (pred_act, pred_semantic)

    def __tokenize(self, utter, semantic_tagged=None):
        result = None
        if semantic_tagged is None:
            result = [(word, None) for word in nltk.word_tokenize(utter)]
        else:
            parser_raw = SemanticTagParser(False)
            parser_tagged = SemanticTagParser(False)

            segmented = ' '.join(nltk.word_tokenize(utter))
            tagged = ' '.join(semantic_tagged)

            parser_raw.feed(segmented)
            parser_tagged.feed(tagged)

            raw_chr_seq = parser_raw.get_chr_seq()
            raw_space_seq = parser_raw.get_chr_space_seq()

            tagged_chr_seq = parser_tagged.get_chr_seq()
            tagged_space_seq = parser_tagged.get_chr_space_seq()

            if raw_chr_seq == tagged_chr_seq:
                merged_space_seq = [
                    x or y for x, y in zip(raw_space_seq, tagged_space_seq)]

                word_seq = parser_tagged.tokenize(merged_space_seq)
                tag_seq = parser_tagged.get_word_tag_seq()

                result = [(word, tag) for word, tag in zip(word_seq, tag_seq)]

        return result
Example No. 30
from nltk.tag import CRFTagger
from nltk.corpus import brown

tagged_sents = brown.tagged_sents()
train = tagged_sents[:50000]
test = tagged_sents[50000:]

crf = CRFTagger()
crf.train(train, 'crf_tagger.model')
a = crf.evaluate(test)
print a
Example No. 31
def main():
    # start timer
    for item in [
            "UD_Ukrainian",
            "Brown",
    ]:

        print("in process " + item)
        # open Brown training data
        infile = open(DATA_PATH + item + "_tagged_train.txt",
                      "r",
                      encoding="utf-8")
        brown_train = infile.readlines()
        infile.close()

        # split words and tags, and add start and stop symbols (question 1)
        brown_words, brown_tags = split_wordtags(brown_train)

        # calculate tag trigram probabilities (question 2)
        q_values = calc_trigrams(brown_tags)

        # question 2 output
        q2_output(q_values, OUTPUT_PATH + item + '_B2.txt')

        # calculate list of words with count > 5 (question 3)
        known_words = calc_known(brown_words)

        # get a version of brown_words with rare words replace with '_RARE_' (question 3)
        brown_words_rare = replace_rare(brown_words, known_words)

        # question 3 output
        q3_output(brown_words_rare, OUTPUT_PATH + item + "_B3.txt")

        # calculate emission probabilities (question 4)
        e_values, taglist = calc_emission(brown_words_rare, brown_tags)

        # question 4 output
        q4_output(e_values, OUTPUT_PATH + item + "_B4.txt")

        # delete unnecessary data
        del brown_train
        del brown_words_rare

        # open Brown development data (question 5)
        infile = open(DATA_PATH + item + "_test.txt", "r")
        brown_dev = infile.readlines()
        infile.close()

        # format Brown development data here
        brown_dev_words = []
        for sentence in brown_dev:
            brown_dev_words.append(sentence.split(" ")[:-1])

        # do viterbi on brown_dev_words (question 5)
        viterbi_tagged = viterbi(brown_dev_words, taglist, known_words,
                                 q_values, e_values)

        # question 5 output
        q5_output(viterbi_tagged, OUTPUT_PATH + item + "_B5.txt")

        # # do nltk tagging here
        # nltk_tagged = nltk_tagger(brown_words, brown_tags, brown_dev_words)
        #
        # # question 6 output
        # q6_output(nltk_tagged, OUTPUT_PATH + item + "_B6.txt")

    for item in ["Brown", "UD_Ukrainian"]:
        print("in crf process " + item)
        # open Brown training data
        # binary mode; the tokens are decoded from UTF-8 below
        infile = open(DATA_PATH + item + "_tagged_train.txt", "rb")
        brown_train = infile.readlines()
        infile.close()

        brown_words, brown_tags = split_wordtags(brown_train)
        train_words_tags = []
        ct = CRFTagger()
        for i in range(len(brown_words)):
            tmp = []
            for j in range(len(brown_words[i])):
                tmp.append((brown_words[i][j].decode('utf-8'),
                            brown_tags[i][j].decode('utf-8')))
            train_words_tags.append(tmp)

        ct.train(train_words_tags, u'model.crf.tagger')

        # open Brown development data (question 5)
        infile = open(DATA_PATH + item + "_test.txt", "r")
        brown_dev = infile.readlines()
        infile.close()

        # format Brown development data here
        tests_words = []
        for sentence in brown_dev:
            tests_words.append([i for i in sentence.split(" ")[:-1]])

        result_cfg = ct.tag_sents(tests_words)
        with open(OUTPUT_PATH + item + "_CFG.txt", "w") as file:
            for line in result_cfg:
                for word in line:
                    file.write(word[0] + "/" + word[1] + " ")
                file.write("\n")

        # print total time to run Part B (time.clock was removed in Python 3.8)
        print("Part B time: ", str(time.perf_counter()), ' sec')
Example No. 32
from nltk.tag import CRFTagger

jumSample = 500000
namaFile = "Indonesian_Manually_Tagged_Corpus.tsv"
with open(namaFile, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

pasangan = []
allPasangan = []

for line in lines[:min(jumSample, len(lines))]:
    if line == '':
        allPasangan.append(pasangan)
        pasangan = []
    else:
        kata, tag = line.split('\t')
        p = (kata, tag)
        pasangan.append(p)

ct = CRFTagger()
ct.train(allPasangan, 'all_indo_man_tag_corpus_model.crf.tagger')
# test
hasil = ct.tag_sents([['Saya', 'bekerja', 'di', 'Bandung'],
                      ['Nama', 'saya', 'Yudi']])
print(hasil)
Example No. 33
class SimpleSLU:
    def __init__(self):
        self.__semantic_instance_list = []
        self.__speech_act_instance_list = []

        self.__semantic_model = None
        self.__speech_act_model = None

        self.__speech_act_lb = None

    def load_model(self, modelfile):
        with open('%s.act.model' % modelfile, 'r') as f:
            self.__speech_act_model, self.__speech_act_lb = pickle.load(f)

        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.set_model_file('%s.semantic.model' % modelfile)

        return True

    def add_instance(self, utter, speech_act, semantic_tagged):
        tokenized = self.__tokenize(utter, semantic_tagged)
        if tokenized is None:
            return False

        semantic_instance = []
        for word, (bio, tag, attrs) in tokenized:
            if bio is None:
                sem_label = 'O'
            else:
                cat = None
                for attr, val in attrs:
                    if attr == 'cat':
                        cat = val
                sem_label = '%s-%s_%s' % (bio, tag, cat)
            semantic_instance.append((unicode(word.lower()), unicode(sem_label)))
        self.__semantic_instance_list.append(semantic_instance)

        sa_label_list = []
        for sa in speech_act:
            sa_labels = ['%s_%s' % (sa['act'], attr) for attr in sa['attributes']]
            sa_label_list += sa_labels

        sa_label_list = sorted(set(sa_label_list))

        word_feats = ' '.join([word.lower() for word, _ in tokenized])
        self.__speech_act_instance_list.append((word_feats, sa_label_list))

        return True

    def train(self, modelfile):
        sa_feats = [x for x, _ in self.__speech_act_instance_list]
        sa_labels = [y for _, y in self.__speech_act_instance_list]

        # each instance can carry several act labels, so a multilabel binarizer is needed
        self.__speech_act_lb = preprocessing.MultiLabelBinarizer()
        sa_labels = self.__speech_act_lb.fit_transform(sa_labels)

        self.__speech_act_model = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', OneVsRestClassifier(LinearSVC(verbose=True)))])

        self.__speech_act_model.fit(sa_feats, sa_labels)

        with open('%s.act.model' % modelfile, 'wb') as f:
            pickle.dump((self.__speech_act_model, self.__speech_act_lb), f)

        self.__semantic_model = CRFTagger(verbose=True)
        self.__semantic_model.train(self.__semantic_instance_list, '%s.semantic.model' % modelfile)

    def pred(self, utter):
        tokenized = self.__tokenize(utter)
        word_feats = ' '.join([word.lower() for word, _ in tokenized])

        pred_act = self.__speech_act_lb.inverse_transform(self.__speech_act_model.predict([word_feats]))
        pred_semantic = self.__semantic_model.tag([word.lower() for word, _ in tokenized])

        return (pred_act, pred_semantic)

    def __tokenize(self, utter, semantic_tagged=None):
        result = None
        if semantic_tagged is None:
            result = [(word, None) for word in nltk.word_tokenize(utter)]
        else:
            parser_raw = SemanticTagParser(False)
            parser_tagged = SemanticTagParser(False)

            segmented = ' '.join(nltk.word_tokenize(utter))
            tagged = ' '.join(semantic_tagged)

            parser_raw.feed(segmented)
            parser_tagged.feed(tagged)

            raw_chr_seq = parser_raw.get_chr_seq()
            raw_space_seq = parser_raw.get_chr_space_seq()

            tagged_chr_seq = parser_tagged.get_chr_seq()
            tagged_space_seq = parser_tagged.get_chr_space_seq()

            if raw_chr_seq == tagged_chr_seq:
                merged_space_seq = [
                    x or y for x, y in zip(raw_space_seq, tagged_space_seq)]

                word_seq = parser_tagged.tokenize(merged_space_seq)
                tag_seq = parser_tagged.get_word_tag_seq()

                result = [(word, tag) for word, tag in zip(word_seq, tag_seq)]

        return result
Example No. 34
class NamedEntityChunker(ChunkParserI):
    def __init__(self,
                 train_sents=None,
                 tagger="ClassifierBasedTagger",
                 model=None,
                 model_name="../results/modelCRF_featured",
                 entities=None,
                 language="english",
                 **kwargs):

        self.all_entities = []
        self.acronyms = []
        self.language = language

        if not model:
            assert isinstance(train_sents, Iterable)

        if tagger == "ClassifierBasedTagger":
            self.feature_detector = iob_features
            self.tagger = ClassifierBasedTagger(train=train_sents,
                                                feature_detector=iob_features,
                                                **kwargs)

        elif tagger == "CRFTagger":
            self.set_entities(entities)
            if not model:

                self.tagger = CRFTagger(feature_func=self.crf_features)
                self.tagger.train(
                    train_data=train_sents,
                    model_file="../results/{}".format(model_name))
            else:
                self.tagger = CRFTagger(feature_func=self.crf_features)
                self.tagger.set_model_file(model)
        else:
            raise Exception('Unknown tagger')

    def parse(self, tagged_sent):
        chunks = self.tagger.tag(tagged_sent)

        return chunks

    def get_position(self, w):
        positions = []
        for e in self.all_entities:
            if w in e:
                positions.append(e.index(w))
        return positions

    def get_positions(self, tokens, index):
        w = tokens[index][0]
        prev = tokens[index - 1][0]
        next = tokens[index + 1][0]
        positions = []
        for e in self.all_entities:
            if w in e and prev in e and next in e:
                positions.append(e.index(w))
        return list(set(positions))

    def set_entities(self, entities):
        if entities:

            entities = [l.split() for l in entities]

            for l in entities:
                if len(l) == 1 and is_all_caps(l[0]):
                    self.acronyms.append(l[0].lower())
                else:
                    self.all_entities.append([w.lower() for w in l])

            self.all_entities = list(
                set([tuple(entity) for entity in self.all_entities]))
            self.acronyms = list(set(self.acronyms))

            with open('../data/entities_{}.txt'.format(self.language),
                      'w') as f:
                f.write("\n".join(
                    [" ".join(line) for line in self.all_entities]))

            with open('../data/acronyms_{}.txt'.format(self.language),
                      'w') as f:
                f.write("\n".join(self.acronyms))
        else:
            with open('../data/entities_{}.txt'.format(self.language),
                      'r') as f:
                for line in f:
                    self.all_entities.append(line.strip().split())

            with open('../data/acronyms_{}.txt'.format(self.language),
                      'r') as f:
                for line in f:
                    self.acronyms.append(line.strip())

        self.all_entities = list(
            set([tuple(entity) for entity in self.all_entities]))
        self.acronyms = list(set(self.acronyms))

    def crf_features(self, tokens, index):
        """
        `tokens`  = a POS-tagged sentence [(w1, t1), ...]
        `index`   = the index of the token we want to extract features for
        """

        # init the stemmer
        stemmer = SnowballStemmer(self.language)

        # Pad the sequence with start/end marker tokens
        num_of_previous = 3
        num_of_posterior = 2
        tk = []
        for i in range(0, num_of_previous):
            tk.append(('[START{}]'.format(num_of_previous - i),
                       '[START{}]'.format(num_of_previous - i)))

        tk = tk + list(tokens)
        for i in range(1, num_of_posterior + 1):
            tk.append(('[END{}]'.format(i), '[END{}]'.format(i)))

        tokens = tk

        index += num_of_previous

        word, pos = tokens[index]

        contains_dash = ('–' in word or '-' in word or '_' in word)
        contains_dot = '.' in word

        prev2_words = tokens[index - 2][0] + "_._" + tokens[index - 1][0]
        prev2_pos = tokens[index - 2][1] + "_._" + tokens[index - 1][1]

        prev1_words = tokens[index - 1][0] + "_._" + tokens[index][0]
        prev1_pos = tokens[index - 1][1] + "_._" + tokens[index][1]
        prev1_lemma = stemmer.stem(
            tokens[index - 1][0]) + "_._" + stemmer.stem(tokens[index][0])

        next1_words = tokens[index][0] + "_._" + tokens[index + 1][0]
        next1_pos = tokens[index][1] + "_._" + tokens[index + 1][1]

        next2_words = tokens[index + 1][0] + "_._" + tokens[index + 2][0]
        next2_pos = tokens[index + 1][1] + "_._" + tokens[index + 2][1]

        allcaps = is_all_caps(word)
        strange_cap = word[
            0] not in string.ascii_uppercase and word != word.lower()

        inside_ent = word.lower() in self.all_entities
        is_acronym = word.lower() in self.acronyms
        features = {
            'word': word,
            'lemma': stemmer.stem(word),
            'pos': pos,
            'all-caps': allcaps,
            'strange-cap': strange_cap,
            'prev2-pos': prev2_pos,
            'prev2-word': prev2_words,
            'next2-pos': next2_pos,
            'next2-word': next2_words,
            'prev1-pos': prev1_pos,
            'prev1-word': prev1_words,
            'prev1-lemma': prev1_lemma,
            'next1-pos': next1_pos,
            'next1-word': next1_words,
        }

        features['inside-entities'] = inside_ent
        if is_acronym:
            features['is-acronym'] = is_acronym

        positions = self.get_position(word.lower())
        for p in positions:
            features['position-{}'.format(p)] = True
        features['total-position-{}'.format(len(positions))] = True

        if contains_dash:
            features['contains-dash'] = contains_dash
        if contains_dot:
            features['contains-dot'] = contains_dot

        for i in range(1, num_of_previous + 1):
            word, pos = tokens[index - i]
            lemma = stemmer.stem(word)

            features['prev-{}-word'.format(i)] = word
            features['prev-{}-pos'.format(i)] = pos

            features['prev-{}-lemma'.format(i)] = lemma

        for i in range(1, num_of_posterior + 1):
            word, pos = tokens[index + i]
            inside_ent = word.lower() in self.all_entities

            features['next-{}-word'.format(i)] = word
            features['next-{}-pos'.format(i)] = pos
            features['next-{}-inside-ent'.format(i)] = inside_ent

        return features

Example No. 35

# Divide data into train and test sets (90/10 split)
splitPoint = count * 0.9
training_set = data[0:int(splitPoint)]
test_set = data[int(splitPoint):]
# print training_set

# Train
ct = CRFTagger()
train_data = training_set
train_data_new = []
for i in range(len(train_data)):
    if len(train_data[i]) != 0:
        train_data_new.append(train_data[i])
ct.train(train_data_new, 'model.crf.tagger')

# Accuracy
test_data_new = []
test_data_tags = []
for i in range(len(test_set)):
    if len(test_set[i]) != 0:
        for j in range(len(test_set[i])):
            test_data_new.append(test_set[i][j][0])
            test_data_tags.append(test_set[i][j][1])
gold_sentences = test_data_new
# print ct.evaluate(gold_sentences)

# print test_data_new
pred_tags = []
refsets = collections.defaultdict(set)
Example No. 36
import os

from nltk.corpus import treebank
from nltk.tag import tnt, CRFTagger


# split training data from test data
train_data = treebank.tagged_sents()[:3000]
test_data = treebank.tagged_sents()[3000:]

# train a trigram N tagger (TnT)
tnt_pos_tagger = tnt.TnT()
tnt_pos_tagger.train(train_data)
print tnt_pos_tagger.evaluate(test_data)

# train a CRF tagger
crf_tagger = CRFTagger()
crf_tagger.train(train_data,
                 os.path.expanduser('~/Documents/NLP/NLP/crf_model.txt'))
print crf_tagger.evaluate(test_data)