Example No. 1
def ExtractItemsFromJudgment(text, CodeTaggerFile, TitleTaggerFile):
    text = removeHTMLTags(text)
    tokenList = tokenizeTestData(text)
    CodesTagger = CRFTagger()
    TitleTagger = CRFTagger()

    CodesTagger.set_model_file(CodeTaggerFile)
    TitleTagger.set_model_file(TitleTaggerFile)

    taggedCodes = CodesTagger.tag_sents(tokenList)
    taggedTitles = TitleTagger.tag_sents(tokenList)

    return extract_entities(taggedCodes, taggedTitles)
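A minimal usage sketch for the function above; the HTML input is made up, and the model paths mirror the hard-coded ones in the next example:

judgment_html = "<p>Some judgment text ...</p>"
items = ExtractItemsFromJudgment(judgment_html,
                                 'models/CRF-Model-OnlyCodes',
                                 'models/CRF-Model-OnlyTitles')
print(items)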
Example No. 2
def ExtractItemsFromJudgment(text):

    text = removeHTMLTags(text)

    tokenList = tokenizeTestData(text)
    CodesTagger = CRFTagger()
    titleTagger = CRFTagger()

    CodesTagger.set_model_file("models/CRF-Model-OnlyCodes")
    titleTagger.set_model_file("models/CRF-Model-OnlyTitles")

    taggedCodes = CodesTagger.tag_sents(tokenList)
    taggedTitles = titleTagger.tag_sents(tokenList)

    return extract_entities(taggedCodes, taggedTitles)
Example No. 3
def tagpos(request):
    ct = CRFTagger()
    ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
    tokenize = word_tokenize("Saya bekerja di Bandung")
    hasil = ct.tag_sents([tokenize])
    postag = nltk.pos_tag(tokenize)
    context = {
        'tokenize': tokenize,
        'postag': postag,
        'hasil': hasil,
    }
    template = loader.get_template('polls/tagged.html')
    # train_text = state_union.raw('2005-GWBush.txt')
    # sample_text = state_union.raw('2006-GWBush.txt')
    # custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
    # tokenized = custom_sent_tokenizer.tokenize(sample_text)
    # tagged = []
    # for i in tokenized[:5]:
    #     words = nltk.word_tokenize(i)
    #     tagged.append(nltk.pos_tag(words))
    #
    # template = loader.get_template('polls/tagged.html')
    # context = {
    #     'tagged' : tagged
    # }
    return HttpResponse(template.render(context, request))
Example No. 4
def tagSentences(path, training_list=[], testing_list=[]):
    ct = CRFTagger()
    train_list = getTrainList(training_list)
    ct.train(train_list, 'model.crf.tagger')
    sentences = getSentences(path, testing_list)
    tagged_sentences = ct.tag_sents(sentences)
    return tagged_sentences
Example No. 5
    def main(self):
        # own method
        file = open("forecast_corpus.txt", "r")
        call = file.read()
        corpus = call.split()
        file.close()
        verba = []

        # stopword removal
        # sfactory = StopWordRemoverFactory()
        # stopwords = sfactory.create_stop_word_remover()
        # stop = stopwords.remove(call)
        # c = stop.split()

        # print("Reading corpus.....")
        ct = CRFTagger()
        ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
        hasil = ct.tag_sents([corpus])

        this = Verba_finder()
        # stop one before the end: the rule inspects the next token (x + 1)
        for x in range(len(hasil[0]) - 1):
            if hasil[0][x][1] == 'VB' and this.afiks_check(
                    hasil[0][x][0]) == 1 and (hasil[0][x + 1][1] == 'NN'
                                              or hasil[0][x + 1][1] == 'JJ'):
                # print(hasil[0][x])
                verba.append(" " + hasil[0][x][0] + " ")

        return verba
Example No. 6
def function_pos_tagging(new_stopwords_tweets):
    ct = CRFTagger()
    ct.set_model_file('data/all_indo_man_tag_corpus_model.crf.tagger')
    new_pos_tweets = []
    for n in range(len(new_stopwords_tweets)):
        pos_tweet_word = [new_stopwords_tweets[n][0]]
        pos_tweet_words = ct.tag_sents(pos_tweet_word)
        pos_tweet = [pos_tweet_words, new_stopwords_tweets[n][1]]
        new_pos_tweets.append(pos_tweet)

    new_features_tweets = []
    for n in range(len(new_pos_tweets)):
        pos_tweets_data = new_pos_tweets[n][0][0]
        features = []
        for tokenTag in pos_tweets_data:
            token, tag = tokenTag
            access = ['NN', 'JJ', 'RB', 'VBD']
            if tag in access:
                features.append(token)
            else:
                pass

        if features:
            features_tweets = [features, new_pos_tweets[n][1]]
            new_features_tweets.append(features_tweets)
        else:
            pass
    return new_features_tweets
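A brief usage sketch; the input shape (a list of (token_list, label) pairs) is inferred from the indexing above, and the sample tweet is made up:

sample = [(['film', 'ini', 'bagus', 'sekali'], 'positif')]
print(function_pos_tagging(sample))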
Example No. 7
class SlotTaggingModel(object):

    def __init__(self, **argparams):
        self.train_data = argparams['train_data']
        if self.train_data is not None:
            assert isinstance(self.train_data, DataSetCSVagentActPred)
        self.model_folder = argparams['model_folder']
        self.model_fname = '{}/slotTagging.model'.format(self.model_folder)

    def train(self, verbose=True):
        assert self.train_data is not None, 'train_data is required.'
        print('\ttraining ...')
        # transform data
        instance_list = self._transform_data(self.train_data)
        userUtterTag_train_fname = '{}/userUtterTag_train.txt'.format(self.model_folder)
        writeUtterTag(instance_list, userUtterTag_train_fname)
        print('\ttrain_data={}'.format(userUtterTag_train_fname))
        # train model
        self.model = CRFTagger(verbose=verbose)
        self.model.train(instance_list, self.model_fname)
        print('\tmodel_fname={}'.format(self.model_fname))
        print('\tsaving model ...')

    def _transform_data(self, data):
        ''' convert textual utter and user tags into a list of lists that contain lists of (w, t) pairs
        '''
        userUtter_txt = data.userUtter_txt
        userTag_txt = data.userTag_txt
        instance_list = list()
        for words, tags in zip(userUtter_txt, userTag_txt):
            instance = [(word.strip(), tag.strip()) for word, tag in zip(words.decode('utf-8').strip().split(), tags.decode('utf-8').strip().split())]
            instance_list.append(instance)
        return instance_list

    def predict(self, test_data):
        '''return a list of lists, [[(w1, tag1), (w2, tag2), (w3, tag3)], [...], [...]]
        '''
        assert test_data is not None, 'test_data is required.'
        assert isinstance(test_data, DataSetCSVagentActPred)
        print('\tpredicting Slot Tags ...')
        # transform data
        instance_list = self._transform_data(test_data)
        userUtterTag_test_fname = '{}/userUtterTag_test.target'.format(self.model_folder)
        writeUtterTag(instance_list, userUtterTag_test_fname)
        print('\ttag_target={}'.format(userUtterTag_test_fname))
        instance_utter_list = getUtterList(instance_list)
        # testing
        results = self.model.tag_sents(instance_utter_list)
        self.result_fname = '{}/userUtterTag_test.pred'.format(self.model_folder)
        print('\ttag_pred={}'.format(self.result_fname))
        writeUtterTag(results, self.result_fname)
        precision, recall, fscore, accuracy_frame = eval_tagPredBaseline(instance_list, results, test_data.userTag2id, test_data.userTag_vocab_size)
        print('\tprecision={:.4f}, recall={:.4f}, fscore={:.4f}, accuracy_frame={:.4f}'.format(precision, recall, fscore, accuracy_frame))
        return results

    def load_model(self, verbose=True):
        print('\tloading model ...')
        self.model = CRFTagger(verbose=verbose)
        self.model.set_model_file(self.model_fname)
Example No. 8
def crftagger(hasil_stem):
    result = []
    ct = CRFTagger()
    ct.set_model_file('D://dataset/all_indo_man_tag_corpus_model.crf.tagger')
    for i in hasil_stem:
        hasil = ct.tag_sents([i])
        for j in hasil:
            result.append(j)
    return result
Example No. 9
def getData(filename):
    ct = CRFTagger()
    ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

    result = []
    annotated = []
    with open(filename + '.csv', 'r') as f:
        reader = csv.reader(f)
        annotated = list(reader)

    sent = []
    sent_gold = []
    sent_oh = []
    curr_sent = ''
    for token in annotated:
        if curr_sent != str(token[1]) + ' ' + str(token[2]):
            hasil = ct.tag_sents([sent])
            mytuple = []
            for idx in range(len(sent)):
                try:
                    mytuple.append(hasil[0][idx] +
                                   (sent_gold[idx], sent_oh[idx]))
                except IndexError:
                    pass
            result.append(mytuple)
            sent = []
            sent_gold = []
            sent_oh = []
            curr_sent = str(token[1]) + ' ' + str(token[2])
        sent.append(token[4])
        sent_gold.append(token[5])
        sent_oh.append(token[6])
    hasil = ct.tag_sents([sent])
    mytuple = []
    for idx in range(len(sent)):
        try:
            mytuple.append(hasil[0][idx] + (sent_gold[idx], sent_oh[idx]))
        except IndexError:
            pass
    result.append(mytuple)
    result = result[1:]
    print('Total sentence: ' + str(len(result)))
    random.shuffle(result)
    return result
Example No. 10
def main(no_stopwords, use_manual_train_set):

	print "MAINTAIN COMMON WORDS: " + str(not no_stopwords)
	print "USING HAND LABELED TRAIN DATA: " + str(use_manual_train_set)

	full_set = get_domain_set(no_stopwords)
	if not no_stopwords:
		full_set.extend(get_other_set())

	train_set, test_set_auto = divide_sets(full_set, 0.75)
	set_manual = get_manual_set(no_stopwords)

	train_set_manual = []
	test_set_manual = []
	if use_manual_train_set:
		train_set_manual, test_set_manual = divide_sets(set_manual, 0.28)
		train_set.extend(train_set_manual)
	else:
		test_set_manual = set_manual

	tagger = CRFTagger(feature_func=feature_extraction)
	try:
		tagger.train(train_set, 'laptop.crf.tagger')
	except ValueError:
		fi = open('DEBUG', 'w')
		for li in DEBUG:
			fi.write(str(li.encode('utf-8')) + '\n')
		fi.close()

	print "AUTOMATIC LABELED TEST"
	tagged_sents_auto = tagger.tag_sents(map_test_set(test_set_auto, word=True))
	predicted_auto = create_vector_of_predicted_labels(tagged_sents_auto)
	golden_auto = create_vector_of_predicted_labels(test_set_auto)

	print calculate_micro_accuracy(predicted_auto, golden_auto, no_stopwords)

	print "MANUAL LABELED TEST"
	tagged_sents_manual = tagger.tag_sents(map_test_set(test_set_manual, word=True))
	predicted_manual = create_vector_of_predicted_labels(tagged_sents_manual)
	golden_manual = create_vector_of_predicted_labels(test_set_manual)
	
	print calculate_micro_accuracy(predicted_manual, golden_manual, no_stopwords)
	print ""
Example No. 11
def Postagging(data):
    postaggedData = []
    postagOnly = []
    ct = CRFTagger()
    ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
    postaggedData = ct.tag_sents(data)
    for i in range(len(postaggedData)):
        for j in range(len(postaggedData[i])):    
            postagOnly.append(postaggedData[i][j][1])
    return postagOnly
Example No. 12
def getPosTag():
    global perLabel, jobLabel, subLabel, orgLabel, geoLabel
    raw_sent = sentInput.get()
    ct = CRFTagger()
    ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

    tokens = nltk.tokenize.word_tokenize(raw_sent)
    postagged = ct.tag_sents([tokens])

    data = []
    for token in postagged[0]:
        data.append(token + ('O', ))

    tagger_ner = pycrfsuite.Tagger()
    tagger_ner.open('model_ner.crfsuite')
    ner = tagger_ner.tag(sent2features(data, False))

    for i in range(len(ner)):
        data[i] = data[i][0:2] + (ner[i], )

    tagger_oh = pycrfsuite.Tagger()
    tagger_oh.open('model_oh.crfsuite')
    oh = tagger_oh.tag(sent2features(data, True))

    for i in range(len(oh)):
        data[i] += (oh[i], )

    per = []
    job = []
    sub = []
    org = []
    geo = []

    for token in data:
        if token[3] == '1':
            label = token[2][-3:]
            if label == 'PER':
                per.append(token[0])
            elif label == 'ORG':
                org.append(token[0])
            elif label == 'SUB':
                sub.append(token[0])
            elif label == 'JOB':
                job.append(token[0])
            elif label == 'GEO':
                geo.append(token[0])
    perLabel.config(text='PER: ' + (' ').join(per))
    jobLabel.config(text='JOB: ' + (' ').join(job))
    subLabel.config(text='SUB: ' + (' ').join(sub))
    orgLabel.config(text='ORG: ' + (' ').join(org))
    geoLabel.config(text='GEO: ' + (' ').join(geo))
Example No. 13
def pos_tagger(data, attr="paragraphs"):
    flatten = lambda l: [item for sublist in l for item in sublist]
    ct = CRFTagger()
    ct.set_model_file('dataset/all_indo_man_tag_corpus_model.crf.tagger')
    for category in data:
        category['word_tag_{}'.format(attr)] = []
        for paragraph in category[attr]:
            list_tag_kalimat = []
            for kalimat in paragraph:
                tag_kalimat = ct.tag_sents([kalimat])
                tag_kalimat = flatten(tag_kalimat)
                list_tag_kalimat.append(tag_kalimat)
            category['word_tag_{}'.format(attr)].append(list_tag_kalimat)
    return data
Example No. 14
def oninfolist():
    """TO DO: LOOK AT THE FORMAT OF THE INFORMATION GIVEN IN THE ONLINE EXAMPLE, CHECK ALL THE LISTS I HAVE MADE TO SEE WHETHER THEY MATCH THAT FORMAT"""

    #SEE: http://www.nltk.org/_modules/nltk/tag/crf.html
    infolist = pickle.load(open('sentencelist.pickle', 'rb'))
    limit = round(len(infolist) * 0.4)
    train_data = infolist[0:limit]
    #print("train_data = ", train_data[0:10])

    ct = CRFTagger()
    # train on the first 40% of the data; without this call the tagger has no model to tag with
    ct.train(train_data, 'model.crf.tagger')
    #print(infolist[0:10])

    realsentences = []
    realsentence = ""
    """
	for sentence in infolist[limit:]:
		for (word,nertag) in sentence:
			realsentence = realsentence +" "+ word
		realsentences.append(realsentence)
		realsentence = ""
	pickle.dump(realsentences,open("realsentences.pickle","wb"))
	print("pickle file created")
	"""
    realsentences = pickle.load(open("realsentences.pickle", "rb"))
    print("REALSENTENCES:", realsentences[0:10])
    splitsentences = []  #[['dog','is','good'],['cat','eat','meat']]
    for r in realsentences:
        splitsentence = r.split()
        splitsentences.append(splitsentence)

    #print("train_data:", infolist[0:10])
    #print("sentences for tag_sents:", splitsentences[0:10])
    # realsentences was built from infolist[limit:], so it already holds only the held-out sentences
    ct.tag_sents(splitsentences)
    gold_sentences = infolist[limit:]
    print("GOLD SENTENCES:", infolist[10:20])
    print(ct.evaluate(gold_sentences))
Example No. 15
def pos_tagger(text):  # input: text (string)
    # instantiation
    ct = CRFTagger()

    # load the Indonesian POS-tagging model
    ct.set_model_file('model_postagging_crf.tagger')

    # cleaning
    text = re.sub('\.?\,?\(?\)?\"?', '', text)
    text = re.sub("\n", " ", text)
    text = text.split(" ")

    # this function performs the POS tagging
    tagged_text = ct.tag_sents([text])

    # result
    return tagged_text  # output: text annotated with POS tags
Example No. 16
def chunking(sents, chunked_file):
    '''
    Chunking
    param sents: a list, e.g. [['dog', 'is', 'dog'], ['dog', 'good']]
    '''

    os.chdir('/home/zqr/code/chunk2vec/')

    start_time = time.time()
    # PoS
    print '\n-->Start PoS'
    #print '->Training PoS Tagger'
    #ct = CRFTagger()
    #ct.train(chunk_traindata(pos_trainfile), 'model.crf.tagger')
    #print '->Done'

    #pos_testdata_gold = chunk_traindata(pos_testfile)

    # pos corpus
    print '->Load CRF Tagger model'
    ct = CRFTagger()
    ### this model holds the PoS tags learned from the chunking task
    ct.set_model_file('model.crf.tagger')
    print '->Posing'
    tagged_sents = ct.tag_sents(sents)
    #print 'PoS acc.:', ct.evaluate(pos_testdata_gold)
    # write the PoS-tagged sentences to a file
    print '->Write posed file'
    pos_data(tagged_sents, 'tmp_for_chunking')
    end_time = time.time()
    print '-->Done, Time:', end_time - start_time, 's'
    # to save time, temporarily use the test corpus
    #pos_data(pos_testdata_gold, chunk_inputfile)

    start_time = time.time()
    ### Chunking; requires YamCha to be installed, training data is the CoNLL training corpus
    print '\n-->Start Chunking'
    os.system('yamcha-config --libexecdir')
    #os.chdir('/home/zqr/code/sent2vec/')
    os.system('cp /home/zqr/local/libexec/yamcha/Makefile .')
    # train the chunking model
    #os.system('make CORPUS=' + pos_trainfile +' MODEL=chunk_model train')
    os.system('yamcha -m chunk_model.model < tmp_for_chunking > ' + chunked_file)
    print '-->Done, Time:', time.time() - start_time, 's'
Example No. 17
def main():
    import pickle
    from nltk.tag import CRFTagger
    infolist = pickle.load(open('infolist.pickle', 'rb'))

    ct = CRFTagger()

    train_data = [[(x, z)
                   for [x, y, z] in infolist[:round(0.9 * len(infolist))]]]

    ct.train(train_data, 'model.crf.tagger')
    ners = ct.tag_sents(
        [[x for [x, y, z] in infolist[round(0.9 * len(infolist)):]]])
    print(ners)

    gold_sentences = [[(x, z)
                       for [x, y, z] in infolist[round(0.9 * len(infolist)):]]]

    print(ct.evaluate(gold_sentences))
Example No. 18
def load(training, testing):
    ct = CRFTagger()
    # split the training into sentences
    t = "\n".join(training)
    sents = t.split("###/###")
    # split the sentences into tokens
    train = []
    for sent in sents:
        if sent:
            new = []
            words = sent.split("\n")
            for word in words:
                if word:
                    # split the tokens into word and tag
                    new.append(tuple(word.split("/")))
            train.append(new)
    # remove any blank sentences that have been added
    train = [t for t in train if t]
    ct.train(train, 'model.crf.tagger')
    # test on the testing data
    s = "\n".join(testing)
    s_sents = s.split("###/###")
    test = []
    sent_tags = []
    for t in s_sents:
        if t:
            new = []
            right_tags = []
            words = t.split("\n")
            for word in words:
                if word:
                    # split the tokens into just words
                    new.append(word.split("/")[0])
                    # save the tags in a list to be used later
                    right_tags.append(word.split("/")[1])
            sent_tags.append(right_tags)
            test.append(new)
    tags = ct.tag_sents(test)
    return tags, sent_tags
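A tiny illustrative call; the word/tag line format and the "###/###" sentence separator follow the parsing above, while the data itself is made up:

training = ["The/DET", "dog/NOUN", "barks/VERB", "###/###"]
testing = ["A/DET", "cat/NOUN", "sleeps/VERB", "###/###"]
tags, gold_tags = load(training, testing)
print(tags, gold_tags)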
Example No. 19
def main():
    # start timer
    for item in [
            "UD_Ukrainian",
            "Brown",
    ]:

        print("in process " + item)
        # open Brown training data
        infile = open(DATA_PATH + item + "_tagged_train.txt",
                      "r",
                      encoding="utf-8")
        brown_train = infile.readlines()
        infile.close()

        # split words and tags, and add start and stop symbols (question 1)
        brown_words, brown_tags = split_wordtags(brown_train)

        # calculate tag trigram probabilities (question 2)
        q_values = calc_trigrams(brown_tags)

        # question 2 output
        q2_output(q_values, OUTPUT_PATH + item + '_B2.txt')

        # calculate list of words with count > 5 (question 3)
        known_words = calc_known(brown_words)

        # get a version of brown_words with rare words replace with '_RARE_' (question 3)
        brown_words_rare = replace_rare(brown_words, known_words)

        # question 3 output
        q3_output(brown_words_rare, OUTPUT_PATH + item + "_B3.txt")

        # calculate emission probabilities (question 4)
        e_values, taglist = calc_emission(brown_words_rare, brown_tags)

        # question 4 output
        q4_output(e_values, OUTPUT_PATH + item + "_B4.txt")

        # delete unnecessary data
        del brown_train
        del brown_words_rare

        # open Brown development data (question 5)
        infile = open(DATA_PATH + item + "_test.txt", "r")
        brown_dev = infile.readlines()
        infile.close()

        # format Brown development data here
        brown_dev_words = []
        for sentence in brown_dev:
            brown_dev_words.append(sentence.split(" ")[:-1])

        # do viterbi on brown_dev_words (question 5)
        viterbi_tagged = viterbi(brown_dev_words, taglist, known_words,
                                 q_values, e_values)

        # question 5 output
        q5_output(viterbi_tagged, OUTPUT_PATH + item + "_B5.txt")

        # # do nltk tagging here
        # nltk_tagged = nltk_tagger(brown_words, brown_tags, brown_dev_words)
        #
        # # question 6 output
        # q6_output(nltk_tagged, OUTPUT_PATH + item + "_B6.txt")

    for item in ["Brown", "UD_Ukrainian"]:
        print("in crf process " + item)
        # open Brown training data
        # binary mode cannot take an encoding; the bytes are decoded explicitly below
        infile = open(DATA_PATH + item + "_tagged_train.txt", "rb")
        brown_train = infile.readlines()
        infile.close()

        brown_words, brown_tags = split_wordtags(brown_train)
        train_words_tags = []
        ct = CRFTagger()
        for i in range(len(brown_words)):
            tmp = []
            for j in range(len(brown_words[i])):
                tmp.append((brown_words[i][j].decode('utf-8'),
                            brown_tags[i][j].decode('utf-8')))
            train_words_tags.append(tmp)

        ct.train(train_words_tags, u'model.crf.tagger')

        # open Brown development data (question 5)
        infile = open(DATA_PATH + item + "_test.txt", "r")
        brown_dev = infile.readlines()
        infile.close()

        # format Brown development data here
        tests_words = []
        for sentence in brown_dev:
            tests_words.append([i for i in sentence.split(" ")[:-1]])

        result_cfg = ct.tag_sents(tests_words)
        with open(OUTPUT_PATH + item + "_CFG.txt", "w") as file:
            for line in result_cfg:
                for word in line:
                    file.write(word[0] + "/" + word[1] + " ")
                file.write("\n")

        # print total time to run Part B (time.clock() was removed in Python 3.8)
        print("Part B time: ", str(time.process_time()), ' sec')
Example No. 20
with open('inputText.txt', 'r') as myfile:
    data = myfile.read().replace('\n', '')

import nltk
tokens = nltk.word_tokenize(data)

from nltk.tag import CRFTagger
ct = CRFTagger()
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
hasil = ct.tag_sents([tokens])

tagging = ""
for tokenTag in hasil[0]:
    token, tag = tokenTag
    tagging += token + "\t" + tag + "\n"

with open("outputText.txt", "w") as text_file:
    text_file.write(tagging)
Example No. 21
class Chunker:
    UNIQ = '_UNIQUE_STRING_'
    CHUNK_PARSER = None
    """
	"""
    def __init__(self):
        # Load the pre-trained POS-tagger data
        uni, bi, tri, word = self.load_obj("tagger")
        self.TAGGER1 = Tagger(uni, bi, tri, word)

        # Load the pre-trained POS-tagger data
        uni2, bi2, tri2, word2 = self.load_obj("tagger2")
        self.TAGGER2 = Tagger(uni2, bi2, tri2, word2)

        self.TAGGER3 = CRFTagger()
        self.TAGGER3.set_model_file(
            'postagg/dataset/all_indo_man_tag_corpus_model.crf.tagger')

        # Load the chunker grammar data
        self.load_chunker()

    """
	"""

    def load_obj(self, name):
        with open('postagg/' + 'obj/' + name + '.pkl', 'rb') as f:
            return pickle.load(f)

    """
	Format a string into a regex
	"""

    def format_to_re(self, format):
        parts = (format % MarkPlaceholders()).split(self.UNIQ)
        for i in range(0, len(parts), 2):
            parts[i] = re.escape(parts[i])

        return ' '.join(parts).replace('\\', '')

    """
	Convert a POS-tag tree into a chunk tree
	"""

    def tree_to_str(self, tree_data):
        ne_in_sent = []
        for subtree in tree_data:
            if type(subtree
                    ) == Tree:  # If subtree is a noun chunk, i.e. NE != "O"
                ne_label = subtree.label()
                ne_string = " ".join(
                    [token for token, pos in subtree.leaves()])
                ne_in_sent.append((ne_string, ne_label))
            else:
                ne_in_sent.append((subtree[0], subtree[1]))

        return ne_in_sent

    """
	Load the chunk rules
	"""

    def load_chunker(self):
        try:
            f = open('postagg/dataset/phrase_chunker_grammar_id.txt')
            files = self.format_to_re(f.read())
            grammars = files
            f.close()

            self.CHUNK_PARSER = nltk.RegexpParser(grammars)

        except Exception as e:
            print(str(e))

    """
	Convert a chunk tree into a list of chunks
	in the form of a list of strings
	"""

    def get_only_str(self, tree_chunk):
        output = []
        for chunk, tag in tree_chunk:
            output.append(chunk)

        return output

    """
	Convert a list of chunks (strings) into a string
	with the format: [chunk1] [chunk2] ... [chunkN]
	"""

    def beautify(self, chunks):
        strout = ""
        for s in chunks:
            strout += "[" + s + "] "

        return strout

    """
	POS-tag every word in the sentence
	Chunk the sentence
	Return the chunk Tree
	"""

    def chunk_me1(self, _str):
        return self.CHUNK_PARSER.parse(
            self.TAGGER1.tagSentence(_str.split(" ")))

    """
	POS-tag every word in the sentence
	Chunk the sentence
	Return the chunk Tree
	"""

    def chunk_me2(self, _str):
        return self.CHUNK_PARSER.parse(
            self.TAGGER2.tagSentence(_str.split(" ")))

    """
	"""

    def chunk_me3(self, _str):
        _strs = _str.split(" ")
        strs = []
        for s in _strs:
            strs.append(s)

        return self.CHUNK_PARSER.parse(self.TAGGER3.tag_sents([strs])[0])
Example No. 22
    while line:
        words=line.replace("\r","").replace("\n","").split("\t")
        #print(words)
        if(len(words)<2):
            test_actual.append(test)
            test_sentences.append(sentence)
            test=[]
            sentence=[]
        else:
            tup1=(words[0],words[1])
            sentence.append(words[0])
            test.append(tup1)
        line=f.readline()
    f.close()

res = ct.tag_sents(test_sentences)
tagged_result = []
tagged_actual = []
for i in range(len(res)):
    for j in range(len(res[i])):
        tagged_result.append(res[i][j][1])
        tagged_actual.append(test_actual[i][j][1])
print res[0]
print test_actual[0]
#print tagged_result[0]
#print tagged_actual[0]

gold_sentences=test_actual
accuracy = ct.evaluate(gold_sentences)
print "accuracy:"+str(accuracy)
Example No. 23
class NERFeatureExtractor:
    def read_label_file(self, filename):
        return open(filename).read().split('\n')

    def __init__(self, iob_predictor):
        self.iob_predictor = iob_predictor
        self.stemmer = StemmerFactory().create_stemmer()
        self.TAGGER3 = CRFTagger()
        self.TAGGER3.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
        self.label_words = self.read_label_file('label-words.txt')
        self.label_posses = self.read_label_file('label-posses.txt')
        self.label_lemmas = self.read_label_file('label-lemmas.txt')
        self.label_iob_feature = self.read_label_file('label-iob_feature.txt')
        self.label_iob_classes = self.read_label_file('label-iob_classes.txt')

    def getPOSTag(self, _temporary_tokens):
        strin = []
        for token_tag in _temporary_tokens:
            strin.append(unicode(token_tag.decode('utf-8')))

        return [(token.encode('ascii',
                              'ignore'), tag.encode('ascii', 'ignore'))
                for (token, tag) in self.TAGGER3.tag_sents([strin])[0]]

    def features(self, tokens, index, history):
        # print history
        # print tokens
        """
		`tokens`  = a POS-tagged sentence [(w1, t1), ...]
		`index`   = the index of the token we want to extract features for
		`history` = the previous predicted IOB tags
		"""

        # Pad the sequence with placeholders
        tokens = [
            ('[START2]', '[START2]'), ('[START1]', '[START1]')
        ] + list(tokens) + [('[END1]', '[END1]'), ('[END2]', '[END2]')]
        history = ['[START2]', '[START1]'] + list(history)

        # shift the index with 2, to accommodate the padding
        index += 2

        word, pos = tokens[index]
        prevword, prevpos = tokens[index - 1]
        prevprevword, prevprevpos = tokens[index - 2]
        nextword, nextpos = tokens[index + 1]
        nextnextword, nextnextpos = tokens[index + 2]
        previob = history[index - 1]
        contains_dash = '-' in word
        contains_dot = '.' in word
        # True only when every character is a lowercase ascii letter
        allascii = all(c in string.ascii_lowercase for c in word)

        allcaps = word == word.capitalize()
        capitalized = word[0] in string.ascii_uppercase

        prevallcaps = prevword == prevword.capitalize()
        prevcapitalized = prevword[0] in string.ascii_uppercase

        nextallcaps = nextword == nextword.capitalize()
        nextcapitalized = nextword[0] in string.ascii_uppercase

        return [
            word,
            str(self.stemmer.stem(word)),
            str(pos),
            str(allascii),
            str(nextword),
            str(self.stemmer.stem(nextword)),
            str(nextpos),
            str(nextnextword),
            str(nextnextpos),
            str(prevword),
            str(self.stemmer.stem(prevword)),
            str(prevpos),
            str(prevprevword),
            str(prevprevpos),
            str(previob),
            str(contains_dash),
            str(contains_dot),
            str(allcaps),
            str(capitalized),
            str(prevallcaps),
            str(prevcapitalized),
            str(nextallcaps),
            str(nextcapitalized)
        ]

    def normalizeFeature(self, featx):
        out = []
        if featx[0] in self.label_words:
            out.append(self.label_words.index(featx[0]))
        else:
            out.append(-1)

        if featx[1] in self.label_lemmas:
            out.append(self.label_lemmas.index(featx[1]))
        else:
            out.append(-1)

        if featx[2] in self.label_posses:
            out.append(self.label_posses.index(featx[2]))
        else:
            out.append(-1)

        out.append(1 if featx[3] else 0)

        if featx[4] in self.label_words:
            out.append(self.label_words.index(featx[4]))
        else:
            out.append(-1)

        if featx[5] in self.label_lemmas:
            out.append(self.label_lemmas.index(featx[5]))
        else:
            out.append(-1)

        if featx[6] in self.label_posses:
            out.append(self.label_posses.index(featx[6]))
        else:
            out.append(-1)

        if featx[7] in self.label_words:
            out.append(self.label_words.index(featx[7]))
        else:
            out.append(-1)

        if featx[8] in self.label_posses:
            out.append(self.label_posses.index(featx[8]))
        else:
            out.append(-1)

        if featx[9] in self.label_words:
            out.append(self.label_words.index(featx[9]))
        else:
            out.append(-1)

        if featx[10] in self.label_lemmas:
            out.append(self.label_lemmas.index(featx[10]))
        else:
            out.append(-1)

        if featx[11] in self.label_posses:
            out.append(self.label_posses.index(featx[11]))
        else:
            out.append(-1)

        if featx[12] in self.label_words:
            out.append(self.label_words.index(featx[12]))
        else:
            out.append(-1)

        if featx[13] in self.label_posses:
            out.append(self.label_posses.index(featx[13]))
        else:
            out.append(-1)

        if featx[14] in self.label_iob_feature:
            out.append(self.label_iob_feature.index(featx[14]))
        else:
            out.append(-1)

        out.append(1 if featx[15] else 0)
        out.append(1 if featx[16] else 0)
        out.append(1 if featx[17] else 0)
        out.append(1 if featx[18] else 0)
        out.append(1 if featx[19] else 0)
        out.append(1 if featx[20] else 0)
        out.append(1 if featx[21] else 0)
        out.append(1 if featx[22] else 0)

        return out

    def parseEntityName(self, _sent=""):
        tokens = self.getPOSTag(_sent.split())
        history = []
        self.res_all = []
        last_feature = []
        for i in range(len(tokens)):
            last_feature = self.features(tokens, i, history)
            iob_res = self.iob_predictor([self.normalizeFeature(last_feature)
                                          ])[0]
            history.append(iob_res)
            self.res_all.append((tokens[i], self.label_iob_classes[iob_res]))
Example No. 24
def tag_strings(path_to_model, tokenized_string):
    ct = CRFTagger()
    ct.set_model_file(path_to_model)
    tagged_strings = ct.tag_sents([tokenized_string])
    # print("Tagged Strings:", tagged_strings)
    return tagged_strings
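A brief usage sketch for the helper above; the model path and the token list are placeholders, not from the original source:

tagged = tag_strings('model.crf.tagger', ['Saya', 'bekerja', 'di', 'Bandung'])
print(tagged)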
Example No. 25
 sentences = ""
 for line in lines:
     arr = re.findall(r"[a-zA-Z]+", line)
     sentences = sentences + " " + " ".join([w for w in arr])
 paragraph_nouns = []
 if sentences.strip():
     for s in sentences.split("."):
         try:
             # s = remove_numbers(s)
             # s = remove_punctuation(s)
             # s = remove_stopwords(s)
             # s = remove_english_stopwords(s)
             s = remove_single_char(s)
             # s = stem_text(s)
             # s = stem_english_text(s)
             hasil = ct.tag_sents([s.split()])
             temp_noun = ""
             sentence_nouns = []
             prev_pos = ""
             for text, pos in hasil[0]:
                 # print("{}:{}".format(text,pos))
                 if (pos == "NN" or pos == "NNP") and (prev_pos == "NN" or
                                                       prev_pos == "NNP"):
                     if len(temp_noun.split()) < 2:
                         temp_noun = temp_noun + " " + text
                         temp_noun = str(temp_noun).lower()
                 elif (pos != "NN" or pos != "NNP") and (prev_pos == "NN" or
                                                         prev_pos == "NNP"):
                     if temp_noun:
                         temp_noun = remove_punctuation(temp_noun)
                         total_nouns.append(temp_noun)
Example No. 26
from nltk.tag import CRFTagger
ct = CRFTagger()
ct.set_model_file('model/all_indo_man_tag_corpus_model.crf.tagger')
hasil = ct.tag_sents([['Saya', 'bekerja', 'di', 'Bandung']])
Example No. 27
# In[26]:

TAGGER_PATH = "crfpostagger"   # pre-trained POS-tagger


# In[27]:

tagger = CRFTagger()  # initialize tagger
tagger.set_model_file(TAGGER_PATH)


# In[30]:

# try some sentences out - they must all be unicode strings; the model was trained on lower case
print(tagger.tag([u"i", u"like", u"revision"]))
print(tagger.tag([u"i", u"like", u"natural", u"language", u"processing"]))


# In[31]:

# scaling up to sentences as you might get them in text - make sure they are unicode and lower case
sentences = ["I like revision",
            "I like Natural Language Processing"]
print(tagger.tag_sents([unicode(word.lower()) for word in s.split()] for s in sentences))
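
A hypothetical Python 3 variant of the call above; there str is already unicode, so the unicode() wrapper can simply be dropped:

print(tagger.tag_sents([[word.lower() for word in s.split()] for s in sentences]))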


Example No. 28
from nltk.tag import CRFTagger

jumSample = 500000
namaFile = "Indonesian_Manually_Tagged_Corpus.tsv"
with open(namaFile, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

pasangan = []
allPasangan = []

for line in lines[:min(jumSample, len(lines))]:
    if line == '':
        allPasangan.append(pasangan)
        pasangan = []
    else:
        kata, tag = line.split('\t')
        p = (kata, tag)
        pasangan.append(p)

ct = CRFTagger()
ct.train(allPasangan, 'all_indo_man_tag_corpus_model.crf.tagger')
# test
hasil = ct.tag_sents([['Saya', 'bekerja', 'di', 'Bandung'],
                      ['Nama', 'saya', 'Yudi']])
print(hasil)
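
A follow-up sketch, assuming the training call above has written the model file: the saved tagger can later be reloaded without retraining.

ct2 = CRFTagger()
ct2.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')
print(ct2.tag(['Saya', 'bekerja', 'di', 'Bandung']))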
Example No. 29

#CRF Training
ct = CRFTagger()
print "CRF Training starts..."
ct.train(ListOfSentences_Training,'model.crf.tagger')
print "CRF Training is done."

print "Testing starts"
print "Accuracy of CRF is = ",ct.evaluate(ListOfSentences_Test) * 100
#Tagging by CRF Tagger
ch = 'y'
while (ch != 'n'):
    text = raw_input("Enter the text to be tagged : \n")
    text = converter(text)
    print ct.tag_sents(text)
    print "\nDo you want to continue ?"
    ch = raw_input()
 



#HMM Training

print "HMM Training using HiddenMarkovModelTrainer() starts.."
hmmTrain1 = HiddenMarkovModelTrainer().train_supervised(ListOfSentences_Training)
print "Training is completed.\n"

print "Testing starts now.."
hmmTrain1.test(ListOfSentences_Test)
print "Testing is completed.."
Example No. 30
    crf_tagger = CRFTagger()

    tnt_tagger.train(training_data[ki])
    hmm_tagger = nltk.HiddenMarkovModelTagger.train(training_data[ki])
    perc_tagger.train(training_data[ki])
    crf_tagger.train(training_data[ki], 'model.crf.tagger')

    # t.tagdata(test_data[800:])

    perc_pred = []
    hmm_pred = []

    for i in testing_data[ki]:
        perc_pred.append(perc_tagger.tag(i))
        hmm_pred.append(hmm_tagger.tag(i))
    crf_pred = crf_tagger.tag_sents(testing_data[ki])
    tnt_pred = tnt_tagger.tagdata(testing_data[ki])
    pred = {'p': perc_pred, 'h': hmm_pred, 'c': crf_pred, 't': tnt_pred}

    def most_frequent(List):
        return max(set(List), key=List.count)

    import itertools

    def picker(tag_seq, i, j):
        tags = []
        for k in tag_seq:
            tags.append(pred[k][i][j][1])
        return tags, most_frequent(tags)

    s = 'phct'
Example No. 31
def onsentencelist():
        ct = CRFTagger()

        """sentencelist contains nertaged sentences"""
        sentencelist = pickle.load(open('sentencelist.pickle','rb'))

        """training size as percentage"""
        trainingsize = 0.9

        """ calculate where to split data """
        limit = round(trainingsize*len(sentencelist))

        """wordsentencelist contains the same sentences not ner-tagged"""
        wordsentencelist = pickle.load(open("wordsentencelist.pickle","rb"))

        
        """train the data / choose one of the 2 blocks """
        #train_data = sentencelist[:limit]
        #ct.train(train_data,'model.crf.tagger')
        ct.set_model_file('tweetmodel.crf.tagger')
        

        """Test data and evaluate"""
        test_data = wordsentencelist[limit:]
        ct.tag_sents(test_data) # tagging sentences
        gold_sentences = sentencelist[limit:]
        print("\nAccuracy:", ct.evaluate(gold_sentences))


        """ TURN TRAINED TAGGED LIST AND TEST LIST INTO ONE LIST CONTAINING
        ONLY THE TRUE AND PREDTAGS"""
        pred_nerlist = []
        for sentence in wordsentencelist[:limit]:
                for (word,nertag) in ct.tag(sentence):
                        #pred_nerlist.append((word,nertag))
                        pred_nerlist.append(nertag.lower())
                        
        true_nerlist = []
        #ct_true = gold_sentences
        for sentence in sentencelist[:limit]:
                for (word,nertag) in sentence:
                        #true_nerlist.append((word,nertag))
                        true_nerlist.append(nertag.lower())
        
        """ Print baseline """
        #print("\nBaseline = 0.9048987094135446 (everything tagged O)")

        """ Print F-score and confusion matrix """
        #print(len(pred_nerlist))
        #print(len(true_nerlist))
        print("\nF-score (micro):", f1_score(true_nerlist, pred_nerlist, average='micro') )
        print("\nF-score (macro):", f1_score(true_nerlist, pred_nerlist, average='macro') )
        print("\nF-score (weighted):", f1_score(true_nerlist, pred_nerlist, average='weighted') )
        print("\nF-score (None):", f1_score(true_nerlist, pred_nerlist, average=None, labels=["o","b-per","i-per","b-loc","i-loc","b-org","i-org","b-misc","i-misc"]))
        
        
        print("\nConfusion matrix:\n")
        for item in ["O","B-per","I-per","B-loc","I-loc","B-org","I-org","B-misc","I-misc"]: print("  ",item,end="")
        print("\n",confusion_matrix(true_nerlist, pred_nerlist,labels = ["o","b-per","i-per","b-loc","i-loc","b-org","i-org","b-misc","i-misc"]))
Example No. 32
from nltk.tag import CRFTagger
ct = CRFTagger()

train_data = [[('Universiteit', 'Noun'), ('is', 'Verb'), ('een', 'Det'),
               ('goed', 'Adj'), ('goede', 'Adj'), ('plek', 'Noun'),
               ('hond', 'Noun'), ('eet', 'Verb'), ('vlees', 'Noun')]]

ct.train(train_data, 'model.crf.tagger')
ct.tag_sents([['hond', 'is', 'goed'], ['kat', 'eet', 'vlees']])

gold_sentences = [[('hond', 'Noun'), ('is', 'Verb'), ('goed', 'Adj')],
                  [('kat', 'Noun'), ('eet', 'Verb'), ('vlees', 'Noun')]]
ct.evaluate(gold_sentences)

ct = CRFTagger()
ct.set_model_file('model.crf.tagger')
print(ct.evaluate(gold_sentences))
Example No. 33
class DataAdapter(object):
    def __init__(self, data=[]):
        self.tagger = CRFTagger()
        self.tagger.set_model_file('model.crf.tagger')

        if data.count(True) > 0:
            self.data_tagging, self.data_testing = self.for_tagging_testing(
                data)
            # print('TAGGING', self.data_tagging)
            # print('TESTING', self.data_testing)

    def tokenize_tag(self, text):
        text = text.replace('\r', ' | ').replace('\n', ' | ')
        tokens = word_tokenize(text, preserve_line=True)
        labels = []
        for label in self.tag(tokens):
            labels.append(label[1])
        return tokens, labels

    def for_tagging_testing(self, data):
        # self.data = data
        array_tagging = []
        array_testing = []
        for d in data:
            all_tags = []
            all_test = []
            for index, t in enumerate(d['text']):
                one_tag = [t, d['label'][index]]
                all_test.append(one_tag)
                all_tags.append(t)
            array_tagging.append(all_tags)
            array_testing.append(all_test)
            # print(all_tags)
        return array_tagging, array_testing

    def for_testing(self, data):
        # self.data = data
        array = []
        # print('TEST', data.count())
        for d in data:
            all_tags = []
            for index, t in enumerate(d['text']):
                # one_tag = [t, (d['label'][index] if is_ascii(d['label'][index]) else 'O')]
                one_tag = [t, d['label'][index]]
                all_tags.append(one_tag)
            array.append(all_tags)
            # print(all_tags)
        return array

    def for_tagging(self, data):
        # self.data = data
        array = []
        for d in data:
            all_tags = []
            for t in d['text']:
                all_tags.append(t)
            array.append(all_tags)
            # print(all_tags)
        return array

    def tag_sents(self):
        if self.data_tagging is not None:
            return self.tagger.tag_sents(self.data_tagging)
        else:
            return 'NoData'

    def tag(self, data):
        return self.tagger.tag(data)

    def evaluate(self):
        if self.data_testing is not None:
            return self.tagger.evaluate(self.data_testing)
        else:
            return 'NoData'

    def train(self, data):
        data = self.for_testing(data)
        self.tagger.train(data, 'model.crf.tagger')
        print('ACCURACY:', self.tagger.evaluate(data))
Example No. 34
def tagger(msg):
    ct = CRFTagger()
    ct.set_model_file('model/all_indo_man_tag_corpus_model.crf.tagger')
    hasil = ct.tag_sents([split(msg)])
    return hasil[0]
Example No. 35
    # Stop Words
    StopWordFactory = StopWordRemoverFactory()
    StopWord = StopWordFactory.create_stop_word_remover()
    # Stemming
    StemFactory = StemmerFactory()
    Stemmer = StemFactory.create_stemmer()
    # pos tagging
    ct = CRFTagger()
    ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

    # # stop word
    # stop = StopWord.remove(kalimat)
    #tokenize
    tokenize = nltk.tokenize.word_tokenize(response)
    # pos tagging
    tag = ct.tag_sents([tokenize])

    print(tag)
    # print(direct)

    # nltk
    for i in tag[0]:
        # http request
        for DeviceData in DeviceDirect['data']:
            if (i[1] == 'NN'):
                # look for an NN token for the Device
                if (tokenize[j] == DeviceData['device_category']):
                    # result of the NN match, comparing the device against the database
                    id = DeviceData['id']
                    Device = DeviceData['device_category']
                    # print(