        templates, deterministic=True)
    return trainer.train(training, **kwargs)


# Brill tagger using the previous backoff chain tagger
br_tagger = train_brill_tagger(bc_tagger, train_sents)
accuracy = br_tagger.evaluate(test_sents)
print(f"Accuracy of the brill tagger: {accuracy}\n")

# Saving pickle
with open('pickles/pos-taggers/brill_tagger.pickle', 'wb') as file:
    pickle.dump(br_tagger, file)

# TnT tagger with default tagger for unknown words
tnt_tagger = tnt.TnT(unk=df_tagger, Trained=True, N=200)
tnt_tagger.train(train_sents)
accuracy = tnt_tagger.evaluate(test_sents)
print(f"Accuracy of the tnt tagger: {accuracy}\n")

# Saving pickle
with open('pickles/pos-taggers/tnt_tagger.pickle', 'wb') as file:
    pickle.dump(tnt_tagger, file)


# Tagging using the wordnet
class WordNetTagger(SequentialBackoffTagger):
    """
    Class implementation of the wordnet tagger
    """
    def __init__(self, *args, **kwargs):
def run_test(my_corpus):
    if my_corpus == treebank:
        print 'Corpus Info:'
        print ' Corpus: treebank'
        print ' Tagged Sents:', len(my_corpus.tagged_sents())
        print ' Tagged Words:', len(my_corpus.tagged_words())
        my_tagged_sents = my_corpus.tagged_sents()
        my_sents = my_corpus.sents()
    elif my_corpus == brown:
        print 'Corpus Info:'
        print ' Corpus: brown'
        print ' Tagged Sents:', len(my_corpus.tagged_sents())
        print ' Tagged Words:', len(my_corpus.tagged_words())
        #print ' Tagged Sents (news):', len(my_corpus.tagged_sents(categories='news'))
        #print ' Tagged Words (news):', len(my_corpus.tagged_words(categories='news'))
        #my_tagged_sents = my_corpus.tagged_sents(categories='news')
        #my_sents = my_corpus.sents(categories='news')
        print ' Tagged Sents :', len(my_corpus.tagged_sents())
        print ' Tagged Words :', len(my_corpus.tagged_words())
        my_tagged_sents = my_corpus.tagged_sents()
        my_sents = my_corpus.sents()
    else:
        return

    fold = 5
    print 'Performing', fold, 'fold cross validation on corpus ...'
    train_accuracy = []
    test_accuracy = []
    train_runtime = []
    test_runtime = []

    for k in range(fold):
        train_data = [x for i, x in enumerate(my_tagged_sents) if i % fold != k]
        validation_data = [x for i, x in enumerate(my_tagged_sents) if i % fold == k]
        #test_data = [x for i, x in enumerate(my_sents) if i % fold == k]
        print 'Fold', k, 'has', len(train_data), 'train sentences and', len(validation_data), 'test sentences'

        tnt_pos_tagger = tnt.TnT()
        begin = time.time()
        tnt_pos_tagger.train(train_data)
        end = time.time()
        train_acc = tnt_pos_tagger.evaluate(train_data)
        train_accuracy.append(train_acc)
        train_runtime.append(end - begin)
        print ' Train accuracy =', train_acc, ' runtime =', end - begin

        begin = time.time()
        test_acc = tnt_pos_tagger.evaluate(validation_data)
        end = time.time()
        test_accuracy.append(test_acc)
        test_runtime.append(end - begin)
        print ' Test accuracy =', test_acc, ' runtime =', end - begin

    print 'Results:'
    print '%15s %15s %15s %15s %15s' % ('Fold', 'Train-Accuracy', 'Train-Runtime', 'Test-Accuracy', 'Test-Runtime')
    for k in range(fold):
        print '%15d %15.3f%% %15.5f %15.3f%% %15.5f' % (
            k, train_accuracy[k] * 100, train_runtime[k], test_accuracy[k] * 100, test_runtime[k])

    avg_train_acc = sum(train_accuracy) / len(train_accuracy)
    avg_train_runtime = sum(train_runtime) / len(train_runtime)
    avg_test_acc = sum(test_accuracy) / len(test_accuracy)
    avg_test_runtime = sum(test_runtime) / len(test_runtime)
    print '%15s %15.3f%% %15.5f %15.3f%% %15.5f' % (
        'Average', avg_train_acc * 100, avg_train_runtime, avg_test_acc * 100, avg_test_runtime)
    return
        brill.Template(brill.Word([2])),
        brill.Template(brill.Word([-2, -1])),  # you can look at the combination of the previous two words to learn a transformation rule
        brill.Template(brill.Word([1, 2])),
        brill.Template(brill.Word([-3, -2, -1])),
        brill.Template(brill.Word([1, 2, 3])),
        brill.Template(brill.Word([-1]), brill.Word([1])),
    ]
    trainer = brill_trainer.BrillTaggerTrainer(initial_tagger, templates,
                                               deterministic=True)
    return trainer.train(train_sents, **kwargs)


defaultTagger = DefaultTagger('NN')
initialTagger = backoff_tagger(brown_train_sents,
                               [UnigramTagger, BigramTagger, TrigramTagger],
                               backoff=defaultTagger)
brillTagger = train_brill_tagger(initialTagger, brown_train_sents)

tnt_tagger = tnt.TnT(N=100)
tnt_tagger.train(brown_train_sents)

bigramTagger = BigramTagger(brown_train_sents)
trigramTagger = TrigramTagger(brown_train_sents)

print("------------Recommended Tagger------------")
print(nltk.pos_tag(sent))

print("------------Default Tagger------------")
print(defaultTagger.tag(sent))

print("------------Unigram Tagger Overrode------------")
unigramTagger = UnigramTagger(model={'Pierre': 'NN'})
print(unigramTagger.tag(sent))
trainer = []
for r, d, f in os.walk(source_txt):
    trainer.append(f)

text = '\n\n'.join([get_tags(source_txt + tr) for tr in trainer[0]])
list_of_tags = Pos(text).get_words()
list_of_tags.sort()
to_train = list(list_of_tags for list_of_tags, _ in itertools.groupby(list_of_tags))

all_wrd = np.asarray(Pos(text).get_words())[:, 0]
target_pos = np.asarray(Pos(text).get_words())[:, 1]

tnt_tot = tnt.TnT()
tnt_tot.train([list(zip(list(all_wrd), list(target_pos)))])

with open(path_to_save_model + 'tnt.pkl', 'wb') as h:
    pickle.dump(tnt_tot, h)

if dedup:
    all_wrd = np.asarray(to_train)[:, 0]
    target_pos = np.asarray(to_train)[:, 1]

### filter POS to keep only values that appear more than 10 times
all_string = [x for x in target_pos]
def hindi_model():
    train_data = indian.tagged_sents('hindi.pos')
    tnt_pos_tagger = tnt.TnT()
    tnt_pos_tagger.train(train_data)
    return tnt_pos_tagger
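
# A minimal usage sketch for hindi_model() above, not part of the original
# snippet: it assumes the NLTK 'indian' corpus is available
# (nltk.download('indian')) and that `raw_hindi_text` is a hypothetical
# placeholder for a real Hindi input string.
if __name__ == '__main__':
    import nltk
    tagger = hindi_model()
    raw_hindi_text = "..."  # placeholder; replace with real Hindi text
    print(tagger.tag(nltk.word_tokenize(raw_hindi_text)))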
def updateScore(self, score):
    self.score += score


if __name__ == '__main__':
    file = codecs.open("hindi_text.txt", "r", "utf-8")
    # Read the contents of the file into memory.
    train_data_file = file.read()
    file.close()
    train_data1 = train_data_file.splitlines()

    file1 = codecs.open("hindi_output.txt", "w+", "utf-8")

    train_data = indian.tagged_sents('hindi.pos')
    tnt_pos_tagger = tnt.TnT()
    tnt_pos_tagger.train(train_data)  # Training the tnt Part of speech tagger with hindi data

    for line in train_data1:
        s = tnt_pos_tagger.tag(nltk.word_tokenize(line))
        for x in s:
            for j in x:
                file1.write(j + " ")
        file1.write("\n")

    # Create dictionary  key: Fixed Part  Value: Idiom
    db_fp = defaultdict()
    # Create set containing CNFP
    db_cnfp = defaultdict(set)
    # Create set containing extra words
    db_extra_words = defaultdict(set)
          format(sys.argv[0]), file=sys.stderr)
    sys.exit(1)

posField = int(sys.argv[5]) - 1

with open(sys.argv[1], encoding='UTF-8') as FP_train,\
     open(sys.argv[2], encoding='UTF-8') as FP_test,\
     open(sys.argv[3], 'w', encoding='UTF-8') as FP_out:
    # XXX Unk not handled...
    # In Brants' version the default is:
    # sparse data : linear interpolation
    # unknown mode: statistics of singletons
    # using suffix trie up to length 10
    # case of characters is significant (Handled)
    tagger = tnt.TnT(C=True)
    sents = []
    sent = []

    print("Adding sentences...", file=sys.stderr)
    for line in FP_train:
        line = line.strip().split()
        if len(line) == 0:
            sents.append(sent)
            sent = []
        else:
            sent.append((line[posField], line[-1]))

    print("Training...", file=sys.stderr)
    tagger.train(sents)

    print("Tagging...", file=sys.stderr)
    sent = []
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    unigram_accuracies = []
    bigram_accuracies = []
    trigram_accuracies = []
    backoff_accuracies = []
    tnt_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list
    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i+n]

    # a list of 10 lists with ~347 sentences each
    ten_parts = list(chunks(pos_set, tenth))

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part

        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]

        # next concatenate the list together into 1 file (http://stackoverflow.com/a/952952)
        training_set = [item for sublist in training_set_lists for item in sublist]

        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()

        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()

        print('Loop #' + str(counter))

        # make and evaluate unigram tagger
        unigram_tagger = UnigramTagger(train_sents)
        unigram_accuracy = unigram_tagger.evaluate(test_sents)
        unigram_accuracies.append(unigram_accuracy)
        print('Unigram:', unigram_accuracy)

        # make and evaluate bigram tagger
        bigram_tagger = BigramTagger(train_sents)
        bigram_accuracy = bigram_tagger.evaluate(test_sents)
        bigram_accuracies.append(bigram_accuracy)
        print('Bigram:', bigram_accuracy)

        # make and evaluate trigram tagger
        trigram_tagger = TrigramTagger(train_sents)
        trigram_accuracy = trigram_tagger.evaluate(test_sents)
        trigram_accuracies.append(trigram_accuracy)
        print('Trigram:', trigram_accuracy)

        # make and evaluate 1, 2, 3-gram backoff tagger
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger3 = TrigramTagger(train_sents, backoff=tagger2)
        backoff_accuracy = tagger3.evaluate(test_sents)
        backoff_accuracies.append(backoff_accuracy)
        print('1, 2, 3-gram backoff:', backoff_accuracy)

        # make and evaluate tnt tagger
        tnt_tagger = tnt.TnT()
        tnt_tagger.train(train_sents)
        tnt_accuracy = tnt_tagger.evaluate(test_sents)
        tnt_accuracies.append(tnt_accuracy)
        print('TnT:', tnt_accuracy)

    final_accuracies_list = []

    mean_accuracy_unigram = mean(unigram_accuracies)
    standard_deviation_unigram = stdev(unigram_accuracies)
    uni = {'unigram': {'mean': mean_accuracy_unigram, 'sd': standard_deviation_unigram}}
    final_accuracies_list.append(uni)

    mean_accuracy_bigram = mean(bigram_accuracies)
    standard_deviation_bigram = stdev(bigram_accuracies)
    bi = {'bigram': {'mean': mean_accuracy_bigram, 'sd': standard_deviation_bigram}}
    final_accuracies_list.append(bi)

    mean_accuracy_trigram = mean(trigram_accuracies)
    standard_deviation_trigram = stdev(trigram_accuracies)
    tri = {'trigram': {'mean': mean_accuracy_trigram, 'sd': standard_deviation_trigram}}
    final_accuracies_list.append(tri)

    mean_accuracy_backoff = mean(backoff_accuracies)
    standard_deviation_backoff = stdev(backoff_accuracies)
    back = {'1, 2, 3-gram backoff': {'mean': mean_accuracy_backoff, 'sd': standard_deviation_backoff}}
    final_accuracies_list.append(back)

    mean_accuracy_tnt = mean(tnt_accuracies)
    standard_deviation_tnt = stdev(tnt_accuracies)
    tnt_score = {'tnt': {'mean': mean_accuracy_tnt, 'sd': standard_deviation_tnt}}
    final_accuracies_list.append(tnt_score)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)
    return final_dict
accuracys = []
accuracysSolo = []

# In[ ]:

for i in range(-1, -5, -1):
    # Without TnT
    affix_tagger = AffixTagger(train=train_reducido, affix_length=i)
    evalSolo = affix_tagger.evaluate(test_reducido)
    accuracysSolo.append(evalSolo)
    print("Suavizado Solo con Affix_Length = ", i, " Accuracy: ", evalSolo)

    # With TnT
    tnt_tagging = tnt.TnT(unk=affix_tagger, Trained=True)
    tnt_tagging.train(train_reducido)
    evaluacion = tnt_tagging.evaluate(test_reducido)
    accuracys.append(evaluacion)
    print("TnT Con suavizado Affix_Length = ", i, " Accuracy: ", evaluacion)

# In[ ]:

print(accuracys)

# In[ ]:
def nepali_model():
    data_path = os.path.join(os.getcwd(), 'data/nepali.pos')
    train_data = indian.tagged_sents(data_path)
    tnt_pos_tagger = tnt.TnT()
    tnt_pos_tagger.train(train_data)
    return tnt_pos_tagger
from nltk.tag import tnt, pos_tag
from nltk.corpus import indian
from nltk.tokenize import TreebankWordTokenizer
import codecs
from googletrans import Translator
import re
import pandas as pd

translator = Translator()

train_data = indian.tagged_sents('hindi.pos')
t = tnt.TnT()
t.train(train_data)

filename = "text_regional.txt"
with codecs.open(filename, encoding="utf-8") as file:
    data = file.read()

with codecs.open("final_stopwords.txt", encoding="utf-8") as file:
    stopwordstext = file.read()

#stopwords = list(stopwords.words('hindi'))
stopwords = stopwordstext.split("\n")

words = [
    w for w in TreebankWordTokenizer().tokenize(data)
    if w not in stopwords and w not in ["|", ".", ","]
]

tags = t.tag(words)
tag_list = []
for i, j in tags:
    if j == "Unk":
        tr = translator.translate(i)
from src.appconfig import ApplicationConfig
import nltk
from nltk.tag import tnt
import pickle

#---------------------------------------Global Variables
TNT_POS_TAGGER = tnt.TnT()
#-------------------------------------------------------#


def loadPOSTaggerModelFromDisk():
    fileRef = open(ApplicationConfig.POS_TAGGER_MODEL_PICKLE_PATH, "rb")
    tnt_pos_tagger = pickle.load(fileRef)
    fileRef.close()
    return tnt_pos_tagger


def performPOSTagging(tokenList):
    #tokenList = nltk.word_tokenize("അകത്തി അടിമയായിത്തീരുക നശിക്കുക അരികെ പാളി ഉച്ചത്തിലുള്ള യായിത്")
    tnt_pos_tagger = loadPOSTaggerModelFromDisk()
    taggedOutput = tnt_pos_tagger.tag(tokenList)
    return taggedOutput


# ------------------------- TESTING MODULE ------------#
def testPosTagging():
    tokenList = nltk.word_tokenize(
from nltk.tag import tnt, RegexpTagger, DefaultTagger
from tag_util import train_sents, test_sents, patterns

tnt_tagger = tnt.TnT()
tnt_tagger.train(train_sents)
print(tnt_tagger.evaluate(test_sents))
# 0.875631340384

# deal with unknown tokens
default_tagger = DefaultTagger('NN')
unk_tagger = RegexpTagger(patterns, backoff=default_tagger)

tnt_tagger2 = tnt.TnT(unk=unk_tagger, Trained=True)
tnt_tagger2.train(train_sents)
print(tnt_tagger2.evaluate(test_sents))
# 0.896956615584
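
# An optional follow-up, an assumption rather than part of the original
# snippet: persist the trained tagger with pickle so it does not have to be
# retrained on every run. The file name 'tnt_unk_tagger.pickle' is hypothetical.
import pickle

with open('tnt_unk_tagger.pickle', 'wb') as f:
    pickle.dump(tnt_tagger2, f)

with open('tnt_unk_tagger.pickle', 'rb') as f:
    restored_tagger = pickle.load(f)

print(restored_tagger.evaluate(test_sents))  # should match the score printed above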
def train_hindi_model(model_path):
    train_data = indian.tagged_sents(model_path)
    tnt_pos_tagger = tnt.TnT()
    tnt_pos_tagger.train(train_data)
    return tnt_pos_tagger
import nltk
from nltk.tag import DefaultTagger
from nltk.tag import tnt
from nltk.corpus import treebank

testing = treebank.tagged_sents()[2000:]
training = treebank.tagged_sents()[:7000]

# plain TnT tagger
tnt_tagger = tnt.TnT()
tnt_tagger.train(training)
print(tnt_tagger.evaluate(testing))

# TnT tagger that falls back to a default tagger for unknown words
unknown = DefaultTagger('NN')
tagger_tnt = tnt.TnT(unk=unknown, Trained=True)
tagger_tnt.train(training)
print(tagger_tnt.evaluate(testing))
# for loop splits text into sentences
training_sentences = []
sentence = []
for item in line_list:
    if '<sentence' in item[0]:
        sentence = []
    elif '/sentence' in item[0]:
        training_sentences.append(sentence)
    else:
        mapped_tag = mapping.get(item[2], 'ERROR')
        sentence.append((item[0], mapped_tag))

# initialize and train tagger
print('Training tagger...')
tnt_tagger = tnt.TnT()
tnt_tagger.train(training_sentences)

# Import Swahili development and testing data
print('Parsing test/dev data...')
old_books = open('hcs2_new_news.vrt', 'r').readlines()

# Parse Swahili development and testing files
POS_test_dev = [old_books]

# training and dev sets must be split by text first,
# not just sentences like the training data
all_texts = []
text = []
for lines in POS_test_dev:
from nltk.tag import hmm
from nltk.tag import tnt
import pickle
import numpy

with open('test', 'rb') as fp:
    test = pickle.load(fp)
with open('train', 'rb') as fp:
    train = pickle.load(fp)

# Train the tagger
#tagger_hmm = hmm.HiddenMarkovModelTagger.train(train)
tagger_tnt = tnt.TnT()
tagger_tnt.train(train)

# Evaluate the tagger
#print(tagger_hmm.evaluate(test))
print(tagger_tnt.evaluate(test))

# Tag the words of the test set
words = []
correct = []
for sentence in test:
    for word in sentence:
        words.append(word[0])
        correct.append(word)

#t = tagger_hmm.tag(words)
t = tagger_tnt.tag(words)
def evaluateTnT():
    tagger = tnt.TnT()
    train = int(len(fsents) * 0.9)
    tagger.train(rsents[:train])
    precisionTnT = tagger.evaluate(rsents[train:])
    print(precisionTnT)
def nepali_model():
    train_data = indian.tagged_sents('<path/to/nepali.pos>')
    tnt_pos_tagger = tnt.TnT()
    tnt_pos_tagger.train(train_data)
    return tnt_pos_tagger
listeval = []
intervals = []
for iter in range(10):
    test = bloques[iter]
    train = []
    for element in bloques:
        if element != test:
            for item in element:
                train.append(item)

    # Affix tagger
    suffix_tagger = nltk.tag.AffixTagger(train=train, affix_length=-2)

    # Train the tagger
    tagger_tnt = tnt.TnT(N=100, unk=suffix_tagger, Trained=True)
    tagger_tnt.train(train)

    # Evaluate the tagger
    v = tagger_tnt.evaluate(test)
    d = 1.96 * math.sqrt((v * (1 - v)) / len(np.array(test).flatten()) / 2)
    ic = [round(v - d, 3), round(v + d, 3)]
    listeval.append(round(v, 3))
    intervals.append(ic)

v = 0
for val in listeval:
    v += val
va = v / 10
from nltk.corpus import treebank

train_data = treebank.tagged_sents()[:3]
test_data = treebank.tagged_sents()[300:400]
print test_data[0]

from nltk.tag import tnt
tnt_post_tagger = tnt.TnT()
tnt_post_tagger.train(train_data)
print tnt_post_tagger.evaluate(test_data)
# see http://textminingonline.com/dive-into-nltk-part-iii-part-of-speech-tagging-and-pos-tagger
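
# A hedged usage sketch, not in the original snippet: tagging a fresh sentence
# with the trained model. The sample sentence is illustrative only.
import nltk
print tnt_post_tagger.tag(nltk.word_tokenize("This is a simple test sentence."))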
def demo3():
    from nltk import tag
    from nltk.corpus import treebank, brown
    from nltk.tag import tnt

    d = list(treebank.tagged_sents())
    e = list(brown.tagged_sents())

    d = d[:1000]
    e = e[:1000]

    d10 = int(len(d) * 0.1)
    e10 = int(len(e) * 0.1)

    tknacc = 0
    sknacc = 0
    tallacc = 0
    sallacc = 0
    tknown = 0
    sknown = 0

    for i in range(10):
        t = tnt.TnT(N=1000, C=False)
        s = tnt.TnT(N=1000, C=False)

        dtest = d[(i * d10):((i + 1) * d10)]
        etest = e[(i * e10):((i + 1) * e10)]

        dtrain = d[:(i * d10)] + d[((i + 1) * d10):]
        etrain = e[:(i * e10)] + e[((i + 1) * e10):]

        t.train(dtrain)
        s.train(etrain)

        tacc = tag.accuracy(t, dtest)
        tp_un = float(t.unknown) / float(t.known + t.unknown)
        tp_kn = float(t.known) / float(t.known + t.unknown)
        tknown += tp_kn
        t.unknown = 0
        t.known = 0

        sacc = tag.accuracy(s, etest)
        sp_un = float(s.unknown) / float(s.known + s.unknown)
        sp_kn = float(s.known) / float(s.known + s.unknown)
        sknown += sp_kn
        s.unknown = 0
        s.known = 0

        tknacc += (tacc / tp_kn)
        sknacc += (sacc / sp_kn)
        tallacc += tacc
        sallacc += sacc

        #print i+1, (tacc / tp_kn), i+1, (sacc / sp_kn), i+1, tacc, i+1, sacc

    # t was trained on treebank (d), s on brown (e)
    print "treebank: acc over words known:", 10 * tknacc
    print "        : overall accuracy:", 10 * tallacc
    print "        : words known:", 10 * tknown
    print "brown: acc over words known:", 10 * sknacc
    print "     : overall accuracy:", 10 * sallacc
    print "     : words known:", 10 * sknown
def __init__(self, lang='en'):
    self.lang = lang
    self.stopwords = None
    self.stemmer = None
    self.sentiment_analyzer = None
    self.text_processor = None

    INDIC_NLP_RESOURCES = r"../model/indic_nlp_resources/"
    common.set_resources_path(INDIC_NLP_RESOURCES)
    self.pos_tagger = None

    if lang == 'hi':
        self.ht = HindiTokenizer.Tokenizer()
        self.sentiment_analyzer = load_learner(path="../model/hi-sentiment")
        self.stopwords = [x.strip() for x in open("../data/stopwords.txt").readlines()]
        other_exclusions = ["#ff", "ff", "rt"]
        self.stopwords.extend(other_exclusions)
        self.stemmer = None
        self.text_processor = TextPreProcessor(
            # terms that will be normalized
            normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                       'time', 'url', 'date', 'number'],
            # terms that will be annotated
            annotate={"hashtag", "allcaps", "elongated", "repeated",
                      'emphasis', 'censored'},
            fix_html=True,  # fix HTML tokens
        )
        loader.load()
        train_data = indian.tagged_sents('hindi.pos')
        self.tnt_pos_tagger = tnt.TnT()
        self.tnt_pos_tagger.train(train_data)

    if lang == 'en':
        self.sentiment_analyzer = VS()
        self.stopwords = nltk.corpus.stopwords.words("english")
        other_exclusions = ["#ff", "ff", "rt"]
        self.stopwords.extend(other_exclusions)
        self.stemmer = PorterStemmer()
        self.text_processor = TextPreProcessor(
            # terms that will be normalized
            normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                       'time', 'url', 'date', 'number'],
            # terms that will be annotated
            annotate={"hashtag", "allcaps", "elongated", "repeated",
                      'emphasis', 'censored'},
            fix_html=True,  # fix HTML tokens

            # corpus from which the word statistics are going to be used
            # for word segmentation
            segmenter="twitter",

            # corpus from which the word statistics are going to be used
            # for spell correction
            corrector="twitter",

            unpack_hashtags=True,  # perform word segmentation on hashtags
            unpack_contractions=True,  # Unpack contractions (can't -> can not)
            spell_correct_elong=False,  # spell correction for elongated words

            # select a tokenizer. You can use SocialTokenizer, or pass your own
            # the tokenizer, should take as input a string and return a list of tokens
            tokenizer=SocialTokenizer(lowercase=True).tokenize,

            # list of dictionaries, for replacing tokens extracted from the text,
            # with other expressions. You can pass more than one dictionaries.
            dicts=[emoticons, slang]
        )