Example #1
File: test_senna.py  Project: DrDub/nltk
 def test_senna_tagger(self):
     tagger = SennaTagger(SENNA_EXECUTABLE_PATH)
     result = tagger.tag('What is the airspeed of an unladen swallow ?'.split())
     expected = [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'),
                 ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'NN'),
                 ('?', '.')]
     self.assertEqual(result, expected)
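The test above assumes SENNA_EXECUTABLE_PATH points at a local SENNA installation.
The same call works outside a unittest; a minimal standalone sketch (the path is
an assumption, adjust it to your install):

from nltk.tag import SennaTagger

SENNA_EXECUTABLE_PATH = '/usr/share/senna-v2.0'  # assumed install location
tagger = SennaTagger(SENNA_EXECUTABLE_PATH)
print(tagger.tag('What is the airspeed of an unladen swallow ?'.split()))
# -> [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ...]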
Example #2
import os
import sys
from nltk.tag import SennaTagger

# MODULEDIR is assumed to be defined elsewhere in the original module; it is
# only used below as a fallback location for the SENNA binaries.


class Senna:
    '''Thin wrapper around nltk.tag.SennaTagger that resolves the path to a
    local SENNA installation before constructing the tagger.'''

    def __init__(self, path='senna', **kwargs):
        self.__dict__.update(kwargs)
        
        if not os.path.isabs(path):
            current_dir = os.path.dirname(os.path.abspath(__file__))
            path = os.path.join(current_dir, path)
        
        paths = (
                path,
                os.path.join(sys.exec_prefix, r'lib\site-packages', 'senna'),
                os.path.join(MODULEDIR, 'bin', 'senna')
        )
        
        for path in paths:
            if os.path.exists(path):
                break
        else:
            raise FileNotFoundError(paths)

        self.tagger = SennaTagger(path, **kwargs)

    def __call__(self, tokens):
        return self.tagger.tag(tokens)
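A minimal usage sketch for the wrapper above (the installation path and the
sample sentence are assumptions, not taken from the original example):

pos_tag = Senna(path='/usr/share/senna-v2.0')  # assumed install location
print(pos_tag('What is the airspeed of an unladen swallow ?'.split()))
# -> [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ...]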
Example #3
 def test_senna_tagger(self):
     tagger = SennaTagger(SENNA_EXECUTABLE_PATH)
     result = tagger.tag(
         'What is the airspeed of an unladen swallow ?'.split())
     expected = [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'),
                 ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'),
                 ('unladen', 'NN'), ('swallow', 'NN'), ('?', '.')]
     self.assertEqual(result, expected)
Example #4
 def test_senna_tagger(self):
     tagger = SennaTagger(SENNA_EXECUTABLE_PATH)
     result = tagger.tag("What is the airspeed of an unladen swallow ?".split())
     expected = [
         ("What", "WP"),
         ("is", "VBZ"),
         ("the", "DT"),
         ("airspeed", "NN"),
         ("of", "IN"),
         ("an", "DT"),
         ("unladen", "NN"),
         ("swallow", "NN"),
         ("?", "."),
     ]
     self.assertEqual(result, expected)
Example #5
# Imports assumed by this example; global_params, prompt_words, stopwords and the
# PorterStemmer with stem_tokens() appear to be project-local helpers not shown here.
from collections import defaultdict
from nltk.tag import SennaTagger, SennaChunkTagger


class CRF_Extractor:
    '''
    extract features for the CRF model
    each line is a feature vector for a token
    '''
    def __init__(self):
        '''
        if phrase_dict_json != None: extract the phrase features
        if subtype_flag = True, extract the features by sub parse_type
        if bioe_flag = True, use the BIOE tags
        '''
        self.features = [
            'pos', 'chunk', 'promptword', 'stopword', 'tf', 'rank', 'color'
        ]

        if 'pos' in self.features:
            self.pos_tagger = SennaTagger(global_params.sennadir)

        if 'chunk' in self.features:
            self.chunk_tagger = SennaChunkTagger(global_params.sennadir)

        self.sentences = []

        self.porter = PorterStemmer()

        self.token_dict = None
        self.rank_dict = None
        self.bins = 50

    def add_sentence(self, sentence):
        self.sentences.append(sentence)

    def get_token_tf(self):
        self.token_dict = defaultdict(float)

        for tokens, _, _ in self.sentences:
            for token in self.porter.stem_tokens(tokens):
                self.token_dict[token] += 1.0

        rank_tokens = sorted(self.token_dict,
                             key=self.token_dict.get,
                             reverse=True)

        self.rank_dict = defaultdict(int)
        for i, token in enumerate(rank_tokens):
            self.rank_dict[token] = int(i * 10 / len(rank_tokens))

        # normalized by the number of sentences
        for t, v in self.token_dict.items():
            x = v / len(self.sentences)
            if x > 1.0:
                x = 1.0

            self.token_dict[t] = x

    def get_feature_names(self):
        return '_'.join(self.features)

    def get_i_j(self, body, i, j):
        '''
        return the value of the crf template feature u[i, j]
        input:
            body: [][], two-dimensional array, representing the crf features for a sentence
            i: int, the row index
            j: int, the column index
        '''
        n = len(body)
        if i < 0:
            v = '_x%d' % (i)
        elif i >= n:
            v = '_x+%d' % (i - n + 1)
        else:
            v = body[i][j]
        return v

    def extract_U_i_j(self, data_body, feature_body, i, j, tag):
        '''
        extract the U[i, j] feature, and add it to the end of each row
        input:
            data_body: [][], two-dimensional array, representing the crf data for a sentence
            feature_body: [][], two-dimensional array, the resulting feature data for a sentence
            i: int, the row offset relative to the current token
            j: int, the column index
            tag: the prefix of the feature name
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s' % (tag, self.get_i_j(data_body, k + i, j)))

    def extract_U_i_j_m_n(self, data_body, feature_body, i, j, m, n, tag):
        '''
        extract the U[i, j]/U[m, n] feature, and add it to the end of each row
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s/%s' % (tag, self.get_i_j(
                data_body, k + i, j), self.get_i_j(data_body, k + m, n)))

    def extract_U_i_j_m_n_x_y(self, data_body, feature_body, i, j, m, n, x, y,
                              tag):
        '''
        extract the U[i, j]/U[m, n]/U[x,y] feature, and add it to the end of each row
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s/%s/%s' % (tag, self.get_i_j(
                data_body, k + i, j), self.get_i_j(
                    data_body, k + m, n), self.get_i_j(data_body, k + x, y)))

    def extract_word_U_i_j(self, data_body, index, feature_body, i, j, tag):
        '''
        extract the U[i, j] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s' % (tag, self.get_i_j(data_body, k + i, j)))

    def extract_word_U_i_j_m_n(self, data_body, index, feature_body, i, j, m,
                               n, tag):
        '''
        extract the U[i, j]/U[m, n] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s/%s' % (tag, self.get_i_j(
                data_body, k + i, j), self.get_i_j(data_body, k + m, n)))

    def extract_word_U_i_j_m_n_x_y(self, data_body, index, feature_body, i, j,
                                   m, n, x, y, tag):
        '''
        extract the U[i, j]/U[m, n]/U[x, y] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s/%s/%s' % (tag, self.get_i_j(
                data_body, k + i, j), self.get_i_j(
                    data_body, k + m, n), self.get_i_j(data_body, k + x, y)))

    def extract_bigram(self, body):
        '''
        extract the bigram feature for the crf template
        '''
        for row in body:
            row.append('b')

    def extract_crf_features(self, tokens, tags, prompt, colors=None):
        '''
        Extract the CRF features, one line per token
        return: [][], two-dimensional array, representing the feature data of the sentence
        '''

        body = []

        words = tokens
        N = len(tokens)

        # first column: the word token
        for word in words:
            row = []
            row.append(word)
            body.append(row)

        if 'pos' in self.features:
            pos_tags = self.pos_tagger.tag(tokens)

            for i, (_, p_tag) in enumerate(pos_tags):
                body[i].append(p_tag)

        if 'chunk' in self.features:
            chunk_tags = self.chunk_tagger.tag(tokens)

            for i, (_, c_tag) in enumerate(chunk_tags):
                body[i].append(c_tag)

        if 'promptword' in self.features:
            for i, token in enumerate(tokens):
                if token in prompt_words[prompt]:
                    body[i].append('Y')
                else:
                    body[i].append('N')

        if 'stopword' in self.features:
            for i, token in enumerate(tokens):
                if token in stopwords:
                    body[i].append('Y')
                else:
                    body[i].append('N')

        if 'tf' in self.features:
            if self.token_dict is None:
                self.get_token_tf()

            for i, token in enumerate(self.porter.stem_tokens(tokens)):
                assert (token in self.token_dict)

                x = int(self.token_dict[token] * self.bins)
                body[i].append(str(x))

        if 'rank' in self.features:
            if self.rank_dict is None:
                self.get_token_tf()

            for i, token in enumerate(self.porter.stem_tokens(tokens)):
                assert (token in self.rank_dict)

                x = self.rank_dict[token]
                body[i].append(str(x))

        if 'color' in self.features and colors is not None:
            for color in colors:
                for i, tag in enumerate(tags):
                    body[i].append(str(color[i]))

        # last column: the tag
        tags = list(tags)

        for i, tag in enumerate(tags):
            body[i].append(tag)

        return body
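To make the template expansion above concrete, here is a small standalone
sketch (not part of the original class) of what get_i_j and extract_U_i_j
produce for a toy feature table; the tokens and tags are made up:

toy_body = [['What', 'WP'], ['is', 'VBZ'], ['airspeed', 'NN']]

def get_i_j(body, i, j):
    # same lookup rule as CRF_Extractor.get_i_j above
    n = len(body)
    if i < 0:
        return '_x%d' % i
    if i >= n:
        return '_x+%d' % (i - n + 1)
    return body[i][j]

feature_rows = [[] for _ in toy_body]
for k, row in enumerate(feature_rows):
    row.append('U00:%s' % get_i_j(toy_body, k - 1, 0))  # previous word
    row.append('U01:%s' % get_i_j(toy_body, k, 1))      # current POS tag

print(feature_rows)
# [['U00:_x-1', 'U01:WP'], ['U00:What', 'U01:VBZ'], ['U00:is', 'U01:NN']]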
Example #6
import sys
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import SennaTagger

tagger = SennaTagger('/usr/share/senna-v2.0')
text = open(sys.argv[1], 'r').read()
sentences = sent_tokenize(text)

# tag only the first sentence of the file
for part in sentences[:1]:
    for word in tagger.tag(part.split()):
        print(word)
Example #7
# (Reconstructed preamble -- an assumption; the top of this example is missing.
#  It gathers the input file names given on the command line.)
import os
import sys
import nltk
from nltk.tag import SennaTagger

tagger = SennaTagger('/usr/share/senna-v2.0')

flist = []
linked_file = ""
cnt = len(sys.argv) - 1

for ifile in sys.argv[1:]:
    if os.path.isfile(ifile):
        #list.append
        flist.append(ifile)
    else:
        print("error : file does not exist")
        cnt -= 1

#link file
for i in flist:
    #open file
    f = open(i, "r")

    #load and link file
    linked_file += f.read()

    #close file
    f.close()

#split file
splited_file = nltk.sent_tokenize(linked_file)

#tagging 1 sentence
for x in tagger.tag(splited_file[0].split()):
    print(x)
"""
#tagging all sentence

#tagging and output
for i in splited_file:
    [print(x) for x in tagger.tag(i.split())]
"""
Example #8
# Imports assumed by this example; file_util and Metric are project-local helpers
# that are not shown here.
import nltk
from nltk.tag import SennaTagger
from nltk.classify import SklearnClassifier
from sklearn.svm import SVC


class QualityPrediction:
    def __init__(self, config):
        '''
        learning_algorithm -- the algorithm to train with
            (default "SVM")
        '''
    
        self.training_file = config.get('model', 'train')
        self.learning_algorithm = config.get('model', 'classify')
        self.features = config.get('model', 'features').split(',')
        #print self.features
        
        self.course =  config.get('model', 'course')
        self.test_file = '../data/' + self.course + '.json'
        
        self._model = None
        
        if 'pos' in self.features:
            self.tagger = SennaTagger(config.get('model', 'senna'))
        
        if 'content' in self.features:
            self.contentwords = [line.strip().lower() for line in open(config.get('model', 'content')).readlines()]
        
        if 'organization' in self.features:
            self.orgnizationwords = [line.strip().lower() for line in open(config.get('model', "organization")).readlines()]
        
        featuresets = self._get_training_data()
        self._train_classifier_model(featuresets)
        
    
    def evaluate(self):
        test_featureset = self._get_featuresets(self.test_file)
        
        labels = [int(x[1]) for x in test_featureset]
        featureset = [x[0] for x in test_featureset]
        predicts = [int(x) for x in self._model.classify_many(featureset)]
        
        metric = Metric()
        
        return metric.accuracy(labels, predicts), metric.kappa(labels, predicts), metric.QWkappa(labels, predicts)
    
    def get_features(self, text, cid, lecture):
        features = {}
        
        #unigram
        tokens = nltk.word_tokenize(text)
        
        if 'WC' in self.features:
            features['WC'] = len(tokens)
       
        if 'unigram' in self.features:
            for token in tokens:
                features['U0_'+token.lower()] = 1
        
        if 'pos' in self.features:
            tags = self.tagger.tag(tokens)
            for _, tag in tags:
                features['P0_'+tag] = 1
        
        if 'content' in self.features:
            hasContentWord = 0
            for word in tokens:
                if word.lower() in self.contentwords:
                    hasContentWord = 1
                    break
            features['C0_'] = hasContentWord
        
        if 'organization' in self.features:
            OrgAssign = 0
            for word in tokens:
                if word.lower() in self.orgnizationwords:
                    OrgAssign = 1
                    break
            features['O0_'] = OrgAssign
                    
        return features
        
    def get_model(self):
        """An accessor method for the model."""
        return self._model
    
    def _get_featuresets(self, input):
        featuresets = []
        
        MPLectures = file_util.LoadDictJson(input)
        
        for week, MPs in enumerate(MPLectures):
            if MPs == []: continue
            
            for k, (MP, score) in enumerate(MPs):
                features = self.get_features(MP, week, 'Engineer')
                featuresets.append((features,score))
        
        return featuresets
        
    def _get_training_data(self):
        """Builds and returns positive and negative feature sets
        for the algorithm

        """
        featuresets = self._get_featuresets(self.training_file)
        return featuresets
    
    def _train_classifier_model(self, featuresets):
        """This changes the algorithm that nltk uses to train the model.

        Arguments:
        featuresets -- array of features generated for training

        """
        model = None
        if(self.learning_algorithm == "NB"):
            model = nltk.NaiveBayesClassifier.train(featuresets)
        elif(self.learning_algorithm == "MaxEnt"):
            model = nltk.MaxentClassifier.train(featuresets, "MEGAM",
                                                 max_iter=15)
        elif(self.learning_algorithm == "DecisionTree"):
            model = nltk.DecisionTreeClassifier.train(featuresets, 0.05)
        elif(self.learning_algorithm == 'SVM'):
            model = SklearnClassifier(SVC(kernel='linear')).train(featuresets)
        self._model = model
        
    def predict(self, text, cid=None, lecture=None):
        features = self.get_features(text, cid, lecture)
        return self._model.classify(features)
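A minimal sketch of the SVM branch used in _train_classifier_model above, on
toy feature dictionaries (the features, labels and test input are made up):

from nltk.classify import SklearnClassifier
from sklearn.svm import SVC

toy_featuresets = [
    ({'WC': 12, 'C0_': 1, 'O0_': 0}, '3'),
    ({'WC': 3,  'C0_': 0, 'O0_': 0}, '1'),
    ({'WC': 9,  'C0_': 1, 'O0_': 1}, '3'),
    ({'WC': 2,  'C0_': 0, 'O0_': 1}, '1'),
]
model = SklearnClassifier(SVC(kernel='linear')).train(toy_featuresets)
print(model.classify({'WC': 10, 'C0_': 1, 'O0_': 0}))  # likely '3' on this toy data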
Example #9
import sys
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import SennaTagger

argv = sys.argv

sent_tokenized = sent_tokenize(open(argv[1]).read())
word_tokenized = word_tokenize(sent_tokenized[0])

tagger = SennaTagger('/usr/share/senna-v2.0')

for word, tag in tagger.tag(word_tokenized):
    print(tag, "\t", word)
Example #10
#!/usr/bin/env python
#-*- coding: utf-8 -*-

import sys
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import SennaTagger

argvs = sys.argv
argc = len(argvs)

# Show a usage message and quit if the arguments are wrong
if argc != 2:
    print('Usage: # python %s filename' % argvs[0])
    quit()

# Prepare the tagger
tagger = SennaTagger('/usr/share/senna-v2.0')

# Sentence segmentation
openedFile = open(argvs[1]).read()
sent_tokenize_list = sent_tokenize(openedFile)

# Word-tokenize the first sentence
word_tokenize_list = word_tokenize(sent_tokenize_list[0])

# Tagging
for w, t in tagger.tag(word_tokenize_list):
    print(w, t)
Example #11
import sys
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import SennaTagger

fr = open(sys.argv[1]).read()
sent = sent_tokenize(fr)
aword = word_tokenize(sent[0])

tagger = SennaTagger('/usr/share/senna-v2.0')

for w, t in tagger.tag(aword):
    print(w, t)
Example #12
class CRF_Extractor:
    '''
    extract features for the CRF model
    each line is a feature vector for a token
    '''
    def __init__(self):
        '''
        if phrase_dict_json != None: extract the phrase features
        if subtype_flag = True, extract the features by sub parse_type
        if bioe_flag = True, use the BIOE tags
        '''
        self.features = ['pos', 'chunk', 'promptword', 'stopword', 'tf', 'rank', 'color']
        
        if 'pos' in self.features:
            self.pos_tagger = SennaTagger(global_params.sennadir)
        
        if 'chunk' in self.features:
            self.chunk_tagger = SennaChunkTagger(global_params.sennadir)
        
        self.sentences = []
        
        self.porter = PorterStemmer()
        
        self.token_dict = None
        self.rank_dict = None
        self.bins = 50
    
    def add_sentence(self, sentence):
        self.sentences.append(sentence)
    
    def get_token_tf(self):
        self.token_dict = defaultdict(float)
        
        for tokens, _, _ in self.sentences:
            for token in self.porter.stem_tokens(tokens):
                self.token_dict[token] += 1.0
        
        rank_tokens = sorted(self.token_dict, key=self.token_dict.get, reverse=True)

        self.rank_dict = defaultdict(int)
        for i, token in enumerate(rank_tokens):
            self.rank_dict[token] = int(i*10/len(rank_tokens))
        
        for t, v in self.token_dict.items(): #normalized by the number of sentences
            x = v/len(self.sentences)
            if x > 1.0: x = 1.0
            
            self.token_dict[t] = x
        
    def get_feature_names(self):
        return '_'.join(self.features)
    
    def get_i_j(self, body, i, j):
        '''
        return the value of the crf template feature u[i, j]
        input:
            body: [][], two-dimensional array, representing the crf features for a sentence
            i: int, the row index
            j: int, the column index
        '''
        n = len(body)
        if i < 0:
            v = '_x%d'%(i)
        elif i >= n:
            v = '_x+%d'%(i-n+1)
        else:
            v = body[i][j]
        return v
    
    def extract_U_i_j(self, data_body, feature_body, i, j, tag):
        '''
        extract the U[i, j] feature, and add it to the end of each row
        input:
            data_body: [][], two-dimensional array, representing the crf data for a sentence
            feature_body: [][], two-dimensional array, the resulting feature data for a sentence
            i: int, the row offset relative to the current token
            j: int, the column index
            tag: the prefix of the feature name
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s'%(tag, self.get_i_j(data_body, k+i, j)))
    
    def extract_U_i_j_m_n(self, data_body, feature_body, i, j, m, n, tag):
        '''
        extract the U[i, j]/U[m, n] feature, and add it to the end of each row
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s/%s'%(tag, self.get_i_j(data_body, k+i, j), self.get_i_j(data_body, k+m, n)))
    
    def extract_U_i_j_m_n_x_y(self, data_body, feature_body, i, j, m, n, x, y, tag):
        '''
        extract the U[i, j]/U[m, n]/U[x,y] feature, and add it to the end of each row
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s/%s/%s'%(tag, self.get_i_j(data_body, k+i, j), self.get_i_j(data_body, k+m, n), self.get_i_j(data_body, k+x, y)))
    
    def extract_word_U_i_j(self, data_body, index, feature_body, i, j, tag):
        '''
        extract the U[i, j] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s'%(tag, self.get_i_j(data_body, k+i, j)))
    
    def extract_word_U_i_j_m_n(self, data_body, index, feature_body, i, j, m, n, tag):
        '''
        extract the U[i, j]/U[m, n] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s/%s'%(tag, self.get_i_j(data_body, k+i, j), self.get_i_j(data_body, k+m, n)))
    
    def extract_word_U_i_j_m_n_x_y(self, data_body, index, feature_body, i, j, m, n, x, y, tag):
        '''
        extract the U[i, j]/U[m, n]/U[x, y] feature, and add it to the end of each row
        '''
        for k, row in zip(index, feature_body):
            row.append('%s:%s/%s/%s'%(tag, self.get_i_j(data_body, k+i, j), self.get_i_j(data_body, k+m, n), self.get_i_j(data_body, k+x, y)))
            
    def extract_bigram(self, body):
        '''
        extract the bigram feature for the crf template
        '''
        for row in body:
            row.append('b')
    
    def extract_crf_features(self, tokens, tags, prompt, colors=None):
        '''
        Extract the CRF features, one line per token
        return: [][], two-dimensional array, representing the feature data of the sentence
        '''
    
        body = []

        words = tokens
        N = len(tokens)
        
        # first column: the word token
        for word in words:
            row = []
            row.append(word)
            body.append(row)
        
        if 'pos' in self.features:
            pos_tags = self.pos_tagger.tag(tokens)
            
            for i, (_, p_tag) in enumerate(pos_tags):
                body[i].append(p_tag)
        
        if 'chunk' in self.features:
            chunk_tags = self.chunk_tagger.tag(tokens)
            
            for i, (_, c_tag) in enumerate(chunk_tags):
                body[i].append(c_tag)
        
        if 'promptword' in self.features:
            for i, token in enumerate(tokens):
                if token in prompt_words[prompt]:
                    body[i].append('Y')
                else:
                    body[i].append('N')
        
        if 'stopword' in self.features:
            for i, token in enumerate(tokens):
                if token in stopwords:
                    body[i].append('Y')
                else:
                    body[i].append('N')
        
        if 'tf' in self.features:
            if self.token_dict is None:
                self.get_token_tf()
            
            for i, token in enumerate(self.porter.stem_tokens(tokens)):
                assert(token in self.token_dict)
                
                x = int(self.token_dict[token]*self.bins)
                body[i].append(str(x))
        
        if 'rank' in self.features:
            if self.rank_dict is None:
                self.get_token_tf()
            
            for i, token in enumerate(self.porter.stem_tokens(tokens)):
                assert(token in self.rank_dict)
                
                x = self.rank_dict[token]
                body[i].append(str(x))        
        
        if 'color' in self.features and colors is not None:
            for color in colors:
                for i, tag in enumerate(tags):
                    body[i].append(str(color[i]))
        
        # last column: the tag
        tags = list(tags)
        
        for i, tag in enumerate(tags):
            body[i].append(tag)
        
        return body
Example #13
import sys
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import SennaTagger

f = open(sys.argv[1], 'r')
lines = sent_tokenize(f.read())
tagger = SennaTagger('/usr/share/senna-v2.0')
words = word_tokenize(lines[1])
print(tagger.tag(words))