def test_senna_tagger(self):
    tagger = SennaTagger(SENNA_EXECUTABLE_PATH)
    result = tagger.tag('What is the airspeed of an unladen swallow ?'.split())
    expected = [
        ('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'),
        ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'NN'),
        ('?', '.'),
    ]
    self.assertEqual(result, expected)
class Senna:
    '''Locate a SENNA installation and wrap it in a SennaTagger.'''

    def __init__(self, path='senna', **kwargs):
        self.__dict__.update(kwargs)
        if not os.path.isabs(path):
            current_dir = os.path.dirname(os.path.abspath(__file__))
            path = os.path.join(current_dir, path)
        # candidate locations: the given path, the Windows site-packages
        # layout, and the module's bundled binary directory
        paths = (
            path,
            os.path.join(sys.exec_prefix, r'lib\site-packages', 'senna'),
            os.path.join(MODULEDIR, 'bin', 'senna'),
        )
        for path in paths:
            if os.path.exists(path):
                break
        else:
            raise FileNotFoundError(paths)
        self.tagger = SennaTagger(path, **kwargs)

    def __call__(self, tokens):
        return self.tagger.tag(tokens)
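A minimal usage sketch for the wrapper above; the install path is an assumption (it matches the path used by the other examples on this page), and the sample sentence follows the test case above:

# minimal usage sketch; the path is an assumed local SENNA install
senna = Senna(path='/usr/share/senna-v2.0')
print(senna('What is the airspeed of an unladen swallow ?'.split()))
# -> a list of (token, tag) pairs, e.g. [('What', 'WP'), ('is', 'VBZ'), ...]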
class CRF_Extractor:
    '''
    Extract features for the CRF model; each output line is a feature
    vector for one token.
    '''

    def __init__(self):
        '''
        if phrase_dict_json != None: extract the phrase features
        if subtype_flag = True, extract the features by sub parse_type
        if bioe_flag = True, use the BIOE tags
        '''
        self.features = [
            'pos', 'chunk', 'promptword', 'stopword', 'tf', 'rank', 'color'
        ]

        if 'pos' in self.features:
            self.pos_tagger = SennaTagger(global_params.sennadir)
        if 'chunk' in self.features:
            self.chunk_tagger = SennaChunkTagger(global_params.sennadir)

        self.sentences = []
        self.porter = PorterStemmer()
        # both dictionaries are filled lazily by get_token_tf()
        self.token_dict = None
        self.rank_dict = None
        self.bins = 50

    def add_sentence(self, sentence):
        self.sentences.append(sentence)

    def get_token_tf(self):
        self.token_dict = defaultdict(float)
        for tokens, _, _ in self.sentences:
            for token in self.porter.stem_tokens(tokens):
                self.token_dict[token] += 1.0

        rank_tokens = sorted(self.token_dict, key=self.token_dict.get,
                             reverse=True)
        self.rank_dict = defaultdict(int)
        for i, token in enumerate(rank_tokens):
            self.rank_dict[token] = int(i * 10 / len(rank_tokens))

        # normalize term frequency by the number of sentences, capped at 1.0
        for t, v in self.token_dict.items():
            self.token_dict[t] = min(v / len(self.sentences), 1.0)

    def get_feature_names(self):
        return '_'.join(self.features)

    def get_i_j(self, body, i, j):
        '''
        return the value of the crf template feature u[i, j]
        input:
            body: [][], two-dimensional array of crf features for a sentence
            i: int, the row index (may fall outside the sentence)
            j: int, the column index
        '''
        n = len(body)
        if i < 0:
            v = '_x%d' % (i)
        elif i >= n:
            v = '_x+%d' % (i - n + 1)
        else:
            v = body[i][j]
        return v

    def extract_U_i_j(self, data_body, feature_body, i, j, tag):
        '''
        extract the U[i, j] feature and append it to each row
        input:
            data_body: [][], two-dimensional array of crf data for a sentence
            feature_body: [][], two-dimensional array of resulting feature data
            i: int, the row offset
            j: int, the column index
            tag: the prefix of the feature name
        '''
        for k, row in enumerate(feature_body):
            row.append('%s:%s' % (tag, self.get_i_j(data_body, k + i, j)))

    def extract_U_i_j_m_n(self, data_body, feature_body, i, j, m, n, tag):
        '''extract the U[i, j]/U[m, n] feature and append it to each row'''
        for k, row in enumerate(feature_body):
            row.append('%s:%s/%s' % (tag,
                                     self.get_i_j(data_body, k + i, j),
                                     self.get_i_j(data_body, k + m, n)))

    def extract_U_i_j_m_n_x_y(self, data_body, feature_body, i, j, m, n,
                              x, y, tag):
        '''extract the U[i, j]/U[m, n]/U[x, y] feature and append it to each row'''
        for k, row in enumerate(feature_body):
            row.append('%s:%s/%s/%s' % (tag,
                                        self.get_i_j(data_body, k + i, j),
                                        self.get_i_j(data_body, k + m, n),
                                        self.get_i_j(data_body, k + x, y)))

    def extract_word_U_i_j(self, data_body, index, feature_body, i, j, tag):
        '''extract the U[i, j] feature and append it to each row'''
        for k, row in zip(index, feature_body):
            row.append('%s:%s' % (tag, self.get_i_j(data_body, k + i, j)))

    def extract_word_U_i_j_m_n(self, data_body, index, feature_body, i, j,
                               m, n, tag):
        '''extract the U[i, j]/U[m, n] feature and append it to each row'''
        for k, row in zip(index, feature_body):
            row.append('%s:%s/%s' % (tag,
                                     self.get_i_j(data_body, k + i, j),
                                     self.get_i_j(data_body, k + m, n)))

    def extract_word_U_i_j_m_n_x_y(self, data_body, index, feature_body,
                                   i, j, m, n, x, y, tag):
        '''extract the U[i, j]/U[m, n]/U[x, y] feature and append it to each row'''
        for k, row in zip(index, feature_body):
            row.append('%s:%s/%s/%s' % (tag,
                                        self.get_i_j(data_body, k + i, j),
                                        self.get_i_j(data_body, k + m, n),
                                        self.get_i_j(data_body, k + x, y)))

    def extract_bigram(self, body):
        '''extract the bigram feature for the crf template'''
        for row in body:
            row.append('b')

    def extract_crf_features(self, tokens, tags, prompt, colors=None):
        '''
        Extract the CRF features, one token per line
        return: [][], two-dimensional array, the feature data of the sentence
        '''
        body = []

        # first column: the word token
        for word in tokens:
            body.append([word])

        if 'pos' in self.features:
            pos_tags = self.pos_tagger.tag(tokens)
            for i, (_, p_tag) in enumerate(pos_tags):
                body[i].append(p_tag)

        if 'chunk' in self.features:
            chunk_tags = self.chunk_tagger.tag(tokens)
            for i, (_, c_tag) in enumerate(chunk_tags):
                body[i].append(c_tag)

        if 'promptword' in self.features:
            for i, token in enumerate(tokens):
                body[i].append('Y' if token in prompt_words[prompt] else 'N')

        if 'stopword' in self.features:
            for i, token in enumerate(tokens):
                body[i].append('Y' if token in stopwords else 'N')

        if 'tf' in self.features:
            if self.token_dict is None:
                self.get_token_tf()
            for i, token in enumerate(self.porter.stem_tokens(tokens)):
                assert token in self.token_dict
                body[i].append(str(int(self.token_dict[token] * self.bins)))

        if 'rank' in self.features:
            if self.rank_dict is None:
                self.get_token_tf()
            for i, token in enumerate(self.porter.stem_tokens(tokens)):
                assert token in self.rank_dict
                body[i].append(str(self.rank_dict[token]))

        if 'color' in self.features and colors is not None:
            for color in colors:
                for i, _ in enumerate(tags):
                    body[i].append(str(color[i]))

        # last column: the gold tag
        for i, tag in enumerate(tags):
            body[i].append(tag)

        return body
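A hypothetical driver for CRF_Extractor, assuming each sentence is a (tokens, tags, colors) triple as get_token_tf() unpacks it, and that 'prompt_one' is a valid key of prompt_words; `corpus` stands in for real data and is not part of the original code:

# hypothetical driver; `corpus` and 'prompt_one' are assumptions
extractor = CRF_Extractor()

# register every sentence first so the tf/rank statistics cover the corpus
for tokens, tags, colors in corpus:
    extractor.add_sentence((tokens, tags, colors))

# then emit one tab-separated feature line per token
for tokens, tags, colors in corpus:
    for row in extractor.extract_crf_features(tokens, tags, 'prompt_one', colors):
        print('\t'.join(row))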
import sys

from nltk.tokenize import sent_tokenize
from nltk.tag import SennaTagger

tagger = SennaTagger('/usr/share/senna-v2.0')

argv = sys.argv
text = open(argv[1], 'r').read()
sentences = sent_tokenize(text)

# tag only the first sentence
for pair in tagger.tag(sentences[0].split()):
    print(pair)
import os
import sys

import nltk
from nltk.tag import SennaTagger

# assumed setup (the original snippet begins mid-loop): take file names
# from the command line; the SENNA path matches the other examples here
tagger = SennaTagger('/usr/share/senna-v2.0')

flist = []
cnt = len(sys.argv) - 1
for ifile in sys.argv[1:]:
    if os.path.isfile(ifile):
        # collect the existing files
        flist.append(ifile)
    else:
        print("error : file does not exist")
        cnt -= 1

# link the files into one string
linked_file = ''
for i in flist:
    # open, read, and close each file
    with open(i, "r") as f:
        linked_file += f.read()

# split into sentences
splited_file = nltk.sent_tokenize(linked_file)

# tag the first sentence
[print(x) for x in tagger.tag(splited_file[0].split())]

"""
# tag every sentence and print the output
for i in splited_file:
    [print(x) for x in tagger.tag(i.split())]
"""
class QualityPrediction:
    def __init__(self, config):
        '''
        learning_algorithm -- the algorithm to train with (default "SVM")
        '''
        self.training_file = config.get('model', 'train')
        self.learning_algorithm = config.get('model', 'classify')
        self.features = config.get('model', 'features').split(',')

        self.course = config.get('model', 'course')
        self.test_file = '../data/' + self.course + '.json'

        self._model = None

        if 'pos' in self.features:
            self.tagger = SennaTagger(config.get('model', 'senna'))

        if 'content' in self.features:
            self.contentwords = [line.strip().lower()
                                 for line in open(config.get('model', 'content')).readlines()]

        if 'organization' in self.features:
            self.organizationwords = [line.strip().lower()
                                      for line in open(config.get('model', 'organization')).readlines()]

        featuresets = self._get_training_data()
        self._train_classifier_model(featuresets)

    def evaluate(self):
        test_featureset = self._get_featuresets(self.test_file)
        labels = [int(x[1]) for x in test_featureset]
        featureset = [x[0] for x in test_featureset]
        predicts = [int(x) for x in self._model.classify_many(featureset)]

        metric = Metric()
        return (metric.accuracy(labels, predicts),
                metric.kappa(labels, predicts),
                metric.QWkappa(labels, predicts))

    def get_features(self, text, cid, lecture):
        features = {}

        # unigram
        tokens = nltk.word_tokenize(text)

        if 'WC' in self.features:
            features['WC'] = len(tokens)

        if 'unigram' in self.features:
            for token in tokens:
                features['U0_' + token.lower()] = 1

        if 'pos' in self.features:
            tags = self.tagger.tag(tokens)
            for _, tag in tags:
                features['P0_' + tag] = 1

        if 'content' in self.features:
            hasContentWord = 0
            for word in tokens:
                if word.lower() in self.contentwords:
                    hasContentWord = 1
                    break
            features['C0_'] = hasContentWord

        if 'organization' in self.features:
            OrgAssign = 0
            for word in tokens:
                if word.lower() in self.organizationwords:
                    OrgAssign = 1
                    break
            features['O0_'] = OrgAssign

        return features

    def get_model(self):
        """An accessor method for the model."""
        return self._model

    def _get_featuresets(self, input):
        featuresets = []
        MPLectures = file_util.LoadDictJson(input)

        for week, MPs in enumerate(MPLectures):
            if MPs == []:
                continue
            for k, (MP, score) in enumerate(MPs):
                features = self.get_features(MP, week, 'Engineer')
                featuresets.append((features, score))

        return featuresets

    def _get_training_data(self):
        """Builds and returns positive and negative feature sets
        for the algorithm."""
        featuresets = self._get_featuresets(self.training_file)
        return featuresets

    def _train_classifier_model(self, featuresets):
        """This changes the algorithm that nltk uses to train the model.

        Arguments:
        featuresets -- array of features generated for training
        """
        model = None
        if self.learning_algorithm == "NB":
            model = nltk.NaiveBayesClassifier.train(featuresets)
        elif self.learning_algorithm == "MaxEnt":
            model = nltk.MaxentClassifier.train(featuresets, "MEGAM", max_iter=15)
        elif self.learning_algorithm == "DecisionTree":
            model = nltk.DecisionTreeClassifier.train(featuresets, 0.05)
        elif self.learning_algorithm == 'SVM':
            model = SklearnClassifier(SVC(kernel='linear')).train(featuresets)
        self._model = model

    def predict(self, text, cid=None, lecture=None):
        features = self.get_features(text, cid, lecture)
        return self._model.classify(features)
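A sketch of how QualityPrediction might be driven, assuming an INI-style file whose [model] section supplies the train, classify, features, course, senna, content, and organization options read in __init__; the file name and the sample sentence are hypothetical:

# hypothetical driver; 'quality.cfg' and its contents are assumptions
from configparser import ConfigParser

config = ConfigParser()
config.read('quality.cfg')

qp = QualityPrediction(config)
accuracy, kappa, qwkappa = qp.evaluate()
print(accuracy, kappa, qwkappa)
print(qp.predict('The muddiest point was the backpropagation example.'))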
import sys

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import SennaTagger

argv = sys.argv
sent_tokenized = sent_tokenize(open(argv[1]).read())
word_tokenized = word_tokenize(sent_tokenized[0])

tagger = SennaTagger('/usr/share/senna-v2.0')
for a, b in tagger.tag(word_tokenized):
    print(b, "\t", a)  # tag <tab> word
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import SennaTagger

argvs = sys.argv
argc = len(argvs)

# show usage if the arguments are invalid
if argc != 2:
    print('Usage: # python %s filename' % argvs[0])
    quit()

# set up the tagger
tagger = SennaTagger('/usr/share/senna-v2.0')

# sentence segmentation
openedFile = open(argvs[1]).read()
sent_tokenize_list = sent_tokenize(openedFile)

# word-tokenize the first sentence
word_tokenize_list = word_tokenize(sent_tokenize_list[0])

# tagging
for w, t in tagger.tag(word_tokenize_list):
    print(w, t)
import sys

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import SennaTagger

fr = open(sys.argv[1]).read()
sent = sent_tokenize(fr)
aword = word_tokenize(sent[0])

tagger = SennaTagger('/usr/share/senna-v2.0')
for w, t in tagger.tag(aword):
    print(w, t)
import sys

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import SennaTagger

f = open(sys.argv[1], 'r')
lines = sent_tokenize(f.read())

tagger = SennaTagger('/usr/share/senna-v2.0')
words = word_tokenize(lines[1])
print(tagger.tag(words))