def __init__(self, text, product_name):
    self.candidate_features = []
    self.feature_sentences = []
    self.product_name = product_name.lower().split('-')[0].split('_')
    t = Tokenizer()
    sents = t.sent_tokenize(text.lower())
    p = POSTagger()
    wnl = WordNetLemmatizer()
    for sent in sents:
        tagged_sent = p.nltk_tag(t.word_tokenize(sent))
        feature_sent = {}
        feature_sent['sentence'] = sent
        feature_sent['tags'] = tagged_sent
        feature_sent['nouns'] = []
        feature_sent['noun_phrases'] = []
        for i in range(0, len(tagged_sent)):
            (word, tag) = tagged_sent[i]
            # Don't include proper nouns
            if tag.startswith('N') and tag != 'NNP':
                # Consecutive nouns may form a feature phrase, e.g. 'picture
                # quality'. Meaningless phrases like 'quality digital' are
                # removed later because their frequency of occurrence is low.
                if (i > 0 and len(feature_sent['nouns']) > 0
                        and tagged_sent[i - 1][0] == feature_sent['nouns'][-1]
                        and feature_sent['sentence'].find(feature_sent['nouns'][-1] + ' ' + word) > -1):
                    feature_sent['noun_phrases'].append(wnl.lemmatize(feature_sent['nouns'].pop() + ' ' + word))
                else:
                    feature_sent['nouns'].append(wnl.lemmatize(word))
        self.feature_sentences.append(feature_sent)
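A minimal, self-contained sketch of the consecutive-noun chunking above, assuming the project's `Tokenizer`/`POSTagger` simply wrap NLTK (the `nltk_tag` name suggests this, but the wrappers are not shown here):

```python
import nltk  # requires the 'punkt', 'averaged_perceptron_tagger' and 'wordnet' data
from nltk.stem import WordNetLemmatizer


def chunk_nouns(sentence):
    """Collect single nouns and two-word noun phrases, as in __init__ above."""
    wnl = WordNetLemmatizer()
    sentence = sentence.lower()
    tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    nouns, noun_phrases = [], []
    for i, (word, tag) in enumerate(tagged):
        if tag.startswith('N') and tag != 'NNP':
            # Merge with the previous noun when the two are adjacent in the text.
            if (i > 0 and nouns and tagged[i - 1][0] == nouns[-1]
                    and sentence.find(nouns[-1] + ' ' + word) > -1):
                noun_phrases.append(wnl.lemmatize(nouns.pop() + ' ' + word))
            else:
                nouns.append(wnl.lemmatize(word))
    return nouns, noun_phrases

# chunk_nouns('the picture quality of this camera is great')
# -> (['camera'], ['picture quality'])
```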
def init_feature_sentences(self, total_content):
    t = Tokenizer()
    p = POSTagger()
    wnl = WordNetLemmatizer()
    sentences = t.sent_tokenize(total_content.lower())
    for sentence in sentences:
        tagged_sentence = p.nltk_tag(t.word_tokenize(sentence))
        # Initializing the feature-sentence dictionary
        feature_sentence = {}
        feature_sentence['sentence'] = sentence
        feature_sentence['tags'] = tagged_sentence
        feature_sentence['nouns'] = []
        feature_sentence['noun_phrases'] = []
        # Finding the nouns/noun phrases in the tagged sentence
        for i in range(0, len(tagged_sentence)):
            (word, tag) = tagged_sentence[i]
            # Chunking
            if tag.startswith('N') and tag != 'NNP':
                if (i > 0 and len(feature_sentence['nouns']) > 0
                        and tagged_sentence[i - 1][0] == feature_sentence['nouns'][-1]
                        and feature_sentence['sentence'].find(feature_sentence['nouns'][-1] + ' ' + word) > -1):
                    feature_sentence['noun_phrases'].append(wnl.lemmatize(feature_sentence['nouns'].pop() + ' ' + word))
                else:
                    feature_sentence['nouns'].append(wnl.lemmatize(word))
        self.feature_sentences.append(feature_sentence)
def train_tagger():
    # Train on the Brown corpus
    tagger = POSTagger(simplify=True)
    tagger.train(support_cutoff=2)
    nums = tagger.word_features.unique_values()  # currently unused
    return tagger
import time
import unittest

# Lexer and POSTagger are provided by the surrounding project.


class TestPyPOS(unittest.TestCase):
    def setUp(self):
        '''Tests Setup'''
        self.lexer = Lexer()
        self.tagger = POSTagger()
        self.start = time.time()

    def stringTest(self, string):
        '''Common Testing Function'''
        self.words = self.lexer.lex(string)
        self.tags = self.tagger.tag(self.words)
        self.end = time.time()
        self.difference = self.end - self.start
        for tag in self.tags:
            print " / ".join(tag)

    def test_1_Short(self):
        '''Test Short String'''
        global shortTestString
        self.stringTest(shortTestString)

    def test_2_Long(self):
        '''Test Long String'''
        global testString
        self.stringTest(testString)

    def tearDown(self):
        print "Tokenized and tagged %s words in %s seconds" % (len(self.words), self.difference)
        print "Running time at test end was: %s seconds" % (time.time() - STARTTIME)
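A hypothetical driver for the test case above; `shortTestString`, `testString`, and `STARTTIME` are module-level globals the class expects, and the sample strings here are made up:

```python
if __name__ == '__main__':
    STARTTIME = time.time()
    shortTestString = "The quick brown fox jumps over the lazy dog."
    testString = shortTestString * 50  # made-up long input
    unittest.main()
```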
def __init__(self, text):
    self.candidate_features = []
    self.feature_sentences = []
    t = Tokenizer()
    sents = t.sent_tokenize(text)
    p = POSTagger()
    for sent in sents:
        tagged_sent = p.nltk_tag(t.nltk_tokenize(sent))
        feature_sent = {}
        feature_sent['sentence'] = sent
        feature_sent['nouns'] = []
        feature_sent['noun_phrases'] = []
        for i in range(0, len(tagged_sent)):
            (word, tag) = tagged_sent[i]
            if tag.startswith('N') and tag != 'NNP':
                if i > 0 and len(feature_sent['nouns']) > 0 and tagged_sent[i - 1][0] == feature_sent['nouns'][-1]:
                    feature_sent['noun_phrases'].append(feature_sent['nouns'].pop() + ' ' + word)
                else:
                    feature_sent['nouns'].append(word)
        self.feature_sentences.append(feature_sent)
def evalTaggerLRT_w1_s1():
    feature_set = ['t_i-1,i+1', 't_i-1', 't_i+1', 'w_i']
    tagger = POSTagger(1)
    tagger.fixedEval('LR', feature_set, 1)
def evalTaggerR_w1_s1():
    feature_set = ['t_i+1', 'w_i']
    tagger = POSTagger(1)
    tagger.fixedEval('R', feature_set, 1)
def evalTaggerL_w3_s1():
    feature_set = ['t_i-1', 'w_i-1', 'w_i', 'w_i+1']
    tagger = POSTagger(1)
    tagger.fixedEval('L', feature_set, 1)
def evalTagger(N):
    feature_set = ['t_i-1', 'w_i']
    support_cutoff = 2
    tagger = POSTagger(simplify=True, tag_cutoff=1)
    tagger.randEvalL(N, feature_set, support_cutoff)
    return tagger
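The feature-set strings in these eval functions appear to encode a context window: `w_i` is the current word, `t_i-1` the previous tag, `t_i-1,i+1` the pair of surrounding tags, and so on. That reading is an inference from the names; `POSTagger.fixedEval`'s internals are not shown in this excerpt. A hedged sketch of what such feature extraction could look like for one position:

```python
def extract_window_features(words, tags, i, feature_set):
    """Build the feature dict for position i from a word/tag context window.

    Hypothetical helper matching the naming scheme above, not the project's
    actual implementation.
    """
    feats = {}
    if 'w_i' in feature_set:
        feats['w_i'] = words[i]
    if 'w_i-1' in feature_set and i > 0:
        feats['w_i-1'] = words[i - 1]
    if 'w_i+1' in feature_set and i < len(words) - 1:
        feats['w_i+1'] = words[i + 1]
    if 't_i-1' in feature_set and i > 0:
        feats['t_i-1'] = tags[i - 1]
    if 't_i+1' in feature_set and i < len(tags) - 1:
        feats['t_i+1'] = tags[i + 1]
    if 't_i-1,i+1' in feature_set and 0 < i < len(tags) - 1:
        feats['t_i-1,i+1'] = (tags[i - 1], tags[i + 1])
    return feats
```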
import jieba
import jieba.posseg as pseg
from math import log

# Lexer, POSTagger, the helpers (cut_result_to_list, list_index_of, in_string,
# isEnglish, mapping_filter, add_distinct_words) and the module-level settings
# (mode, map_word, stopwords, ...) are provided by the surrounding project.


def exec_segment(content):
    # Decode bytes if necessary (a str input raises TypeError and is kept as-is)
    try:
        content = str(content, 'utf-8')
    except TypeError:
        pass

    # First substitute the words that should be remapped
    for word in map_word:
        map_to = map_word[word]
        content = content.replace(word, map_to)

    # Replace newlines
    content = content.replace("\n", " ")
    content = content.strip()

    seg_list = []
    if mode == "exact":
        seg_list = jieba.cut(content, cut_all=False)
    elif mode == "all":
        seg_list = jieba.cut(content, cut_all=True)
    elif mode == "search":
        seg_list = jieba.cut_for_search(content)
    elif mode == "mix":
        temp_seg_list = jieba.cut_for_search(content)
        for s in temp_seg_list:
            seg_list.append(s)
        temp_seg_list = jieba.cut(content, cut_all=True)
        temp_seg_list = cut_result_to_list(temp_seg_list)
        for j, t in enumerate(temp_seg_list):
            t = temp_seg_list[len(temp_seg_list) - j - 1]
            if list_index_of(seg_list, t) == -1:
                # Token is missing: decide where to insert it
                found = False
                for i, s in enumerate(seg_list):
                    if in_string(t, s):
                        # Insert at position i (or after s when s is longer)
                        if len(s) > len(t):
                            i = i + 1
                        seg_list.insert(i, t)
                        found = True
                        break
                if not found:
                    seg_list.append(t)
        temp_seg_list = jieba.cut(content, cut_all=False)
        temp_seg_list = cut_result_to_list(temp_seg_list)
        for j, t in enumerate(temp_seg_list):
            t = temp_seg_list[len(temp_seg_list) - j - 1]
            if list_index_of(seg_list, t) == -1:
                # Token is missing: decide where to insert it
                found = False
                for i, s in enumerate(seg_list):
                    if in_string(t, s):
                        if len(s) > len(t):
                            i = i + 1
                        seg_list.insert(i + 1, t)
                        found = True
                        break
                if not found:
                    seg_list.append(t)
    else:
        seg_list = jieba.cut(content, cut_all=False)

    seg_list_filtered = []
    pos_tag_list = []
    seg_list_filtered_count = 0
    distinct_words = {}
    distinct_pos = {}
    for s in seg_list:
        if s.strip() == "":
            continue
        try:
            # Stopwords are silently dropped; a ValueError means s is kept
            stopword_index = stopwords.index(s)
        except ValueError:
            p = []
            if enable_pos_tag == "true":
                words = pseg.cut(s)
                s = []
                p = []
                for word, flag in words:
                    if isEnglish(word):
                        flag = "eng"
                    if flag != "eng":
                        if word in user_dict_pos:
                            flag = user_dict_pos[word]
                        flag = mapping_filter(map_pos, flag)
                        if list_index_of(stop_pos_tags, flag) > -1:
                            continue
                        if save_pos_tag_field == "false":
                            s.append(word + pos_tag_separator + flag)
                        else:
                            s.append(word)
                            p.append(flag)
                        seg_list_filtered_count = seg_list_filtered_count + 1
                        distinct_words = add_distinct_words(distinct_words, word)
                        distinct_pos = add_distinct_words(distinct_pos, flag)
                    else:
                        # English tokens are tagged with PyPOS instead of jieba
                        pypos_words = Lexer().lex(word)
                        pypos_tagged_words = POSTagger().tag(pypos_words)
                        for x in pypos_tagged_words:
                            word = x[0]
                            tag = "eng-" + x[1]
                            # Force the user dictionary's tag when present
                            if word in user_dict_pos:
                                tag = user_dict_pos[word]
                            tag = mapping_filter(map_pos, tag)
                            if list_index_of(stop_pos_tags, tag) > -1:
                                continue
                            if save_pos_tag_field == "false":
                                s.append(word + pos_tag_separator + tag)
                            else:
                                s.append(word)
                                p.append(tag)
                            seg_list_filtered_count = seg_list_filtered_count + 1
                            distinct_words = add_distinct_words(distinct_words, word)
                            distinct_pos = add_distinct_words(distinct_pos, tag)
                s = (separator + " ").join(s)
                p = (separator + " ").join(p)
            else:
                seg_list_filtered_count = seg_list_filtered_count + 1
                s = mapping_filter(map_word, s)
                distinct_words = add_distinct_words(distinct_words, s)
            if len(s) > 0:
                seg_list_filtered.append(s)
                pos_tag_list.append(p)

    if save_pos_tag_field == "false" and enable_pos_tag == "false" and export_text_feature == "false":
        result = (separator + " ").join(seg_list_filtered)
        return result
    else:
        result = []
        result.append((separator + " ").join(seg_list_filtered))
        if enable_pos_tag == "true" and save_pos_tag_field == "true":
            result.append((separator + " ").join(pos_tag_list))
        if export_text_feature == "true":
            result.append(str(len(list(distinct_pos.keys()))))
            # Token count after segmentation and filtering
            result.append(str(seg_list_filtered_count))
            # Number of distinct POS tags
            if enable_pos_tag == "true":
                result.append(str(len(distinct_pos.keys())))
            # Word entropy: H = -sum(p * ln p) over word frequencies
            entropy = 0
            for word in distinct_words:
                freq = distinct_words[word]
                prop = freq / (seg_list_filtered_count * 1.0)
                if prop > 0:
                    e = prop * log(prop)
                    entropy = entropy + e
            entropy = entropy * -1
            result.append(str(entropy))
            # POS-tag entropy
            if enable_pos_tag == "true":
                entropy = 0
                for pos in distinct_pos:
                    freq = distinct_pos[pos]
                    prop = freq / (seg_list_filtered_count * 1.0)
                    if prop > 0:
                        e = prop * log(prop)
                        entropy = entropy + e
                entropy = entropy * -1
                result.append(str(entropy))
        return result
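A hypothetical call site for `exec_segment`; the function reads its settings from module-level globals, so the values below are illustrative assumptions, not the project's actual configuration:

```python
# Illustrative module-level settings exec_segment depends on
mode = "exact"
map_word = {}            # word -> replacement
stopwords = []
separator = ","
enable_pos_tag = "false"
save_pos_tag_field = "false"
export_text_feature = "false"

print(exec_segment(u"我愛自然語言處理"))
# e.g. "我, 愛, 自然, 語言, 處理" (tokens depend on jieba's dictionary)
```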
from POSTagger import POSTagger

feature_set = ['w_i-1', 'w_i', 'w_i+1']
tagger = POSTagger('None', feature_set, 2, 1)
tagger.fixedEval()
from POSTagger import POSTagger

feature_set = ['t_i-1', 'w_i-1', 'w_i', 'w_i+1']
tagger = POSTagger('L', feature_set, 2, 1)
tagger.fixedEval()
def setup_environment(self):
    email = Email('object', 'object', 0)
    EMAILS_TRAINING_PATH = './Data/training/'
    EMAILS_TEST_UNTAGGED_PATH = './Data/seminars_testdata/test_untagged/'
    EMAILS_TEST_TAGGED_PATH = './Data/seminars_testdata/test_tagged/'
    POS_TAGGER_PATH = './Data/Models/pos_tagger_dt.pkl'

    emails_training = email.read_emails(EMAILS_TRAINING_PATH)
    emails_test_untagged = email.read_emails(EMAILS_TEST_UNTAGGED_PATH)
    emails_test_tagged = email.read_emails(EMAILS_TEST_TAGGED_PATH)

    pos_tagger = POSTagger()
    print("Application started.")

    # Ask the user how to obtain the POSTagger model
    self.print_model_menu()
    user_input = raw_input()
    while True:
        if user_input.isdigit() and 3 >= int(user_input) >= 1:
            if int(user_input) == 1:
                pos_tagger.train_pos_tagger(POS_TAGGER_PATH)
                break
            elif int(user_input) == 2:
                print("Loading POSTagger Model...")
                pos_tagger.load_pos_tagger(POS_TAGGER_PATH)
                print("POSTagger loaded.")
                break
            elif int(user_input) == 3:
                print("Shutting down application...")
                quit()
        else:
            print("Invalid input")
            self.print_model_menu()
            user_input = raw_input()

    # Load Spacy model
    print("Loading Spacy Model...")
    spacy_model = spacy.load('en_core_web_sm')
    print("Spacy Model loaded.")

    # -----------------------------------------------------------------
    # Make sure to download the Google News data set and save it to the
    # Models folder; without it, emails cannot be classified.
    print("Loading Google News Data-set...")
    # Throws an exception if the data set is not found
    k_v_model = KeyedVectors.load_word2vec_format(
        './Data/Models/GoogleNews-vectors-negative300.bin', binary=True)
    print("Google News Data-set loaded.")
    # -----------------------------------------------------------------

    taggers = Taggers()
    self.start_up(emails_training, emails_test_untagged, emails_test_tagged,
                  pos_tagger, spacy_model, k_v_model, taggers)
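For context, a hedged sketch of how the loaded `KeyedVectors` model can be queried downstream; the downstream pipeline is not shown in this excerpt, and the example words are made up (`similarity` and `most_similar` are standard gensim `KeyedVectors` methods):

```python
score = k_v_model.similarity('seminar', 'lecture')        # cosine similarity of two words
neighbours = k_v_model.most_similar('seminar', topn=5)    # 5 nearest words by cosine similarity
```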