Example #1
    def __init__(self, text, product_name):
        self.candidate_features = []
        self.feature_sentences = []
        self.product_name = product_name.lower().split('-')[0].split('_')
        t = Tokenizer()
        sents = t.sent_tokenize(text.lower())
        p = POSTagger()
        wnl = WordNetLemmatizer()
        for sent in sents:
            tagged_sent = p.nltk_tag(t.word_tokenize(sent))
            feature_sent = {}
            feature_sent['sentence'] = sent
            feature_sent['tags'] = tagged_sent
            feature_sent['nouns'] = []
            feature_sent['noun_phrases'] = []
            for i in range(0, len(tagged_sent)):
                (word, tag) = tagged_sent[i]
                #Don't include proper nouns
                if tag.startswith('N') and tag != 'NNP':
                    """
					Consecutive nouns might form a feature phrase. Eg. Picture quality is a phrase.
					Meaningless phrases like 'quality digital' are removed later as their frequeny of occurence is	low. """
                    if (i > 0 and len(feature_sent['nouns']) > 0
                            and tagged_sent[i - 1][0] == feature_sent['nouns'][-1]
                            and feature_sent['sentence'].find(feature_sent['nouns'][-1] + ' ' + word) > -1):
                        feature_sent['noun_phrases'].append(
                            wnl.lemmatize(feature_sent['nouns'].pop() + ' ' + word))
                    else:
                        feature_sent['nouns'].append(wnl.lemmatize(word))

            self.feature_sentences.append(feature_sent)
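A minimal standalone sketch of the same noun/noun-phrase chunking idea, using plain NLTK calls in place of the project-specific Tokenizer and POSTagger wrappers (the function name and the NLTK substitution are assumptions, not the original project's API; it needs the punkt, averaged_perceptron_tagger, and wordnet NLTK data packages):

import nltk
from nltk.stem import WordNetLemmatizer

def extract_nouns(sentence):
    # Tag the lowercased sentence and merge consecutive common nouns
    # ("picture quality") into candidate feature phrases, as above.
    wnl = WordNetLemmatizer()
    tagged = nltk.pos_tag(nltk.word_tokenize(sentence.lower()))
    nouns, noun_phrases = [], []
    for i, (word, tag) in enumerate(tagged):
        if tag.startswith('N') and tag != 'NNP':
            if i > 0 and nouns and tagged[i - 1][0] == nouns[-1]:
                noun_phrases.append(wnl.lemmatize(nouns.pop() + ' ' + word))
            else:
                nouns.append(wnl.lemmatize(word))
    return nouns, noun_phrases

# extract_nouns("The picture quality of this camera is great.")
# typically returns (['camera'], ['picture quality'])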
Example #2
    def init_feature_sentences(self, total_content):
        t = Tokenizer()
        p = POSTagger()
        wnl = WordNetLemmatizer()

        sentences = t.sent_tokenize(total_content.lower())

        for sentence in sentences:
            tagged_sentence = p.nltk_tag(t.word_tokenize(sentence))

            #Initializing Feature Sentence dictionary
            feature_sentence = {}
            feature_sentence['sentence'] = sentence
            feature_sentence['tags'] = tagged_sentence
            feature_sentence['nouns'] = []
            feature_sentence['noun_phrases'] = []

            #Finding the Nouns/Noun Phrases in the tagged sentence
            for i in range(0,len(tagged_sentence)):
                (word, tag) = tagged_sentence[i]

                #Chunking
                if tag.startswith('N') and tag != 'NNP':
                    if i > 0 and len(feature_sentence['nouns']) > 0 and tagged_sentence[i - 1][0] == feature_sentence['nouns'][-1] and feature_sentence['sentence'].find(feature_sentence['nouns'][-1] + ' ' + word) > -1:
                        feature_sentence['noun_phrases'].append(wnl.lemmatize(feature_sentence['nouns'].pop() + ' ' + word))
                    else:
                        feature_sentence['nouns'].append(wnl.lemmatize(word))

            self.feature_sentences.append(feature_sentence)
Example #3
def train_tagger():
	#Train on the Brown corpus
	tagger = POSTagger(simplify=True)
	tagger.train(support_cutoff=2)
	nums = tagger.word_features.unique_values()

	return tagger
Example #4
	def __init__(self, text, product_name):
		self.candidate_features = []
		self.feature_sentences = []
		self.product_name = product_name.lower().split('-')[0].split('_')
		t = Tokenizer()
		sents = t.sent_tokenize(text.lower())
		p = POSTagger()
		wnl = WordNetLemmatizer()
		for sent in sents:
			tagged_sent = p.nltk_tag(t.word_tokenize(sent))
			feature_sent = {}
			feature_sent['sentence'] = sent
			feature_sent['tags'] = tagged_sent
			feature_sent['nouns'] = []
			feature_sent['noun_phrases'] = []
			for i in range(0, len(tagged_sent)):
				(word, tag) = tagged_sent[i]
				#Don't include proper nouns
				if tag.startswith('N') and tag != 'NNP':
					"""
					Consecutive nouns might form a feature phrase. Eg. Picture quality is a phrase.
					Meaningless phrases like 'quality digital' are removed later as their frequeny of occurence is	low. """
					if i > 0 and len(feature_sent['nouns']) > 0 and tagged_sent[i - 1][0] == feature_sent['nouns'][-1] and feature_sent['sentence'].find(feature_sent['nouns'][-1] + ' ' + word) > -1:
						feature_sent['noun_phrases'].append(wnl.lemmatize(feature_sent['nouns'].pop() + ' ' + word))
					else:
						feature_sent['nouns'].append(wnl.lemmatize(word))
					
			self.feature_sentences.append(feature_sent)
Example #5
class TestPyPOS(unittest.TestCase):
	def setUp(self):
		'''Tests Setup'''
		self.lexer = Lexer()
		self.tagger = POSTagger()
		self.start = time.time()
	def stringTest(self,string):
		'''Common Testing Function'''
		self.words = self.lexer.lex(string)
		self.tags = self.tagger.tag(self.words)
		self.end = time.time()
		self.difference = self.end - self.start
		for tag in self.tags:
			print " / ".join(tag)
	def test_1_Short(self):
		'''Test Short String'''
		global shortTestString
		self.stringTest(shortTestString)
	def test_2_Long(self):
		'''Test Long String'''
		global testString
		self.stringTest(testString)
	def tearDown(self):
		print "Tokenized and tagged %s words in %s seconds" % (len(self.words),self.difference)
		print "Running time at test end was: %s seconds" % (time.time() - STARTTIME)
Example #6
class TestPyPOS(unittest.TestCase):
    def setUp(self):
        '''Tests Setup'''
        self.lexer = Lexer()
        self.tagger = POSTagger()
        self.start = time.time()

    def stringTest(self, string):
        '''Common Testing Function'''
        self.words = self.lexer.lex(string)
        self.tags = self.tagger.tag(self.words)
        self.end = time.time()
        self.difference = self.end - self.start
        for tag in self.tags:
            print " / ".join(tag)

    def test_1_Short(self):
        '''Test Short String'''
        global shortTestString
        self.stringTest(shortTestString)

    def test_2_Long(self):
        '''Test Long String'''
        global testString
        self.stringTest(testString)

    def tearDown(self):
        print "Tokenized and tagged %s words in %s seconds" % (len(
            self.words), self.difference)
        print "Running time at test end was: %s seconds" % (time.time() -
                                                            STARTTIME)
Example #7
	def __init__(self, text):
		self.candidate_features = []
		self.feature_sentences = []
		t = Tokenizer()
		sents = t.sent_tokenize(text)
		p = POSTagger()
		for sent in sents:
			tagged_sent = p.nltk_tag(t.nltk_tokenize(sent))
			feature_sent = {}
			feature_sent['sentence'] = sent
			feature_sent['nouns'] = []
			feature_sent['noun_phrases'] = []
			for i in range(0, len(tagged_sent)):
				(word, tag) = tagged_sent[i]
				if tag.startswith('N') and tag != 'NNP':
					if i > 0 and len(feature_sent['nouns']) > 0 and tagged_sent[i - 1][0] == feature_sent['nouns'][-1]:
						feature_sent['noun_phrases'].append(feature_sent['nouns'].pop() + ' ' + word)
					else:
						feature_sent['nouns'].append(word)
					
			self.feature_sentences.append(feature_sent)
Example #8
def evalTaggerLRT_w1_s1():
    feature_set = ['t_i-1,i+1','t_i-1', 't_i+1', 'w_i']
    tagger = POSTagger(1)
    tagger.fixedEval('LR',feature_set, 1)
Example #9
def evalTaggerR_w1_s1():
    feature_set = ['t_i+1', 'w_i']
    tagger = POSTagger(1)
    tagger.fixedEval('R',feature_set, 1)
Example #10
def evalTaggerL_w3_s1():
    feature_set = ['t_i-1', 'w_i-1', 'w_i', 'w_i+1']
    tagger = POSTagger(1)
    tagger.fixedEval('L',feature_set, 1)
Example #11
def evalTagger(N):
    feature_set = ['t_i-1', 'w_i']
    support_cutoff = 2
    tagger = POSTagger(simplify=True, tag_cutoff=1)
    tagger.randEvalL(N, feature_set, support_cutoff)
    return tagger
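The feature-set strings passed to POSTagger in examples #8 through #11 ('t_i-1', 'w_i', 'w_i+1', ...) follow the usual convention of naming the previous tag, the current word, the next word, and so on. How the project's POSTagger consumes them is not shown here; the sketch below only illustrates what such contextual features conventionally look like for one token position (the function name and dict layout are hypothetical):

def context_features(words, tags, i, feature_set):
    # Build a {feature name: value} dict for token position i, using the
    # t_*/w_* naming convention seen in the feature_set lists above.
    lookup = {
        'w_i':   words[i],
        'w_i-1': words[i - 1] if i > 0 else '<s>',
        'w_i+1': words[i + 1] if i + 1 < len(words) else '</s>',
        't_i-1': tags[i - 1] if i > 0 else '<s>',
        't_i+1': tags[i + 1] if i + 1 < len(tags) else '</s>',
    }
    # 't_i-1,i+1' names the pair of surrounding tags as one joint feature.
    lookup['t_i-1,i+1'] = lookup['t_i-1'] + '|' + lookup['t_i+1']
    return {name: lookup[name] for name in feature_set}

# context_features(['the', 'picture', 'quality'], ['DT', 'NN', 'NN'], 1,
#                  ['t_i-1', 'w_i-1', 'w_i', 'w_i+1'])
# -> {'t_i-1': 'DT', 'w_i-1': 'the', 'w_i': 'picture', 'w_i+1': 'quality'}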
Example #12
    def setUp(self):
        '''Tests Setup'''
        self.lexer = Lexer()
        self.tagger = POSTagger()
        self.start = time.time()
Example #13
def exec_segment(content):
    try:
        content = str(content, 'utf-8')
    except TypeError:
        # print(TypeError)
        pass

    # First, replace the words that need to be substituted
    for word in map_word:
        map_to = map_word[word]
        content = content.replace(word, map_to)

    # Replace newlines with spaces
    content = content.replace("\n", " ")
    content = content.strip()

    seg_list = []
    if mode == "exact":
        seg_list = jieba.cut(content, cut_all=False)
    elif mode == "all":
        seg_list = jieba.cut(content, cut_all=True)
    elif mode == "search":
        seg_list = jieba.cut_for_search(content)
    elif mode == "mix":
        temp_seg_list = jieba.cut_for_search(content)
        for s in temp_seg_list:
            seg_list.append(s)

        temp_seg_list = jieba.cut(content, cut_all=True)
        temp_seg_list = cut_result_to_list(temp_seg_list)
        for j, t in enumerate(temp_seg_list):
            t = temp_seg_list[(len(temp_seg_list) - j - 1)]
            if list_index_of(seg_list, t) == -1:
                # If this word is not found, decide where to insert it
                found = False
                for i, s in enumerate(seg_list):
                    if in_string(t, s):
                        # position is i
                        if len(s) > len(t):
                            i = i + 1
                        seg_list.insert(i, t)
                        found = True
                        break
                if found == False:
                    seg_list.append(t)

        temp_seg_list = jieba.cut(content, cut_all=False)
        temp_seg_list = cut_result_to_list(temp_seg_list)
        for j, t in enumerate(temp_seg_list):
            t = temp_seg_list[(len(temp_seg_list) - j - 1)]
            if list_index_of(seg_list, t) == -1:
                # If this word is not found, decide where to insert it
                found = False
                for i, s in enumerate(seg_list):
                    if in_string(t, s):
                        # position is i
                        if len(s) > len(t):
                            i = i + 1
                        seg_list.insert((i + 1), t)
                        found = True
                        break
                if found == False:
                    seg_list.append(t)

    else:
        seg_list = jieba.cut(content, cut_all=False)

    seg_list_filtered = []
    pos_tag_list = []
    seg_list_filtered_count = 0
    distinct_words = {}
    distinct_pos = {}

    for s in seg_list:
        if s.strip() == "":
            continue

        try:
            stopword_index = stopwords.index(s)
        except ValueError:
            p = []
            if enable_pos_tag == "true":
                words = pseg.cut(s)
                s = []
                p = []
                for word, flag in words:
                    if isEnglish(word):
                        flag = "eng"

                    if flag != "eng":
                        if word in user_dict_pos:
                            flag = user_dict_pos[word]

                        flag = mapping_filter(map_pos, flag)
                        if list_index_of(stop_pos_tags, flag) > -1:
                            continue

                        if save_pos_tag_field == "false":
                            s.append(word + pos_tag_separator + flag)
                        else:
                            s.append(word)
                            p.append(flag)
                        seg_list_filtered_count = seg_list_filtered_count + 1
                        distinct_words = add_distinct_words(
                            distinct_words, word)
                        distinct_pos = add_distinct_words(distinct_pos, flag)
                    else:
                        # print(word)
                        pypos_words = Lexer().lex(word)
                        pypos_tagged_words = POSTagger().tag(pypos_words)
                        for x in pypos_tagged_words:
                            word = x[0]
                            #word = mapping_filter(map_word, word)
                            tag = "eng-" + x[1]

                            # Force the mapping to follow the user dictionary
                            if word in user_dict_pos:
                                tag = user_dict_pos[word]

                            # print(word)
                            # print(tag)
                            tag = mapping_filter(map_pos, tag)
                            if list_index_of(stop_pos_tags, tag) > -1:
                                # print(stop_pos_tags)
                                #print(list_index_of(stop_pos_tags, tag))
                                #print("stop pos: " + tag)
                                continue

                            if save_pos_tag_field == "false":
                                s.append(word + pos_tag_separator + tag)
                            else:
                                s.append(word)
                                p.append(tag)
                            seg_list_filtered_count = seg_list_filtered_count + 1
                            distinct_words = add_distinct_words(
                                distinct_words, word)
                            distinct_pos = add_distinct_words(
                                distinct_pos, tag)
                    #print('%s %s' % (word, flag))
                s = (separator + " ").join(s)
                p = (separator + " ").join(p)
            else:
                seg_list_filtered_count = seg_list_filtered_count + 1
                s = mapping_filter(map_word, s)
                distinct_words = add_distinct_words(distinct_words, s)

            if len(s) > 0:
                seg_list_filtered.append(s)
                pos_tag_list.append(p)
    # print(pos_tag_list)
    if save_pos_tag_field == "false" and enable_pos_tag == "false" and export_text_feature == "false":
        result = (separator + " ").join(seg_list_filtered)
        return result
    else:
        result = []

        result.append((separator + " ").join(seg_list_filtered))

        if enable_pos_tag == "true" and save_pos_tag_field == "true":
            result.append((separator + " ").join(pos_tag_list))

        if export_text_feature == "true":
            result.append(str(len(list(distinct_pos.keys()))))
            # print(seg_list_filtered)
            # print(str(seg_list_filtered_count))

            # The result after word segmentation
            result.append(str(seg_list_filtered_count))

            # The number of distinct POS tags
            if enable_pos_tag == "true":
                result.append(str(len(distinct_pos.keys())))
            # result.append("2")

            # Entropy of the word distribution
            entropy = 0
            for word in distinct_words:
                freq = distinct_words[word]
                prop = freq / (seg_list_filtered_count * 1.0)
                if prop > 0:
                    e = prop * log(prop)
                    entropy = entropy + e
            entropy = entropy * -1
            result.append(str(entropy))

            # Entropy of the POS tag distribution
            if enable_pos_tag == "true":
                entropy = 0
                for pos in distinct_pos:
                    freq = distinct_pos[pos]
                    prop = freq / (seg_list_filtered_count * 1.0)
                    if prop > 0:
                        e = prop * log(prop)
                        entropy = entropy + e
                entropy = entropy * -1
                result.append(str(entropy))
        return result
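The two entropy blocks near the end of exec_segment compute the Shannon entropy of the word and POS-tag distributions, i.e. -sum(p * log p) over each item's relative frequency. A self-contained sketch of that computation (the function name is illustrative, not part of the original script):

from math import log

def distribution_entropy(freqs, total):
    # Shannon entropy (natural log) of a {item: count} frequency table,
    # matching the loops over distinct_words / distinct_pos above.
    entropy = 0.0
    for count in freqs.values():
        p = count / float(total)
        if p > 0:
            entropy += p * log(p)
    return -entropy

# distribution_entropy({'camera': 2, 'picture': 1, 'quality': 1}, 4)
# -> about 1.04 nats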
Example #14
	def setUp(self):
		'''Tests Setup'''
		self.lexer = Lexer()
		self.tagger = POSTagger()
		self.start = time.time()
Example #15
from POSTagger import POSTagger
feature_set = ['w_i-1', 'w_i', 'w_i+1']
tagger = POSTagger('None', feature_set, 2, 1)
tagger.fixedEval()
Example #16
from POSTagger import POSTagger
feature_set = ['t_i-1', 'w_i-1', 'w_i', 'w_i+1']
tagger = POSTagger('L', feature_set, 2, 1)
tagger.fixedEval()
Example #17
    def setup_environment(self):

        email = Email('object', 'object', 0)

        EMAILS_TRAINING_PATH = './Data/training/'
        EMAILS_TEST_UNTAGGED_PATH = './Data/seminars_testdata/test_untagged/'
        EMAILS_TEST_TAGGED_PATH = './Data/seminars_testdata/test_tagged/'
        POS_TAGGER_PATH = './Data/Models/pos_tagger_dt.pkl'

        emails_training = email.read_emails(EMAILS_TRAINING_PATH)
        emails_test_untagged = email.read_emails(EMAILS_TEST_UNTAGGED_PATH)
        emails_test_tagged = email.read_emails(EMAILS_TEST_TAGGED_PATH)

        pos_tagger = POSTagger()

        print("Application started.")

        # Ask user about POSTagger
        self.print_model_menu()

        user_input = raw_input()

        while True:
            if user_input.isdigit():
                if 3 >= int(user_input) >= 1:
                    if int(user_input) == 1:
                        pos_tagger.train_pos_tagger(POS_TAGGER_PATH)
                        break
                    elif int(user_input) == 2:
                        print("Loading POSTagger Model...")
                        pos_tagger.load_pos_tagger(POS_TAGGER_PATH)
                        print("POSTagger loaded.")
                        break
                    elif int(user_input) == 3:
                        print("Shutting down application...")
                        quit()
                else:
                    print("Invalid input")
                    self.print_model_menu()
                    user_input = raw_input()
            else:
                print("Invalid input")
                self.print_model_menu()
                user_input = raw_input()

        # Load Spacy model
        print("Loading Spacy Model...")
        spacy_model = spacy.load('en_core_web_sm')
        print("Spacy Model loaded.")

        # -----------------------------------------------------------------

        # Make sure to download the Google News data set and save it to the Models folder; otherwise emails cannot be classified
        print("Loading Google News Data-set...")
        # Will throw an exception if no Data Set found (of course)
        k_v_model = KeyedVectors.load_word2vec_format(
            './Data//Models/GoogleNews-vectors-negative300.bin', binary=True)
        print("Google News Data-set loaded.")

        # -----------------------------------------------------------------

        taggers = Taggers()

        self.start_up(emails_training, emails_test_untagged,
                      emails_test_tagged, pos_tagger, spacy_model, k_v_model,
                      taggers)