Example #1
import requests
from bs4 import BeautifulSoup
from collections import Counter
from konlpy.tag import Twitter  # assuming konlpy's Twitter (now Okt) tagger


def naver_card_info():
    url = "https://card.search.naver.com/card.naver?singleCardId=20"
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")
    div_tag = soup.find("div",
                        {"class": "_detail_1 sum_one sum_one_v1 _tab_detail"})
    tr_tags = div_tag.find_all("tr")
    del tr_tags[0]  # skip the first row
    # Collect the row's text nodes, skipping bare newline strings.
    lists = []
    for temp in tr_tags[1].strings:
        if temp != '\n':
            lists.append(temp)

    twitter = Twitter()
    text = ' '.join(lists)
    nouns = twitter.nouns(text)
    pos = twitter.pos(text)
    morph = twitter.morphs(text)
    phrases = twitter.phrases(text)

    count = Counter(nouns)
    print(lists)
    print(pos)
    print(morph)
    print(phrases)
    print(nouns)
    print(count)
Example #2
# KoNLPyTwitter, CustomizedDictionary, SimpleTemplateTagger, SimpleSelector,
# loadtxt, load_dictionary, installpath, and tagset are assumed to be
# provided by the surrounding customized-konlpy (ckonlpy) package.
class Twitter:
    def __init__(self, use_twitter_dictionary=True):
        self._base = KoNLPyTwitter()
        self._dictionary = CustomizedDictionary()
        self._loaded_twitter_default_dictionary = use_twitter_dictionary
        if use_twitter_dictionary:
            self._load_default_dictionary()
        self._customized_tagger = self._load_customized_tagger()
        self.tagset = tagset

    def _load_customized_tagger(self):
        templatespath = '%s/data/templates/twitter_templates0' % installpath
        templates = loadtxt(templatespath)
        templates = [tuple(template.split()) for template in templates]
        return SimpleTemplateTagger(templates, self._dictionary,
                                    SimpleSelector())

    def _load_default_dictionary(self):
        directory = '%s/data/twitter/' % installpath
        self._dictionary.add_dictionary(
            load_dictionary('%s/noun' % directory, ignore_a_syllable=True),
            'Noun')

    def pos(self, phrase):
        eojeols = phrase.split()
        tagged = []
        for eojeol in eojeols:
            tagged0 = self._customized_tagger.pos(eojeol)
            if tagged0:
                tagged += tagged0
                continue
            tagged += self._base.pos(eojeol)
        return tagged

    def nouns(self, phrase):
        tagged = self.pos(phrase)
        return [w for w, t in tagged if t == 'Noun']

    def morphs(self, phrase, norm=False, stem=False):
        # norm/stem are accepted for API compatibility but not used here.
        return [s for s, t in self.pos(phrase)]

    def phrases(self, phrase):
        # TODO
        return self._base.phrases(phrase)

    def add_dictionary(self, words, tag, force=False):
        if (not force) and (tag not in self.tagset):
            raise ValueError('%s is not an available tag' % tag)
        self._dictionary.add_dictionary(words, tag)

    def load_dictionary(self, fname_list, tag):
        if tag not in self.tagset:
            raise ValueError('%s is not an available tag' % tag)
        self._dictionary.load_dictionary(fname_list, tag)

    def set_selector(self, my_weight_dict, my_score_function):
        self._customized_tagger.set_selector(my_weight_dict, my_score_function)
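
A minimal usage sketch for the customized tagger above (assumptions: the class is importable in the current scope, and add_dictionary accepts a list of words, matching how _dictionary.add_dictionary is fed; the sample word and the presence of 'Noun' in tagset are illustrative):

twitter = Twitter()
# Register a custom noun ('트위터' is an illustrative entry) so the template
# tagger can match it before falling back to the base KoNLPy tagger.
twitter.add_dictionary(['트위터'], 'Noun')
print(twitter.pos('트위터의 형태소분석기'))
print(twitter.nouns('트위터의 형태소분석기'))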
Example #3
import codecs
import io

from konlpy.tag import Twitter  # assuming konlpy's Twitter (now Okt) tagger


def extract_phrases_txt():
    twitter = Twitter()
    with io.open('data_cleansed.txt', 'r', encoding='utf-8') as read_file, \
            codecs.open('phrases.txt', 'w', encoding='utf-8') as write_file:
        for line in read_file:
            # One output line per input line; phrases are space-separated.
            item = ' '.join(twitter.phrases(line))
            write_file.write(item + '\n')
            # print(item)
Example #4
class Twitter:
    def __init__(self, load_default_dictionary=False):
        self._base = KoNLPyTwitter()
        self._dictionary = CustomizedDictionary()
        if load_default_dictionary:
            self._load_default_dictionary()
        self._customized_tagger = self._load_customized_tagger()
        self.tagset = tagset

    def _load_customized_tagger(self):
        templatespath = '%s/data/templates/twitter_templates0' % installpath
        templates = loadtxt(templatespath)
        templates = [tuple(template.split()) for template in templates]
        selector = TwitterSelector()
        return SimpleTemplateTagger(templates, self._dictionary, selector)

    def _load_default_dictionary(self):
        josapath = '%s/data/twitter/josa.txt' % installpath
        modifierpath = '%s/data/twitter/modifier.txt' % installpath
        self._dictionary.add_dictionary(loadtxt(josapath), 'Josa')
        self._dictionary.add_dictionary(loadtxt(modifierpath), 'Modifier')

    def pos(self, phrase):
        eojeols = phrase.split()
        tagged = []
        for eojeol in eojeols:
            tagged0 = self._customized_tagger.pos(eojeol)
            if tagged0:
                tagged += tagged0
                continue
            tagged += self._base.pos(eojeol)
        return tagged

    def nouns(self, phrase):
        tagged = self.pos(phrase)
        return [w for w, t in tagged if t[0] == 'N']

    def morphs(self, phrase, norm=False, stem=False):
        return [s for s, t in self.pos(phrase)]

    def phrases(self, phrase):
        # TODO
        return self._base.phrases(phrase)

    def add_dictionary(self, words, tag, force=False):
        if (not force) and (tag not in self.tagset):
            raise ValueError('%s is not an available tag' % tag)
        self._dictionary.add_dictionary(words, tag)

    def load_dictionary(self, fname_list, tag):
        if tag not in self.tagset:
            raise ValueError('%s is not an available tag' % tag)
        self._dictionary.load_dictionary(fname_list, tag)
Example #5
import csv
import io

from konlpy.tag import Twitter  # assuming konlpy's Twitter (now Okt) tagger


def extract_phrases_csv():
    print("\nNow extract phrases ... ")
    words = []
    tw = Twitter()
    with io.open('data_cleansed.txt', 'r', encoding='utf-8') as data:
        for item in data:
            words.extend(tw.phrases(item))
    print("\nNow save as csv file ... ")
    csvfile = 'phrases.csv'
    with open(csvfile, 'w', encoding='utf-8') as output:
        writer = csv.writer(output, lineterminator='\n')
        for val in words:
            writer.writerow([val])
Example #6
	def parse_konlpy(self, text):
		from konlpy.tag import Kkma

		kkma = Kkma()

		# --

		from konlpy.tag import Twitter

		twitter = Twitter()

		# --

		sentence_list = kkma.sentences(text)

		# --

		parsing = []

		for sentence in sentence_list:
			parsed_sentence = {}

			# --

			parsed_sentence['text'] = sentence

			# --

			parsed_sentence['morp'] = kkma.pos(sentence)

			# --

			parsed_sentence['phrase'] = twitter.phrases(sentence)

			# --

			parsing.append(parsed_sentence)

		# --

		return parsing
Example #7

t = Twitter()
text = t.pos("아버지가방에들어가신다", norm=True, stem=True)
print(text)
print(t.pos("아버지가방에들어가신다", norm=False, stem=False))
# norm : normalizes colloquial spellings, e.g. "그래욕 ㅋㅋㅋ" -> 그래요
# stem : recovers the dictionary base form, e.g. "그렇다"

>>> from konlpy.tag import Twitter
>>> twitter = Twitter()
>>> print(twitter.morphs(u'단독입찰보다 복수입찰의 경우'))
# ['단독', '입찰', '보다', '복수', '입찰', '의', '경우', '가']
>>> print(twitter.nouns(u'유일하게 항공기 체계 종합개발 경험을 갖고 있는 KAI는'))
# ['유일하', '항공기', '체계', '종합', '개발', '경험']
>>> print(twitter.phrases(u'날카로운 분석과 신뢰감 있는 진행으로'))
# ['분석', '분석과 신뢰감', '신뢰감', '분석과 신뢰감 있는 진행', '신뢰감 있는 진행', '진행', '신뢰']
>>> print(twitter.pos(u'이것도 되나욬ㅋㅋ'))
# [('이', 'Determiner'), ('것', 'Noun'), ('도', 'Josa'), ('되나욬', 'Noun'), ('ㅋㅋ', 'KoreanParticle')]
>>> print(twitter.pos(u'이것도 되나욬ㅋㅋ', norm=True))
# [('이', 'Determiner'), ('것', 'Noun'), ('도', 'Josa'), ('되', 'Verb'), ('나요', 'Eomi'), ('ㅋㅋ', 'KoreanParticle')]
>>> print(twitter.pos(u'이것도 되나욬ㅋㅋ', norm=True, stem=True))
# [('이', 'Determiner'), ('것', 'Noun'), ('도', 'Josa'), ('되다', 'Verb'), ('ㅋㅋ', 'KoreanParticle')]



## Kkma analyzer (http://kkma.snu.ac.kr/)
''' heuristics, hidden Markov model, edit distance '''

from konlpy.tag import Kkma
k = Kkma()
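
# A minimal usage sketch (sentences/pos/nouns below are standard konlpy Kkma
# calls; the sample sentence is reused from the Twitter example above):
print(k.sentences(u'아버지가방에들어가신다'))
print(k.pos(u'아버지가방에들어가신다'))
print(k.nouns(u'아버지가방에들어가신다'))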
Example #8
from collections import Counter

from konlpy.tag import Twitter


def start(ays_type, sentences, wf, logger):
    analysis_type = ays_type  ## noun: noun analysis, phrase: phrase analysis, pos: POS tagging

    twitter = Twitter()  ## Korean morphological analyzer originally built at Twitter

    ## analysis 1: noun analysis
    if analysis_type == 'noun':
        cnt = 1
        nounsDic = {}

        for sentence in sentences:
            nouns = twitter.nouns(sentence)  ## extract nouns only
            for noun in nouns:
                if noun in nounsDic:  ## seen before: increment
                    now_cnt = nounsDic[noun]
                    nounsDic[noun] = now_cnt + 1
                else:  ## first occurrence: add
                    nounsDic[noun] = 1
            print('sentence(' + str(cnt) + ') analyzing......')
            cnt = cnt + 1

        nounsCounter = Counter(
            nounsDic).most_common()  # sorted descending by count: [(key, value), ...]
        #print(nounsCounter)

        for key, value in nounsCounter:
            #print(key + ' // ' + str(value))
            if value >= 10:
                wf.write(key + '\t' + str(value) + '\n')

    #    for key in nounsDic.keys():
    #        #print(key + ' // ' + str(nounsDic[key]))
    #        wf.write(key + '\t' + str(nounsDic[key]) + '\n')

        print('sentences analysis is finished....')

    ## analysis 2: phrase analysis
    if analysis_type == 'phrase':
        phrasesDic = {}
        cnt = 1

        for sentence in sentences:
            phrases = twitter.phrases(sentence)  ## extract phrases (semantic units)
            for phrase in phrases:
                if phrase in phrasesDic:  ## seen before: increment
                    now_cnt = phrasesDic[phrase]
                    phrasesDic[phrase] = now_cnt + 1
                else:  ## first occurrence: add
                    phrasesDic[phrase] = 1
            print('sentence(' + str(cnt) + ') analyzing......')
            cnt = cnt + 1

        phrasesCounter = Counter(
            phrasesDic).most_common()  # sorted descending by count: [(key, value), ...]
        #print(phrasesCounter)

        for key, value in phrasesCounter:
            #print(key + ' // ' + str(value))
            if value >= 10:
                wf.write(key + '\t' + str(value) + '\n')

    #    for key in phrasesDic.keys():
    #        #print(key + ' // ' + str(phrasesDic[key]))
    #        wf.write(key + '\t' + str(phrasesDic[key]) + '\n')

        print('sentences analysis is finished....')

    ## analysis 3: POS tagging analysis
    if analysis_type == 'pos':
        posesDic = {}
        cnt = 1

        for sentence in sentences:
            poses = twitter.pos(sentence, norm=True, stem=True)  ## POS tagging with normalization and stemming
            for pos in poses:
                if pos in posesDic:  ## seen before: increment
                    now_cnt = posesDic[pos]
                    posesDic[pos] = now_cnt + 1
                else:  ## first occurrence: add
                    posesDic[pos] = 1
            print('sentence(' + str(cnt) + ') analyzing......')
            cnt = cnt + 1

        posesCounter = Counter(
            posesDic).most_common()  # sorted descending by count: [(key, value), ...]
        #print(posesCounter)

        for key, value in posesCounter:
            noun, tagging = key
            #print(noun + ' // ' + tagging + '//' + str(posesDic[key]))
            if value >= 10:
                wf.write(noun + '\t' + tagging + '\t' + str(value) + '\n')

    #    for key in posesDic.keys():
    #        noun, tagging = key
    #        #print(noun + ' // ' + tagging + '//' + str(posesDic[key]))
    #        wf.write(noun + '\t' + tagging + '\t' + str(posesDic[key]) + '\n')

        print('sentences analysis is finished....')
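
A hedged usage sketch for start() above (the sentences and output path are illustrative; logger is accepted but never used in the snippet, so None is passed). Note the function only writes items seen at least 10 times:

import codecs

sample_sentences = [u'아버지가방에들어가신다', u'단독입찰보다 복수입찰의 경우']
with codecs.open('noun_result.txt', 'w', encoding='utf-8') as wf:
    start('noun', sample_sentences, wf, None)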
Example #9
class Twitter:
    def __init__(self, load_default_dictionary=True):
        self._base = KoNLPyTwitter()
        self._dictionary = CustomizedDictionary()
        if load_default_dictionary:
            self._load_default_dictionary()
        self._customized_tagger = self._load_customized_tagger()
        self.tagset = tagset
        self.add_custom_dictionary('?', 'Punctuation')
    
    def _load_customized_tagger(self):        
        templatespath = '%s/data/templates/twitter_templates0' % installpath
        templates = loadtxt(templatespath)
        templates = [tuple(template.split()) for template in templates]        
        return SimpleTemplateTagger(templates, self._dictionary, SimpleSelector())
    
    def _load_default_dictionary(self):
        directory = '%s/data/twitter/' % installpath
        self._dictionary.add_dictionary(load_dictionary('%s/josa' % directory), 'Josa')
        self._dictionary.add_dictionary(load_dictionary('%s/noun' % directory, ignore_a_syllable=True), 'Noun')
        self._dictionary.add_dictionary(load_dictionary('%s/adverb' % directory), 'Adverb')
        #self._dictionary.add_dictionary(load_dictionary(modifier_dir), 'Modifier')
        
    def pos(self, phrase):
        eojeols = phrase.split()
        print(eojeols)
        tagged = []
        for idx, eojeol in enumerate(eojeols):
            # Try the customized tagger on a single eojeol first.
            tagged0 = self._customized_tagger.pos(eojeol)
            if tagged0:
                tagged += tagged0
                continue
            # Then try a two-eojeol window so multi-word dictionary
            # entries can still match.
            if idx < len(eojeols) - 1:
                tagged0 = self._customized_tagger.pos(eojeol + ' ' + eojeols[idx + 1])
                if tagged0:
                    tagged += tagged0
                    continue
            # Otherwise fall back to the base KoNLPy tagger.
            tagged += self._base.pos(eojeol)
        return tagged
    
    def nouns(self, phrase):
        tagged = self.pos(phrase)
        return [w for w, t in tagged if t[0] == 'N']
    
    def morphs(self, phrase, norm=False, stem=False):
        return [s for s, t in self.pos(phrase)]
    
    def phrases(self, phrase):
        # TODO
        return self._base.phrases(phrase)
    
    def add_dictionary(self, words, tag, force=False):
        if (not force) and (tag not in self.tagset):
            raise ValueError('%s is not an available tag' % tag)
        self._dictionary.add_dictionary(words, tag)

    def add_custom_dictionary(self, words, tag, force=True):
        if (not force) and (tag not in self.tagset):
            raise ValueError('%s is not an available tag' % tag)
        self._dictionary.add_custom_dictionary(words, tag)

    def load_dictionary(self, fname_list, tag):
        if tag not in self.tagset:
            raise ValueError('%s is not an available tag' % tag)
        self._dictionary.load_dictionary(fname_list, tag)
    
    def set_selector(self, my_weight_dict, my_score_function):
        self._customized_tagger.set_selector(my_weight_dict, my_score_function)
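
A small sketch of what this variant adds over Example #2 (assumptions: the class above is in scope and the underlying add_custom_dictionary accepts a list of words; the tag name is illustrative). Unlike add_dictionary, add_custom_dictionary defaults to force=True, so tags outside tagset are accepted:

twitter = Twitter()
# 'MyTag' is a made-up tag that need not be in tagset; force=True (the
# default for add_custom_dictionary here) skips the tagset check.
twitter.add_custom_dictionary(['위키백과'], 'MyTag')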
Example #10
    nounsCounter = Counter(
        nounsDic).most_common()  # sorted descending by count: [(key, value), ...]
    for key, value in nounsCounter:
        #print(key + ' // ' + str(value))
        if value >= 10:
            wf.write(key + '\t' + str(value) + '\n')

    print('sentences analysis is finished....')
    wf.close()

## analysis 2: phrase analysis
if analysis_type == 'phrase':
    phrasesDic = {}
    cnt = 1

    for sentence in sentences:
        phrases = twitter.phrases(sentence)  ## extract phrases (semantic units)
        for phrase in phrases:
            if phrase in phrasesDic:  ## seen before: increment
                now_cnt = phrasesDic[phrase]
                phrasesDic[phrase] = now_cnt + 1
            else:  ## first occurrence: add
                phrasesDic[phrase] = 1
        print('sentence(' + str(cnt) + ') analyzing......')
        cnt = cnt + 1

    wf = codecs.open(
        'D:\\smba2_crawler\\result\\happytogether_talk_phrases_result.txt',
        encoding='utf-8',
        mode='w')

    phrasesCounter = Counter(
        phrasesDic).most_common()  # sorted descending by count: [(key, value), ...]
Example #11
from konlpy import jvm
from konlpy.tag import Twitter
from konlpy.tag import Kkma
from konlpy.tag import Hannanum
from konlpy.tag import Komoran

jvm.init_jvm()

########    use Twitter    ########
twitter = Twitter()

input_text = "한국어를 처리하는 예시입니다"
phrase_results = twitter.phrases(input_text)
tokenizations = twitter.pos(input_text)

aftertokenization = twitter.pos(input_text, norm=True)

print('')
print('########    Twitter    ########')
print('')
print('phrase extraction : ')
print(phrase_results)
print('')
print('tokenization : ')
print(tokenizations)
print('')
print('tokenization after normalization : ')
print(aftertokenization)
print('')