import requests
from bs4 import BeautifulSoup
from collections import Counter


def naver_card_info():
    url = "https://card.search.naver.com/card.naver?singleCardId=20"
    html = requests.get(url).text
    soup = BeautifulSoup(html, "html.parser")
    # soup.find() returns None if the class is absent, so this assumes the page layout is unchanged
    div_tags = soup.find("div", {"class": "_detail_1 sum_one sum_one_v1 _tab_detail"})
    tr_tags = div_tags.find_all("tr")
    del tr_tags[0]  # drop the header row
    t = list(tr_tags[1].strings)
    # keep only non-newline strings (the original 'del temp' only unbound the
    # loop variable; filtering is what was intended)
    lists = [temp for temp in t if temp != '\n']
    twitter = Twitter()
    joined = ' '.join(lists)
    nouns = twitter.nouns(joined)
    pos = twitter.pos(joined)
    morph = twitter.morphs(joined)
    phrases = twitter.phrases(joined)
    count = Counter(nouns)
    print(lists)
    print(pos)
    print(morph)
    print(phrases)
    print(nouns)
    print(count)
class Twitter:
    def __init__(self, use_twitter_dictionary=True):
        self._base = KoNLPyTwitter()
        self._dictionary = CustomizedDictionary()
        self._loaded_twitter_default_dictionary = use_twitter_dictionary
        if use_twitter_dictionary:
            self._load_default_dictionary()
        self._customized_tagger = self._load_customized_tagger()
        self.tagset = tagset

    def _load_customized_tagger(self):
        templatespath = '%s/data/templates/twitter_templates0' % installpath
        templates = loadtxt(templatespath)
        templates = [tuple(template.split()) for template in templates]
        return SimpleTemplateTagger(templates, self._dictionary, SimpleSelector())

    def _load_default_dictionary(self):
        directory = '%s/data/twitter/' % installpath
        self._dictionary.add_dictionary(
            load_dictionary('%s/noun' % directory, ignore_a_syllable=True), 'Noun')

    def pos(self, phrase):
        eojeols = phrase.split()
        tagged = []
        for eojeol in eojeols:
            tagged0 = self._customized_tagger.pos(eojeol)
            if tagged0:
                tagged += tagged0
                continue
            tagged += self._base.pos(eojeol)
        return tagged

    def nouns(self, phrase):
        tagged = self.pos(phrase)
        return [w for w, t in tagged if t == 'Noun']

    def morphs(self, phrase, norm=False, stem=False):
        return [s for s, t in self.pos(phrase)]

    def phrases(self, phrase):
        # TODO
        return self._base.phrases(phrase)

    def add_dictionary(self, words, tag, force=False):
        if (not force) and (tag not in self.tagset):
            raise ValueError('%s is not available tag' % tag)
        self._dictionary.add_dictionary(words, tag)

    def load_dictionary(self, fname_list, tag):
        if tag not in self.tagset:
            raise ValueError('%s is not available tag' % tag)
        self._dictionary.load_dictionary(fname_list, tag)

    def set_selector(self, my_weight_dict, my_score_function):
        self._customized_tagger.set_selector(my_weight_dict, my_score_function)
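# A minimal usage sketch for the customized tagger above (hedged: it assumes
# the ckonlpy-style dependencies, i.e. KoNLPyTwitter, CustomizedDictionary,
# SimpleTemplateTagger and SimpleSelector, are importable in this module).
# Words registered through add_dictionary() take priority over the base
# analyzer whenever a whole eojeol matches.
if __name__ == '__main__':
    twitter = Twitter()
    twitter.add_dictionary('플라스크', 'Noun')  # register a custom noun
    print(twitter.pos('플라스크 공부'))  # '플라스크' now comes back tagged as Noun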
import io
import codecs


def extract_phrases_txt():
    twitter = Twitter()
    read_file = io.open('data_cleansed.txt', 'r', encoding='utf-8')
    write_file = codecs.open('phrases.txt', 'w', encoding='utf-8')
    for line in read_file:
        item = ' '.join(twitter.phrases(line))
        write_file.write(item + '\n')  # newline added so each input line maps to one output line
        # print(item)
    read_file.close()
    write_file.close()
class Twitter:
    def __init__(self, load_default_dictionary=False):
        self._base = KoNLPyTwitter()
        self._dictionary = CustomizedDictionary()
        if load_default_dictionary:
            self._load_default_dictionary()
        self._customized_tagger = self._load_customized_tagger()
        self.tagset = tagset

    def _load_customized_tagger(self):
        templatespath = '%s/data/templates/twitter_templates0' % installpath
        templates = loadtxt(templatespath)
        templates = [tuple(template.split()) for template in templates]
        selector = TwitterSelector()
        return SimpleTemplateTagger(templates, self._dictionary, selector)

    def _load_default_dictionary(self):
        josapath = '%s/data/twitter/josa.txt' % installpath
        modifierpath = '%s/data/twitter/modifier.txt' % installpath
        self._dictionary.add_dictionary(loadtxt(josapath), 'Josa')
        self._dictionary.add_dictionary(loadtxt(modifierpath), 'Modifier')

    def pos(self, phrase):
        eojeols = phrase.split()
        tagged = []
        for eojeol in eojeols:
            tagged0 = self._customized_tagger.pos(eojeol)
            if tagged0:
                tagged += tagged0
                continue
            tagged += self._base.pos(eojeol)
        return tagged

    def nouns(self, phrase):
        tagged = self.pos(phrase)
        return [w for w, t in tagged if t[0] == 'N']

    def morphs(self, phrase, norm=False, stem=False):
        return [s for s, t in self.pos(phrase)]

    def phrases(self, phrase):
        # TODO
        return self._base.phrases(phrase)

    def add_dictionary(self, words, tag, force=False):
        if (not force) and (tag not in self.tagset):
            raise ValueError('%s is not available tag' % tag)
        self._dictionary.add_dictionary(words, tag)

    def load_dictionary(self, fname_list, tag):
        if tag not in self.tagset:
            raise ValueError('%s is not available tag' % tag)
        self._dictionary.load_dictionary(fname_list, tag)
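# Note on this variant: nouns() keeps any tag that starts with 'N', so tags
# such as 'Number' pass the filter along with 'Noun'. A self-contained
# illustration with a hypothetical pos() output:
sample = [('3', 'Number'), ('시', 'Noun'), ('에', 'Josa')]
print([w for w, t in sample if t[0] == 'N'])  # ['3', '시']: Number slips through too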
import io
import csv


def extract_phrases_csv():
    print("\nNow extract phrases ... ")
    data = io.open('data_cleansed.txt', 'r', encoding='utf-8')
    words = []
    tw = Twitter()
    for item in data:
        words.extend(tw.phrases(item))  # accumulate phrases from every line
    print("\nNow save as csv file ... ")
    csvfile = 'phrases.csv'
    with open(csvfile, 'w', encoding='utf-8') as output:
        writer = csv.writer(output, lineterminator='\n')
        for val in words:
            writer.writerow([val])
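# Hedged follow-up: once extract_phrases_csv() has written the one-phrase-per-row
# file, the rows can be read back and ranked. rank_phrases is a hypothetical
# helper, not part of the original script.
from collections import Counter

def rank_phrases(path='phrases.csv', top=20):
    with open(path, encoding='utf-8') as f:
        counts = Counter(row[0] for row in csv.reader(f) if row)
    return counts.most_common(top)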
def parse_konlpy(self, text):
    from konlpy.tag import Kkma
    from konlpy.tag import Twitter

    kkma = Kkma()
    twitter = Twitter()

    sentence_list = kkma.sentences(text)

    parsing = []
    for sentence in sentence_list:
        parsed_sentence = {}
        parsed_sentence['text'] = sentence
        parsed_sentence['morp'] = kkma.pos(sentence)
        parsed_sentence['phrase'] = twitter.phrases(sentence)
        parsing.append(parsed_sentence)

    return parsing
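# Illustrative driver for parse_konlpy(); the class it belongs to is not shown
# in this snippet, so a minimal stand-in class is used here (hedged sketch).
class DemoParser:
    parse_konlpy = parse_konlpy  # reuse the module-level function above as a method

if __name__ == '__main__':
    for parsed in DemoParser().parse_konlpy('아버지가 방에 들어가신다. 날씨가 좋다.'):
        print(parsed['text'])
        print(parsed['morp'])
        print(parsed['phrase'])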
'''
t = Twitter()
text = t.pos("아버지가방에들어가신다", norm=True, stem=True)
text
t.pos("아버지가방에들어가신다", norm=False, stem=False)
# norm : normalizes colloquial spellings, e.g. "그래욕 ㅋㅋㅋ" -> 그래요
# stem : recovers the base (dictionary) form, e.g. 그렇다

>>> from konlpy.tag import Twitter
>>> twitter = Twitter()
>>> print(twitter.morphs(u'단독입찰보다 복수입찰의 경우'))
# ['단독', '입찰', '보다', '복수', '입찰', '의', '경우', '가']
>>> print(twitter.nouns(u'유일하게 항공기 체계 종합개발 경험을 갖고 있는 KAI는'))
# ['유일하', '항공기', '체계', '종합', '개발', '경험']
>>> print(twitter.phrases(u'날카로운 분석과 신뢰감 있는 진행으로'))
# ['분석', '분석과 신뢰감', '신뢰감', '분석과 신뢰감 있는 진행', '신뢰감 있는 진행', '진행', '신뢰']
>>> print(twitter.pos(u'이것도 되나욬ㅋㅋ'))
# [('이', 'Determiner'), ('것', 'Noun'), ('도', 'Josa'), ('되나욬', 'Noun'), ('ㅋㅋ', 'KoreanParticle')]
>>> print(twitter.pos(u'이것도 되나욬ㅋㅋ', norm=True))
# [('이', 'Determiner'), ('것', 'Noun'), ('도', 'Josa'), ('되', 'Verb'), ('나요', 'Eomi'), ('ㅋㅋ', 'KoreanParticle')]
>>> print(twitter.pos(u'이것도 되나욬ㅋㅋ', norm=True, stem=True))
# [('이', 'Determiner'), ('것', 'Noun'), ('도', 'Josa'), ('되다', 'Verb'), ('ㅋㅋ', 'KoreanParticle')]

## Kkma analyzer (http://kkma.snu.ac.kr/)
'''
''' heuristics, hidden Markov models, edit distance '''
from konlpy.tag import Kkma
k = Kkma()
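# A quick Kkma check mirroring the Twitter doctests above (requires a JVM,
# as with all KoNLPy taggers): sentences() splits the text, pos() tags it.
print(k.sentences('아버지가 방에 들어가신다. 날씨가 좋다.'))
print(k.pos('아버지가 방에 들어가신다'))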
from collections import Counter


def start(ays_type, sentences, wf, logger):
    analysis_type = ays_type  ## noun: noun analysis, phrase: semantic-unit analysis, pos: POS tagging
    twitter = Twitter()  ## Korean morphological analyzer originally built at Twitter

    ## analysis 1 : noun analysis
    if analysis_type == 'noun':
        cnt = 1
        nounsDic = {}
        for sentence in sentences:
            nouns = twitter.nouns(sentence)  ## extract nouns only
            for noun in nouns:
                if noun in nounsDic:  ## already seen: increment
                    now_cnt = nounsDic[noun]
                    nounsDic[noun] = now_cnt + 1
                else:  ## first occurrence: add
                    nounsDic[noun] = 1
            print('sentence(' + str(cnt) + ') analyzing......')
            cnt = cnt + 1
        nounsCounter = Counter(nounsDic).most_common()  # sorted descending; returns [(key, value), ...]
        # print(nounsCounter)
        for key, value in nounsCounter:
            # print(key + ' // ' + str(value))
            if value >= 10:
                wf.write(key + '\t' + str(value) + '\n')
        # for key in nounsDic.keys():
        #     wf.write(key + '\t' + str(nounsDic[key]) + '\n')
        print('sentences analysis is finished....')

    ## analysis 2 : semantic-unit (phrase) analysis
    if analysis_type == 'phrase':
        phrasesDic = {}
        cnt = 1
        for sentence in sentences:
            phrases = twitter.phrases(sentence)  ## extract semantic units
            for phrase in phrases:
                if phrase in phrasesDic:  ## already seen: increment
                    now_cnt = phrasesDic[phrase]
                    phrasesDic[phrase] = now_cnt + 1
                else:  ## first occurrence: add
                    phrasesDic[phrase] = 1
            print('sentence(' + str(cnt) + ') analyzing......')
            cnt = cnt + 1
        phrasesCounter = Counter(phrasesDic).most_common()  # sorted descending; returns [(key, value), ...]
        for key, value in phrasesCounter:
            # print(key + ' // ' + str(value))
            if value >= 10:
                wf.write(key + '\t' + str(value) + '\n')
        # for key in phrasesDic.keys():
        #     wf.write(key + '\t' + str(phrasesDic[key]) + '\n')
        print('sentences analysis is finished....')

    ## analysis 3 : POS-tagging analysis
    if analysis_type == 'pos':
        posesDic = {}
        cnt = 1
        for sentence in sentences:
            poses = twitter.pos(sentence, norm=True, stem=True)  ## POS tagging
            for pos in poses:
                if pos in posesDic:  ## already seen: increment
                    now_cnt = posesDic[pos]
                    posesDic[pos] = now_cnt + 1
                else:  ## first occurrence: add
                    posesDic[pos] = 1
            print('sentence(' + str(cnt) + ') analyzing......')
            cnt = cnt + 1
        posesCounter = Counter(posesDic).most_common()  # sorted descending; returns [(key, value), ...]
        for key, value in posesCounter:
            noun, tagging = key
            # print(noun + ' // ' + tagging + ' // ' + str(value))
            if value >= 10:
                wf.write(noun + '\t' + tagging + '\t' + str(value) + '\n')
        # for key in posesDic.keys():
        #     noun, tagging = key
        #     wf.write(noun + '\t' + tagging + '\t' + str(posesDic[key]) + '\n')
        print('sentences analysis is finished....')
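# Minimal driver for start(), assuming a cleansed one-sentence-per-line input;
# 'input.txt' and 'result.txt' are placeholder names, and the logger argument
# is accepted but unused by start() itself.
import codecs

if __name__ == '__main__':
    with codecs.open('input.txt', encoding='utf-8') as rf:
        sentences = [line.strip() for line in rf if line.strip()]
    wf = codecs.open('result.txt', encoding='utf-8', mode='w')
    start('noun', sentences, wf, logger=None)
    wf.close()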
class Twitter:
    def __init__(self, load_default_dictionary=True):
        self._base = KoNLPyTwitter()
        self._dictionary = CustomizedDictionary()
        if load_default_dictionary:
            self._load_default_dictionary()
        self._customized_tagger = self._load_customized_tagger()
        self.tagset = tagset
        self.add_custom_dictionary('?', 'Punctuation')

    def _load_customized_tagger(self):
        templatespath = '%s/data/templates/twitter_templates0' % installpath
        templates = loadtxt(templatespath)
        templates = [tuple(template.split()) for template in templates]
        return SimpleTemplateTagger(templates, self._dictionary, SimpleSelector())

    def _load_default_dictionary(self):
        directory = '%s/data/twitter/' % installpath
        self._dictionary.add_dictionary(load_dictionary('%s/josa' % directory), 'Josa')
        self._dictionary.add_dictionary(
            load_dictionary('%s/noun' % directory, ignore_a_syllable=True), 'Noun')
        self._dictionary.add_dictionary(load_dictionary('%s/adverb' % directory), 'Adverb')
        # self._dictionary.add_dictionary(load_dictionary(modifier_dir), 'Modifier')

    def pos(self, phrase):
        eojeols = phrase.split()
        # print(eojeols)  # debug output in the original
        tagged = []
        idx = 0
        while idx < len(eojeols):
            eojeol = eojeols[idx]
            tagged0 = self._customized_tagger.pos(eojeol)
            if tagged0:
                tagged += tagged0
                idx += 1
                continue
            # try a two-eojeol window so multi-word dictionary entries can match
            if idx < len(eojeols) - 1:
                tagged0 = self._customized_tagger.pos(eojeol + ' ' + eojeols[idx + 1])
                if tagged0:
                    tagged += tagged0
                    idx += 2  # consume both eojeols; the original for-loop tagged the second one twice
                    continue
            tagged += self._base.pos(eojeol)
            idx += 1
        return tagged

    def nouns(self, phrase):
        tagged = self.pos(phrase)
        return [w for w, t in tagged if t[0] == 'N']

    def morphs(self, phrase, norm=False, stem=False):
        return [s for s, t in self.pos(phrase)]

    def phrases(self, phrase):
        # TODO
        return self._base.phrases(phrase)

    def add_dictionary(self, words, tag, force=False):
        if (not force) and (tag not in self.tagset):
            raise ValueError('%s is not available tag' % tag)
        self._dictionary.add_dictionary(words, tag)

    def add_custom_dictionary(self, words, tag, force=True):
        if (not force) and (tag not in self.tagset):
            raise ValueError('%s is not available tag' % tag)
        self._dictionary.add_custom_dictionary(words, tag)

    def load_dictionary(self, fname_list, tag):
        if tag not in self.tagset:
            raise ValueError('%s is not available tag' % tag)
        self._dictionary.load_dictionary(fname_list, tag)

    def set_selector(self, my_weight_dict, my_score_function):
        self._customized_tagger.set_selector(my_weight_dict, my_score_function)
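# Hedged sketch of what the two-eojeol window in pos() is for: if the
# customized dictionary can hold a spaced multi-word entry (an assumption
# about CustomizedDictionary, whose implementation is not shown here), that
# entry can match across an eojeol boundary.
# twitter = Twitter()
# twitter.add_dictionary('서울 대학교', 'Noun')
# twitter.pos('서울 대학교 입학')  # the bigram branch can match '서울 대학교' as one unit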
# fragment of a larger analysis script; this picks up inside the noun branch
nounsCounter = Counter(nounsDic).most_common()  # sorted descending; returns [(key, value), ...]
for key, value in nounsCounter:
    # print(key + ' // ' + str(value))
    if value >= 10:
        wf.write(key + '\t' + str(value) + '\n')
print('sentences analysis is finished....')
wf.close()

## analysis 2 : semantic-unit (phrase) analysis
if analysis_type == 'phrase':  # the original tested 'pos' here, apparently a copy-paste slip: this branch extracts phrases
    phrasesDic = {}
    cnt = 1
    for sentence in sentences:
        phrases = twitter.phrases(sentence)  ## extract semantic units
        for phrase in phrases:
            if phrase in phrasesDic:  ## already seen: increment
                now_cnt = phrasesDic[phrase]
                phrasesDic[phrase] = now_cnt + 1
            else:  ## first occurrence: add
                phrasesDic[phrase] = 1
        print('sentence(' + str(cnt) + ') analyzing......')
        cnt = cnt + 1
    wf = codecs.open(
        'D:\\smba2_crawler\\result\\happytogether_talk_phrases_result.txt',
        encoding='utf-8', mode='w')
    phrasesCounter = Counter(phrasesDic).most_common()
from konlpy import jvm
from konlpy.tag import Twitter
from konlpy.tag import Kkma
from konlpy.tag import Hannanum
from konlpy.tag import Komoran

jvm.init_jvm()

######## use Twitter ########
twitter = Twitter()
input_text = "한국어를 처리하는 예시입니다"
normalizations = twitter.phrases(input_text)  # phrase extraction (despite the name, this is not normalization)
tokenizations = twitter.pos(input_text)  # POS tagging on the raw text
aftertokenization = twitter.pos(input_text, norm=True)  # POS tagging with normalization enabled

print(' ')
print('######## Twitter ########')
print('')
print('normalization : ')
print(normalizations)
print('')
print('tokenization : ')
print(tokenizations)
print('')
print('tokenization after normalization : ')
print(aftertokenization)
print('')
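# The Kkma/Hannanum/Komoran imports above are otherwise unused; a hedged
# sketch of the comparison they suggest (each KoNLPy tagger exposes the same
# pos() entry point):
for name, tagger in [('Kkma', Kkma()), ('Hannanum', Hannanum()), ('Komoran', Komoran())]:
    print('######## %s ########' % name)
    print(tagger.pos(input_text))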