def generate_wordCloud(text, font_path, extractNum=15):
    hannanum = Hannanum()
    setFont(font_path)
    ## mask image
    image_mask = np.array(Image.open("./utils/visualize/만세_보노.jpg"))
    cleanText = clean_text(text)
    words = hannanum.nouns(cleanText)
    word_list = flatten(words)
    word_list = pd.Series([x for x in word_list if len(x) > 1])
    # print(word_list.value_counts().head(20))
    stopwordList = ['’', '”', '‘', '·', '…', '"', "'"]
    wordcloud = WordCloud(font_path=font_path,
                          stopwords=stopwordList,
                          width=800, height=800,
                          mask=image_mask,
                          background_color='white')
    count = Counter(word_list)
    wordcloud = wordcloud.generate_from_frequencies(count)
    array = wordcloud.to_array()

    fig = plt.figure(figsize=(10, 10))
    plt.imshow(array, interpolation='bilinear')
    plt.axis("off")

    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    string = b64encode(buf.read())
    wcURI = 'data:image/png;base64,' + urllib.parse.quote(string)

    count = count.most_common(extractNum)
    barURI = generate_barchart(count)
    return wcURI, barURI, count
def test():
    setFont()
    hannanum = Hannanum()

    # DB connection
    # conn = oci.connect("test/[email protected]:32764/xe", charset='utf8')
    conn = oci.connect('test', '1234', '192.168.0.52:32764/xe', encoding='utf-8')
    df = pd.read_sql('select * from article_sample', conn)
    sample1 = df['ARTICLE_CONTENT'][0].read()

    word = hannanum.nouns(sample1)
    word_list = flatten(word)
    word_list = pd.Series([x for x in word_list if len(x) > 1])
    print(word_list.value_counts().head(20))

    stopwordList = ''
    wordcloud = WordCloud(font_path=setFontPath(),
                          stopwords=stopwordList,
                          width=800, height=800,
                          background_color='white')
    count = Counter(word_list)
    wordcloud = wordcloud.generate_from_frequencies(count)
    array = wordcloud.to_array()

    fig = plt.figure(figsize=(10, 10))
    plt.imshow(array, interpolation='bilinear')
    plt.axis("off")
    plt.show()
    # plt.savefig('C:/Users/admin/Documents/IMG04.png', bbox_inches='tight')
def konlpyHannanum(inputSentence: str, sentenceList: list) -> dict:
    han = Hannanum()
    sentenceDict = dict()
    inputPos = han.pos(inputSentence)
    inputPosCount = Counter(inputPos)
    inputLen = len(inputPosCount)
    for line in sentenceList:
        if line == '':
            continue
        sentencePos = han.pos(line)
        sentencePosCount = Counter(sentencePos)
        sentenceLen = len(sentencePosCount)
        # Count (word, tag) pairs shared with the input, capped by the smaller frequency,
        # then normalize by the smaller of the two distinct-morpheme counts.
        common = 0
        for morpheme in inputPosCount:
            if morpheme in sentencePosCount:
                common += min(inputPosCount[morpheme], sentencePosCount[morpheme])
        if sentenceLen >= inputLen:
            similarity = 100 * common / inputLen
        else:
            similarity = 100 * common / sentenceLen
        sentenceDict[line] = similarity
    return sentenceDict
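# Usage sketch (not part of the original code): a minimal, hedged demo of konlpyHannanum.
# It assumes konlpy's Hannanum and collections.Counter are importable in this module and that
# a JVM is available; the sample sentences below are made-up inputs.
if __name__ == "__main__":
    _scores = konlpyHannanum("오늘 날씨가 맑다", ["오늘 날씨가 아주 맑다", "내일은 비가 온다", ""])
    for _sentence, _similarity in sorted(_scores.items(), key=lambda kv: kv[1], reverse=True):
        print("%5.1f%%  %s" % (_similarity, _sentence))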
def text_preprocessing_after(lists):
    hannanum = Hannanum()
    getNum = 5
    stopword = ['등', '코', '만', '속보', '최초', '4억', '월요일']
    cleaning = lambda x: hannanum.nouns(wordcloud01.clean_text(x))
    nouns_list = list(map(cleaning, lists))
    # print(nouns_list)
    texts = [value for nouns in nouns_list for value in nouns]
    total_counter = Counter(texts)
    for word in stopword:
        del total_counter[word]
    result = total_counter.most_common(getNum)
    return result


## Noun frequency extraction.
##################################################
# def nouns_frequency(text):
#     print('creating Kkma object')
#     hannanum = Kkma()
#     print('cleaning text')
#     clean_text = wordcloud01.clean_text(text)
#     print('extracting nouns')
#     words = hannanum.nouns(clean_text)
#     print('flattening')
#     word_list = wordcloud01.flatten(words)
#     print('converting to pandas Series')
#     word_list = pd.Series([x for x in word_list if len(x) > 1])
#     print('building result Counter')
#     result = Counter(word_list)
#     return result
def crawl(self, food_detail_df):
    print('[Recipe Web Crawling Start]')
    for index in range(len(food_detail_df)):
        food = food_detail_df.foodName[index]
        recipe = self.recipe_finder(food, 2)
        food_detail_df.loc[index, 'foodRecipe'] = str(recipe)
        if (index + 1) % 5 == 0:
            print(round((index + 1) / len(food_detail_df) * 100, 2), 'percent Done')
    print('Complete!!')
    print('')

    print('[noun extract start]')
    food_detail_df['foodRecipeNoun'] = ''
    for i in range(len(food_detail_df)):
        doc = food_detail_df.foodRecipe[i]
        noun = Hannanum().nouns(doc)
        cnt = Counter(noun)
        only_word = []
        # drop nouns that appear fewer than 3 times
        for key, value in cnt.items():
            if int(value) < 3:
                noun.remove(key)
        for word in noun:
            m = re.match(r'^\D*\D$', word)
            if m:
                only_word.append(m.group())
        food_detail_df.loc[i, 'foodRecipeNoun'] = str(only_word)
        if (i % 5) == 0:
            print(round(i / len(food_detail_df) * 100, 2), ' percent done')
    print('Complete')
def lineAnalyzer(sentence, analyzeType):
    hannanum = Hannanum()
    wordList = list()
    if analyzeType == 1:  # Nouns
        wordList = hannanum.nouns(str(sentence))
    elif analyzeType == 2:  # Morphemes
        wordList = hannanum.morphs(str(sentence))
    elif analyzeType == 3:  # Bi-grams
        bigram_measures = collocations.BigramAssocMeasures()
        pos = hannanum.pos(str(sentence))
        words = [s for s, t in pos]
        finder = collocations.BigramCollocationFinder.from_words(words)
        finder.apply_word_filter(lambda w: len(w) < 2)
        finder.apply_freq_filter(3)
        wordList = finder.nbest(bigram_measures.pmi, 10)
    elif analyzeType == 4:  # Tri-grams
        trigram_measures = collocations.TrigramAssocMeasures()
        pos = hannanum.pos(str(sentence))
        words = [s for s, t in pos]
        finder = collocations.TrigramCollocationFinder.from_words(words)
        finder.apply_word_filter(lambda w: len(w) < 2)
        finder.apply_freq_filter(3)
        wordList = finder.nbest(trigram_measures.pmi, 10)
    else:
        print("Unsupported analyzeType:", analyzeType)
    return wordList
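# Usage sketch (not part of the original code): a hedged demo of the two single-token modes.
# Modes 3 and 4 additionally require nltk's collocations module and a much longer input text,
# since bigrams/trigrams are filtered by a minimum frequency of 3. The sample sentence is made up.
if __name__ == "__main__":
    _sentence = "한국어 형태소 분석은 재미있다"
    print(lineAnalyzer(_sentence, 1))  # nouns
    print(lineAnalyzer(_sentence, 2))  # morphemes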
def getSentenceByWord():
    if request.method == 'POST':
        hannanum = Hannanum()
        word_data = hannanum.pos(request.form['wordData'])[0][0]
        # print(word_data)
        sentence_dict = {"sentenceId": 0, "sentenceData": "", "standard": ""}
        sentence_id_list = []
        sentence_list = []
        for wd in db_session.query(Word).order_by(Word.wordId).filter(Word.wordData == word_data):
            sentence_id_list.append(wd.sentenceId)
        # print(sentence_id_list)
        for sid in sentence_id_list:
            sentence = db_session.query(Sentence).filter(Sentence.sentenceId == sid).first()
            sentence_dict["sentenceId"] = sentence.sentenceId
            sentence_dict["sentenceData"] = sentence.sentenceData
            sentence_dict["standard"] = sentence.standard
            sentence_list.append(sentence_dict.copy())
        return json.dumps(sentence_list, ensure_ascii=False)
def get_tags(text, ntags=50, multiplier=10):
    h = Hannanum()
    nouns = h.nouns(text)
    count = Counter(nouns)
    print(count)
    return [{'color': color(), 'tag': n, 'size': c * multiplier}
            for n, c in count.most_common(ntags)]
def test():
    rss_list = [
        # "https://www.reddit.com/",
        "http://www.chosun.com/site/data/rss/politics.xml",
        "http://rss.joins.com/joins_politics_list.xml",
    ]
    hannanum = Hannanum()
    # mecab = Mecab()
    for rss_link in rss_list:
        print("Start get_URLs and read files from : " + rss_link)
        start_time = time.time()
        links = get_URLs(rss_link)
        for link in links:
            parse_time = time.time()
            article = get_article(link)
            file = open("./test/%s.txt" % (article.title), 'w', encoding="utf8")
            nouns = hannanum.nouns(article.text)
            # nouns = mecab.nouns(article.text)
            for noun in nouns:
                file.write("%s\n" % noun)
            file.close()
            parse_time = time.time() - parse_time
            print("parse files from %s: %f" % (link, parse_time))
        start_time = time.time() - start_time
        print("Process time : %f" % (start_time))
def generate_summary(self, file_name, index, top_n=5):
    stop_words = read_data(filename='korean_stopwords_list.txt')
    summarize_text = []

    # Step 1 - Read the text and split it into sentences
    sentences = self.read_article(file_name, index)

    # additional tokenization: keep only the nouns of each sentence
    hannanum = Hannanum()
    temp = []
    for sentence in sentences:
        temp.append(hannanum.nouns(' '.join(sentence)))
    # print("temp:", temp)

    # Step 2 - Generate similarity matrix across sentences
    sentence_similarity_martix = self.build_similarity_matrix(temp, stop_words)

    # Step 3 - Rank sentences in the similarity matrix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort the ranks and pick the top sentences
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    for i in range(top_n):
        summarize_text.append(" ".join(ranked_sentence[i][1]))

    # Step 5 - Output the summarized text
    print("\nSummarize Text: \n", ". ".join(summarize_text))
def get_string(path):
    f = open(path, "r", encoding="utf-8")
    sample = f.read()
    f.close()
    h = Hannanum()
    list_nouns = h.nouns(sample)       # get list of nouns from sample
    return listToString(list_nouns)    # get string of list_nouns
def make_corpus(self):
    processor = Hannanum()
    with open('outputs/sermon-{}-corpus.txt'.format(self.name), 'w') as fout:
        data_dir = 'data/{}'.format(self.name)
        for filename in os.listdir(data_dir):
            if not filename.endswith('txt'):
                continue
            path = os.path.join(data_dir, filename)
            with open(path) as fin:
                print(path)
                for line in fin:
                    _line = self.clean_punctuation(line)
                    if not _line:
                        continue
                    _lines = _line.split('.')
                    for l in _lines:
                        _l = self.clean_punctuation(l)
                        if not _l:
                            continue
                        sentence = [
                            '{}/{}'.format(word, tag)
                            for word, tag in processor.pos(_l)
                            if self.filter_tag(tag) and self.filter_word(word)
                        ]
                        if len(sentence) > 2:
                            fout.write(' '.join(sentence) + '\n')
def tag_article():
    """
    Schedule: daily
    Task: tag each post based on its nouns.
    """
    from konlpy.tag import Hannanum
    hannanum = Hannanum()
    for article in Article.objects.all():
        try:
            tags = [tag for tag in hannanum.nouns(article.title) if len(tag) > 1]
            for tag in tags[:10]:
                splits = re.split(r',', tag)
                tags.remove(tag)
                if len(splits) > 1:
                    for split in splits:
                        tags.append(split.strip(SPLIT_CHAR))
                else:
                    tags.append(tag.strip(SPLIT_CHAR))
            article.do_tag(tags)
        except Exception as e:
            print(hannanum.nouns(article.title))
            print(e)
def hannanum(sentence):
    h = Hannanum()
    if jpype.isJVMStarted():
        jpype.attachThreadToJVM()
    return h.analyze(sentence)
def NLP(self, food_detail_df):
    print('[noun extract start]')
    food_detail_df['foodRecipeNoun'] = ''
    for i in range(len(food_detail_df)):
        doc = food_detail_df.foodRecipe[i]
        noun = Hannanum().nouns(doc)
        # strip laughter tokens and escaped newlines from each noun
        noun = [word.replace('ㅎ', '').replace('ㅋ', '').replace('ㅜㅜ', '')
                    .replace('ㅠㅠ', '').replace('\\n', '')
                for word in noun]
        cnt = Counter(noun)
        only_word = []
        for key, value in cnt.items():
            # if (len(key) < 2) | (len(key) > 6):
            #     noun.remove(key)
            if int(value) < 3:
                noun.remove(key)
        for word in noun:
            m = re.match(r'^\D*\D$', word)
            if m:
                only_word.append(m.group())
        food_detail_df.loc[i, 'foodRecipeNoun'] = str(only_word)
        if (i % 5) == 0:
            print(round(i / len(food_detail_df) * 100, 2), ' percent done')
    print('Complete')
def __init__(self):
    self.komoran = Komoran()
    self.kkma = Kkma()
    self.hann = Hannanum()
    self.mecab = Mecab()
    self.twitter = Twitter()
    self.okt = Okt()
def get_derived_query(keyword):
    """
    Retrieve derived queries from keyword by using WordNet Synset
    """
    # Google Translate API
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "My Project-a8e42c74ea7e.json"
    translate_client = translate.Client()
    # Hannanum POS tagger
    hannanum = Hannanum()

    nouns = [word for word, pos in hannanum.pos(keyword) if pos == "N"]
    syn_dict = {}
    query_list = []
    for noun in nouns:
        result = translate_client.translate(noun, target_language="en")
        if len(result["translatedText"].split(" ")) > 1:
            # compound nouns are not handled
            continue
        else:
            translated_noun = result["translatedText"]
            # print(noun, translated_noun)
            # collect every noun-sense lemma name for this noun
            synonyms = []
            for syn in wordnet.synsets(translated_noun):
                if syn.pos() == "n":
                    syn_word = syn.name().split(".")[0]
                    synonyms.append(syn_word)
            syn_dict[noun] = synonyms
    if len(syn_dict) > 0:
        for noun in syn_dict:
            for syn in syn_dict[noun]:
                syn_ko = translate_client.translate(syn, target_language="ko")["translatedText"]
                query_list.append(keyword.replace(noun, syn_ko))
    return list(np.unique(query_list))
def max_similarity(self):
    konlpy = Hannanum()
    l = konlpy.nouns(self.lyrics_input)
    song_list = self.song_list
    song_id = 0
    max_similarity = 0.0
    result = self.compare_lyrics()
    if result > 0:
        return result
    print("word list of the input lyrics: ", l)
    for song in song_list:
        if song['words'] is None:
            song['words'] = konlpy.nouns(song['lyrics'])
        print("song_id, title: ", song['song_id'], song['title'])
        temp = self.measure_similarity(l, song['words'])
        print("cosine similarity: ", temp)
        print()
        if temp > max_similarity:
            song_id = song['song_id']
            # uncomment if the title should also be returned
            # title = song['title']
            max_similarity = temp
    # uncomment if the title should also be returned
    return song_id  # , title
def preprocess_npm(document):
    '''Extract Korean substantives (N), predicates (P), and modifiers (M) and
    return them as a list of words per sentence.'''
    sentences = re.split(r'\.\s+', document.strip())
    results = []
    for sent in sentences:
        try:
            # remove characters that are neither Korean nor English
            letters_only = re.sub('[^ㄱ-힣a-zA-Z]', ' ', sent)
            # load the morphological analyzer and split into tagged words
            hannanum = Hannanum()
            morp_words = hannanum.pos(letters_only)
            # keep only selected POS classes; predicates get '다' appended to form the base form
            morph_words = []
            for w in morp_words:
                if w[1] in ['N', 'M']:  # N: substantive, P: predicate, M: modifier
                    morph_words.append(w[0])
                elif w[1] == 'P':
                    morph_words.append(w[0] + "다")
                else:
                    pass
            # apply the stopword list to remove unwanted words
            stopwords = [
                '특파원', '기자', '단독', '앵커', '취재', '특종', '신문', '방송', '보도', '외신', '뉴스'
            ]  # add more stopwords as needed
            meaningful_words = [w for w in morph_words if w not in stopwords]
            # keep only words of two or more syllables
            meaningful_words2 = [w for w in meaningful_words if len(w) > 1]
            results.append(meaningful_words2)
        except:
            results.append([''])
    return results
def divide_with_morpheme(raw_data, total=1):
    data = []
    hannanum = Hannanum()
    if total == 1:
        for itr, article in enumerate(raw_data):
            for sentence in article:
                # print(sentence)
                if sentence != '':
                    pos_result = hannanum.morphs(sentence)
                    tmp = " ".join(pos_result)
                    data.append(tmp)
            print(str(itr) + 'th article processed')
            print('last sentence : ' + tmp)
        return data
    elif total == 0:
        for itr, article in enumerate(raw_data):
            tmp_data = []
            for sentence in article:
                # print(sentence)
                if sentence != '':
                    pos_result = hannanum.morphs(sentence)
                    tmp = " ".join(pos_result)
                    tmp_data.append(tmp)
            print(str(itr) + 'th article processed')
            print('last sentence : ' + tmp)
            data.append(tmp_data)
        return data
def get_tags(text, ntags=50, multiplier=10):
    h = Hannanum()
    nouns = h.nouns(text)
    count = Counter(nouns)
    # for word, cnt in count.most_common(ntags):
    #     print(word, cnt)
    return count
def review_preprocessing(data):
    # Hannanum package
    pos_tagger = Hannanum()
    # after tokenizing the text, extract only the nouns
    pos_nouns = pos_tagger.nouns(data)
    return ' '.join(pos_nouns)
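# Usage sketch (not part of the original code): a hedged demo with a made-up review string;
# the result is a single space-joined string of the extracted nouns.
if __name__ == "__main__":
    print(review_preprocessing("배송이 빠르고 제품 품질도 좋습니다"))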
def text_mining(title_list, ntags=50, multiplier=1):
    h = Hannanum()
    data_nouns = []
    for title in title_list:
        data_nouns.extend(h.nouns(title))
    count = Counter(data_nouns)
    return [{'color': color(), 'tag': n, 'size': int(c * multiplier * 0.5)}
            for n, c in count.most_common(ntags)]
def extract_pos(text):
    h = Hannanum()
    pos = h.pos(text, ntags=22, flatten=True)
    pos_list = [item for item in pos
                if item[1] == 'NC' or item[1] == 'NQ' or item[1] == 'NN'
                or item[1] == 'PV' or item[1] == 'PA']
    dct = dict(pos_list)
    for stopword in stopwords.itertuples():
        # stopword check
        if dct.get(stopword._1):
            del dct[stopword._1]
    split_pos = "|".join("%s,%s" % tup for tup in dct.items())
    return split_pos
def comment_freq(youtube_data):
    # youtuber_csv_data = dm.GetData(url, con)
    # if youtuber_csv_data == None:
    #     print("no data")
    #     return None
    # video_num = int(input("which video number should be analyzed? "))
    # youtube_data = dm.GetData(youtuber_csv_data[video_num][0], password)  >> implemented in main.py
    if youtube_data is None:
        return None

    comment = []
    for i in range(len(youtube_data)):
        comment.append(youtube_data[i][2])

    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+",
        flags=re.UNICODE)
    han = re.compile(r'[ㄱ-ㅎㅏ-ㅣ!?~,".\n\r#\ufeff\u200d]')

    comment_noemot = []
    for i in comment:
        tokens = re.sub(emoji_pattern, "", i)
        tokens = re.sub(han, "", tokens)
        comment_noemot.append(tokens)

    nouns = []
    h = Hannanum()
    for i in comment_noemot:
        n = h.nouns(i)
        nouns.append(n)

    noun_list = []
    for i in range(len(nouns)):
        for j in range(len(nouns[i])):
            noun_list.append(nouns[i][j])

    counts = Counter(noun_list)
    tags = counts.most_common(30)

    wc = WordCloud(font_path='C:\\Windows\\Fonts\\gulim.ttc',
                   background_color='black', width=800, height=600)
    cloud = wc.generate_from_frequencies(dict(tags))
    plt.figure(figsize=(10, 8))
    plt.axis('off')
    plt.imshow(cloud)
    plt.show()
def sentiment_analysis(tweet, tweets):
    # 1) split into morphemes
    content = tweet[1]
    hannanum = Hannanum()
    content_morphs = hannanum.morphs(content)
    # print("morphemes: ", content_morphs)

    # 1)-2 split on whitespace
    space = content.split(" ")
    print(space)

    # 2) remove stopwords
    # 2)-1 load the stopword file; stopwords => stopword list
    stopwords_file = open("불용어.txt", 'r', encoding='utf-8')
    stopwords = []
    lines = stopwords_file.readlines()
    for line in lines:
        line = line.replace("\n", "")
        stopwords.append(line)
    # 2)-2 drop stopwords (filter into a new list instead of removing while iterating)
    content_morphs = [m for m in content_morphs if m not in stopwords]
    # print("after stopword removal: ", content_morphs)

    # 3) compute polarity per morpheme
    # data: sentiment dictionary
    with open('data/SentiWord_info.json', encoding='utf-8-sig', mode='r') as f:
        data = json.load(f)
    score = 0
    for wordname in space:
        for i in range(0, len(data)):
            # compare against both the word root and the word itself
            if (data[i]['word_root'] == wordname) or (data[i]['word'] == wordname):
                if data[i]['polarity'] != "None":
                    score += int(data[i]['polarity'])
                break

    if score > 0:
        polarity = "positive"
    elif score == 0:
        polarity = "neutral"
    else:
        polarity = "negative"

    tweet[4] = polarity
    # hashtag_sentiment_analysis(tweet, tweets)
    tweets.append(tweet)
    print("content: ", content)
    print("polarity: ", polarity)
def __init__(self, tech_words, double_words, triple_words, syns_words):
    self.puctuation = re.compile('[!"$%&\'()*,-/:;<=>?@[\\]^_`{|}~]')
    self.hannanum = Hannanum()
    with open(tech_words, 'rb') as f:
        self.tech_words = pickle.load(f)
    with open(double_words, 'rb') as f:
        self.double_words = pickle.load(f)
    with open(triple_words, 'rb') as f:
        self.triple_words = pickle.load(f)
    with open(syns_words, 'rb') as f:
        self.syns_words = pickle.load(f)
def __init__(self):
    self.parser = reqparse.RequestParser()
    print("LoadSrcFile init")
    self.parser.add_argument("group_path", type=str, location="json")
    self.token_manager = TokenManager.instance()
    print("self.parser.parse_args() : ", self.parser.parse_args())
    self.group_path = self.parser.parse_args()["group_path"]
    self.t = Okt()
    self.ha = Hannanum()
    super(LoadSrcFile, self).__init__()
def tokenization(cleaned_docs):
    han = Hannanum()
    tokenized_docs = []
    while ' ' in cleaned_docs:
        cleaned_docs.remove(' ')
    for doc in cleaned_docs:
        nouns_in_doc = []
        for noun in han.nouns(doc):
            if len(noun) > 1:
                nouns_in_doc.append(noun)
        tokenized_docs.append(nouns_in_doc)
    return tokenized_docs
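# Usage sketch (not part of the original code): a hedged demo with made-up documents;
# blank entries are dropped and only nouns longer than one character are kept.
if __name__ == "__main__":
    _docs = ["서울의 날씨가 오늘은 맑다", " ", "내일은 전국에 비가 내린다"]
    print(tokenization(_docs))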
def update_words_all():
    hannanum = Hannanum()
    db = db_connector.DbConnector()
    song_list = db.select_all()
    for song in song_list:
        if song['lyrics'] is not None and song['words'] is None:
            words = hannanum.nouns(song['lyrics'])
            words = sorted(set(words))
            update_words(song['song_id'], ' '.join(words))
    print('Words extraction done!')
def wordAnalysis(text):
    myHannanum = Hannanum()
    print("text : " + text)
    replace_text = re.sub("[!@#$%^&*()_+]", " ", text)
    print("replace_text : " + replace_text)
    analysis_text = " ".join(myHannanum.nouns(replace_text))
    return analysis_text
def reduceToWords(self):
    hannanum = Hannanum()
    words = ''
    # for word in hannanum.nouns(unicode(texts, 'UTF-8')):
    if self.result != '':
        for word in hannanum.nouns(self.result):
            word = re.sub("[(*&]", "", word)
            if len(word) > 1:
                words = word + '\n' + words
        self.result = words
        print(words)
    return self
def __init__(self): """Initialize unigram class this method initialize all attributes of this class """ self.compare_set = [] self.hannanum = Hannanum() self.ke_object = KE() self.set = ['pure', 'pure_number', 'pure_punctuation', 'pure_number_punctuation']
def hannanum_analyze(content):
    dictionary = {}
    h = Hannanum()
    words = h.pos(content)
    for word_tag in words:
        value = word_tag[1]
        if value == "N" or value == "P" or value == "M":
            key = word_tag[0]
            # for predicate stems such as '먹', append '다' so the key becomes a full verb form like '먹다'
            if value == "P":
                key += u"다"
            if key not in dictionary.keys():
                dictionary[key] = 1
            else:
                dictionary[key] += 1
    return OrderedDict(sorted(dictionary.items(), key=itemgetter(1), reverse=True))
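# Usage sketch (not part of the original code): a hedged demo, assuming Hannanum, OrderedDict and
# itemgetter are imported at module level as hannanum_analyze expects. The result maps each kept
# token (nouns, predicates with '다' appended, modifiers) to its frequency, most frequent first.
if __name__ == "__main__":
    for _word, _count in hannanum_analyze("나는 밥을 먹고 물을 마신다").items():
        print(_word, _count)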
def __init__(self, on_han=False, on_twitter=False, on_mecab=False):  # maybe move to init of analysis_app
    """
    Allocate kkma or twitter diction instance
    :param on_han: han instance
    :param on_twitter: twitter instance
    :param on_mecab: mecab instance
    """
    if on_han is True:
        self.han = Hannanum()
    if on_twitter is True:
        self.twitter = Twitter()
def hannanum_analyze_22():
    h = Hannanum()
    tags = {
        'NC': '보통명사',
        'NQ': '고유명사',
    }
    """
    pos_dics = {
        news.id: analysis result dictionary,
        news.id: analysis result dictionary,
        ...
    }
    """
    pos_dics = {}
    news = getTodayNews()
    for n in news:
        content = remove_puc_marks(n.content)  # strip punctuation marks
        words_dic = h.pos(content, 22)         # morphological analysis with the 22-tag set
        dictionary = {}
        for t in words_dic:
            word = t[0]
            key = t[1]
            if key in tags.keys():
                if word not in dictionary.keys():
                    dictionary[word] = 1
                else:
                    dictionary[word] += 1
        dictionary = remove_stopwords(dictionary)  # remove stopwords
        pos_dics[n] = dictionary
    print("tf-idf")
    analyzed_dics = tf_idf_map(pos_dics)  # compute tf-idf
    return analyzed_dics
def hannanum_analyze_22_key(content):
    dictionary = {}
    h = Hannanum()
    tags = {
        'NC': '보통명사',
        'NQ': '고유명사',
        # 'NB': '의존명사',
        # 'NN': '수사',
        # 'NP': '대명사',
        # 'PV': '동사',
        # 'PA': '형용사',
        # 'PX': '보조 용언',
        # 'MM': '관형사',
        # 'MA': '부사',
    }
    words = h.pos(content, 22)
    for t in words:
        key = t[0]
        value = t[1]
        if value in tags.keys():
            # for predicate stems such as '먹', append '다' so the key becomes a full verb form like '먹다'
            if value.startswith("P"):
                key += u"다"
            key = key + "[" + value + "]"
            if key not in dictionary.keys():
                dictionary[key] = 1
            else:
                dictionary[key] += 1
            # print key + " " + value
    dictionary = remove_stopwords(dictionary)  # remove stopwords
    dictionary = OrderedDict(sorted(dictionary.items(), key=itemgetter(1), reverse=True))
    return dictionary
def get_tags(self, text, ntags=10, multiplier=10):
    h = Hannanum()
    nouns = h.nouns(text)
    count = Counter(nouns)
    return [{'tag': n, 'size': c * multiplier}
            for n, c in count.most_common(ntags)]
def analysis(self, blog_review_url):
    # self.logger.info(blog_review_url)
    analysis_checker = {}
    try:
        r = requests.get(blog_review_url)
    except requests.ConnectionError as e:
        return
    soup = BeautifulSoup(r.text)
    r.close()
    try:
        blog_review_url = soup.select('#screenFrame')[0]['src']
        # self.logger.info("regenerated:" + blog_review_url)
        r = requests.get(blog_review_url)
        soup = BeautifulSoup(r.text)
        r.close()
    except Exception as e:
        pass
    try:
        real_blog_review_url = "http://blog.naver.com" + soup.select('frame#mainFrame')[0]['src']
    except IndexError as e:
        self.skip_count += 1
        return
    r = requests.get(real_blog_review_url)
    soup = BeautifulSoup(r.text)
    r.close()

    post_view = soup.select('.post-view')[0]
    p_list = post_view.select('p')
    raw_str_list = []
    for item in p_list:
        p_str = str(item.text.encode('utf-8')).replace('\xc2\xa0', ' ').replace('\xe2\x80\x8b', ' ').strip()
        p_str = p_str.replace('ㅎ', '').replace('ㅋ', '')
        if len(p_str) != 0:
            raw_str_list.append(p_str.decode('utf-8'))

    kkma = Hannanum()
    for raw_str_item in raw_str_list:
        if len(raw_str_item) >= 100:
            self.skip_count += 1
            continue
        try:
            raw_str_item = raw_str_item.strip()
            pos_tuple = kkma.pos(raw_str_item)
            for pos_tuple_item in pos_tuple:
                item = pos_tuple_item[0]
                item_type = pos_tuple_item[1]
                if item not in analysis_checker and (
                        item_type.startswith('N') or item_type.startswith('V') or
                        item_type.startswith('M') or item_type.startswith('XR') or
                        item_type.startswith('U')):
                    if item in self.analysis_result:
                        analysis_item_count = self.analysis_result.get(item) + 1
                    else:
                        analysis_item_count = 1
                    self.analysis_result.update({item: analysis_item_count})
                    analysis_checker.update({item: 1})
        except jpype.JavaException as exception:
            pass
def get_tags(text, ntags=int(sys.argv[2]), multiplier=10):
    h = Hannanum()
    nouns = h.nouns(text)
    count = Counter(nouns)
    return [{'color': color(), 'tag': n, 'size': c * multiplier}
            for n, c in count.most_common(ntags)]
def get_tags(text, ntags=50, multiplier=10):
    h = Hannanum()
    nouns = h.nouns(text)
    count = Counter(nouns)
    return [{"color": color(), "tag": n, "size": c * multiplier}
            for n, c in count.most_common(ntags)]
def WordCount(corpus):
    h = Hannanum()
    nouns = h.nouns(corpus)
    frequency = Counter(nouns)
    return frequency
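# Usage sketch (not part of the original code): a hedged demo, assuming Hannanum and Counter are
# imported at module level as WordCount expects; the sample sentence is made up.
if __name__ == "__main__":
    print(WordCount("자연어 처리는 한국어 텍스트 분석에 널리 쓰인다").most_common(5))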
class AnalysisDiction:
    """
    This class is for analysis of Korean texts using kkma and twitter dictionaries
    """

    def __init__(self, on_han=False, on_twitter=False, on_mecab=False):  # maybe move to init of analysis_app
        """
        Allocate kkma or twitter diction instance
        :param on_han: han instance
        :param on_twitter: twitter instance
        :param on_mecab: mecab instance
        """
        if on_han is True:
            self.han = Hannanum()
        if on_twitter is True:
            self.twitter = Twitter()
        # if on_mecab is True:
        #     self.mecab = Mecab()

    def analyzer_hannaum(self, string_data, mode):
        """
        This method is for hannanum. It acts differently depending on its mode.
        :param string_data: string data for analysis
        :param mode: analyze string data depending on its mode
        :return: the analysis result; returns False if the mode is unknown
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._hannanum
        """
        # compare mode strings with '==' rather than 'is', which only checks object identity
        if mode == 'morphs':
            return self.han.morphs(string_data)
        elif mode == 'nouns':
            return self.han.nouns(string_data)
        elif mode == 'pos':
            return self.han.pos(string_data)
        else:
            return False

    def analyzer_mecab(self, string_data, mode):
        """
        This method is for mecab. It acts differently depending on its mode.
        :param string_data: string data for analysis
        :param mode: analyze string data depending on its mode
        :return: the analysis result; returns False if the mode is unknown
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#mecab-class
        """
        if mode == 'morphs':
            return self.mecab.morphs(string_data)
        elif mode == 'nouns':
            return self.mecab.nouns(string_data)
        elif mode == 'pos':
            return self.mecab.pos(string_data)
        else:
            return False

    def analyzer_twitter(self, string_data, mode):
        """
        This method is for twitter. It acts differently depending on its mode.
        :param string_data: string data for analysis
        :param mode: analyze string data depending on its mode
        :return: the analysis result; returns False if the mode is unknown
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._twitter
        """
        if mode == 'morphs':
            return self.twitter.morphs(string_data)
        elif mode == 'nouns':
            return self.twitter.nouns(string_data)
        elif mode == 'pos':
            return self.twitter.pos(string_data)
        elif mode == 'posmore':
            return self.twitter.pos(string_data, True, True)
        else:
            return False
class Word(object):
    """Analyze phrase set by word

    Attributes:
        compare_set (dict): for correlation calculation
        hannanum (object): for morpheme analyzing
        ke_object (object): for Korean analyzing
        set (list): 4 parent sets
    """

    def __init__(self):
        """Initialize unigram class

        this method initializes all attributes of this class
        """
        self.compare_set = []
        self.hannanum = Hannanum()
        self.ke_object = KE()
        self.set = ['pure', 'pure_number', 'pure_punctuation', 'pure_number_punctuation']

    def analyze(self, analyze_path, output_path, filename):
        """Analyze phrase set from target filename by word

        Args:
            analyze_path (str): target input file's path
            output_path (str): output file's path
            filename (str): target filename
        """
        input_file = filename + '.txt'
        output_file = 'word_analyze_' + filename + '.txt'
        with open(analyze_path + input_file, 'r') as file_read:
            word_list = []
            for line in file_read:
                if len(line.strip()) == 0:
                    continue
                # self.hannanum.morphs(line.decode('utf-8'))
                # for word in line.decode('utf-8').split(' '):
                try:
                    morphs_list = self.hannanum.morphs(line.decode('utf-8'))
                except UnicodeDecodeError as e:
                    morphs_list = self.hannanum.morphs(line.encode('utf-8'))
                for word in morphs_list:
                    changed = self.ke_object.change_complete_korean(word, 3)
                    word_item = "".join(changed)
                    word_list.append(word_item)
                    if filename in self.set:
                        if not word_item in self.compare_set:
                            self.compare_set.append(word_item)
            # self.copy_set_file(filename, word_list)
            final = self.update_dict(word_list)
            if not filename in self.set:
                for key in self.compare_set:
                    if not key in final.keys():
                        final[key] = 0
            with open(output_path + output_file, 'w') as file_write:
                for key, value in final.items():
                    file_write.write(str(key) + ' : ' + str(value) + '\n')

    def copy_set_file(self, filename, result):
        """Copy parent set file from filename for compare set

        Args:
            filename (str): target filename
            result (list): target result
        """
        if filename in self.set:
            self.compare_set = copy.deepcopy(result)

    @staticmethod
    def update_dict(result):
        """Generate dict for correlation calc as analyze result

        Args:
            result (list): target result

        Return:
            final (dict): calculated dict
        """
        final = {}
        for item in result:
            if item in final.keys():
                final[item] += 1
            else:
                final[item] = 1
        return final