def tag_article():
    """
    Period: daily
    Task: tag each post based on the nouns in its title.
    """
    from konlpy.tag import Hannanum
    hannanum = Hannanum()
    for article in Article.objects.all():
        try:
            tags = [tag for tag in hannanum.nouns(article.title) if len(tag) > 1]
            for tag in tags[:10]:
                splits = re.split(r',', tag)
                tags.remove(tag)
                if len(splits) > 1:
                    for split in splits:
                        tags.append(split.strip(SPLIT_CHAR))
                else:
                    tags.append(tag.strip(SPLIT_CHAR))
            article.do_tag(tags)
        except Exception as e:
            print(hannanum.nouns(article.title))
            print(e)
def max_similarity(self):
    konlpy = Hannanum()
    l = konlpy.nouns(self.lyrics_input)
    song_list = self.song_list
    song_id = 0
    max_similarity = 0.0

    result = self.compare_lyrics()
    if result > 0:
        return result

    print("입력된 가사의 단어 배열: ", l)
    for song in song_list:
        if song['words'] is None:
            song['words'] = konlpy.nouns(song['lyrics'])
        print("song_id, title: ", song['song_id'], song['title'])
        temp = self.measure_similarity(l, song['words'])
        print("코사인 유사도: ", temp)
        print()
        if temp > max_similarity:
            song_id = song['song_id']
            # Uncomment if you also want the title
            # title = song['title']
            max_similarity = temp

    # Uncomment if you also want the title
    return song_id  # , title
def lineAnalyzer(sentence, analyzeType):
    hannanum = Hannanum()
    wordList = list()
    if analyzeType == 1:  # Nouns
        wordList = hannanum.nouns(str(sentence))
    elif analyzeType == 2:  # Morphs
        wordList = hannanum.morphs(str(sentence))
    elif analyzeType == 3:  # Bi-grams
        bigram_measures = collocations.BigramAssocMeasures()
        pos = hannanum.pos(str(sentence))
        words = [s for s, t in pos]
        finder = collocations.BigramCollocationFinder.from_words(words)
        finder.apply_word_filter(lambda w: len(w) < 2)
        finder.apply_freq_filter(3)
        wordList = finder.nbest(bigram_measures.pmi, 10)
    elif analyzeType == 4:  # Tri-grams
        trigram_measures = collocations.TrigramAssocMeasures()
        pos = hannanum.pos(str(sentence))
        words = [s for s, t in pos]
        finder = collocations.TrigramCollocationFinder.from_words(words)
        finder.apply_word_filter(lambda w: len(w) < 2)
        finder.apply_freq_filter(3)
        wordList = finder.nbest(trigram_measures.pmi, 10)
    else:
        print("error on top!")
    return wordList
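# Usage sketch (not from the original source): exercising lineAnalyzer above on a short
# Korean sentence. It assumes `from konlpy.tag import Hannanum` and
# `from nltk import collocations` are available at module level, as the function body
# implies; the sample sentence is made up for illustration.
if __name__ == "__main__":
    sample = "한국어 형태소 분석기는 문장에서 명사와 형태소를 추출한다."
    print(lineAnalyzer(sample, 1))  # noun extraction
    print(lineAnalyzer(sample, 2))  # morpheme extraction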
def generate_wordCloud(text, font_path, extractNum=15):
    hannanum = Hannanum()
    setFont(font_path)

    ## mask image
    image_mask = np.array(Image.open("./utils/visualize/만세_보노.jpg"))

    cleanText = clean_text(text)
    words = hannanum.nouns(cleanText)
    word_list = flatten(words)
    word_list = pd.Series([x for x in word_list if len(x) > 1])
    # print(word_list.value_counts().head(20))

    stopwordList = ['’', '”', '‘', '·', '…', '"', "'"]
    wordcloud = WordCloud(font_path=font_path,
                          stopwords=stopwordList,
                          width=800, height=800,
                          mask=image_mask,
                          background_color='white')
    count = Counter(word_list)
    wordcloud = wordcloud.generate_from_frequencies(count)
    array = wordcloud.to_array()

    fig = plt.figure(figsize=(10, 10))
    plt.imshow(array, interpolation='bilinear')
    plt.axis("off")

    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    string = b64encode(buf.read())
    wcURI = 'data:image/png;base64,' + urllib.parse.quote(string)

    count = count.most_common(extractNum)
    barURI = generate_barchart(count)
    return wcURI, barURI, count
def generate_summary(self, file_name, index, top_n=5):
    stop_words = read_data(filename='korean_stopwords_list.txt')
    summarize_text = []

    # Step 1 - Read text and split it into sentences
    sentences = self.read_article(file_name, index)

    # Tokenization added: keep only the nouns of each sentence
    hannanum = Hannanum()
    temp = []
    for sentence in sentences:
        temp.append(hannanum.nouns(' '.join(sentence)))
    # print("temp:", temp)

    # Step 2 - Generate the similarity matrix across sentences
    sentence_similarity_matrix = self.build_similarity_matrix(temp, stop_words)

    # Step 3 - Rank sentences in the similarity matrix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort by rank and pick the top sentences
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    for i in range(top_n):
        summarize_text.append(" ".join(ranked_sentence[i][1]))

    # Step 5 - Of course, output the summarized text
    print("\nSummarize Text: \n", ". ".join(summarize_text))
def get_string(path):
    f = open(path, "r", encoding="utf-8")
    sample = f.read()
    f.close()

    h = Hannanum()
    list_nouns = h.nouns(sample)  # get list of nouns from sample
    return listToString(list_nouns)  # get string of list_nouns
def get_tags(text, ntags=50, multiplier=10):
    h = Hannanum()
    nouns = h.nouns(text)
    count = Counter(nouns)
    print(count)
    return [{'color': color(), 'tag': n, 'size': c * multiplier}
            for n, c in count.most_common(ntags)]
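# Usage sketch (an illustration, not part of the original project): get_tags above relies
# on a module-level `color()` helper that is not shown; a minimal stand-in is sketched
# here together with a sample call on made-up Korean text.
import random

def color():
    return '#%02X%02X%02X' % tuple(random.randint(0, 255) for _ in range(3))

if __name__ == "__main__":
    sample_text = "한국어 명사 빈도를 세어 태그 클라우드 데이터를 만든다. 명사 빈도가 높을수록 태그가 커진다."
    for tag in get_tags(sample_text, ntags=5):
        print(tag['tag'], tag['size'], tag['color'])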
def test():
    setFont()
    hannanum = Hannanum()

    # DB connection
    # conn = oci.connect("test/[email protected]:32764/xe", charset='utf8')
    conn = oci.connect('test', '1234', '192.168.0.52:32764/xe', encoding='utf-8')
    df = pd.read_sql('select * from article_sample', conn)

    sample1 = df['ARTICLE_CONTENT'][0].read()
    word = hannanum.nouns(sample1)
    word_list = flatten(word)
    word_list = pd.Series([x for x in word_list if len(x) > 1])
    print(word_list.value_counts().head(20))

    stopwordList = ''
    wordcloud = WordCloud(font_path=setFontPath(),
                          stopwords=stopwordList,
                          width=800, height=800,
                          background_color='white')
    count = Counter(word_list)
    wordcloud = wordcloud.generate_from_frequencies(count)
    array = wordcloud.to_array()

    fig = plt.figure(figsize=(10, 10))
    plt.imshow(array, interpolation='bilinear')
    plt.axis("off")
    plt.show()
    # plt.savefig('C:/Users/admin/Documents/IMG04.png', bbox_inches='tight')
def text_preprocessing_after(lists):
    hannanum = Hannanum()
    getNum = 5
    stopword = ['등', '코', '만', '속보', '최초', '4억', '월요일']

    cleaning = lambda x: hannanum.nouns(wordcloud01.clean_text(x))
    nouns_list = list(map(cleaning, lists))
    # print(nouns_list)

    texts = [value for nouns in nouns_list for value in nouns]
    total_counter = Counter(texts)
    for word in stopword:
        del total_counter[word]

    result = total_counter.most_common(getNum)
    return result


## Noun frequency extraction.
##################################################
# def nouns_frequency(text):
#     print('Creating the Kkma instance')
#     hannanum = Kkma()
#     print('Cleaning text')
#     clean_text = wordcloud01.clean_text(text)
#     print('Extracting nouns')
#     words = hannanum.nouns(clean_text)
#     print('Flattening')
#     word_list = wordcloud01.flatten(words)
#     print('Converting to a pandas Series')
#     word_list = pd.Series([x for x in word_list if len(x) > 1])
#     print('Counting results')
#     result = Counter(word_list)
#     return result
def test():
    rss_list = [
        # "https://www.reddit.com/",
        "http://www.chosun.com/site/data/rss/politics.xml",
        "http://rss.joins.com/joins_politics_list.xml",
    ]
    hannanum = Hannanum()
    # mecab = Mecab()
    for rss_link in rss_list:
        print("Start get_URLs and read files from : " + rss_link)
        start_time = time.time()
        links = get_URLs(rss_link)
        for link in links:
            parse_time = time.time()
            article = get_article(link)
            file = open("./test/%s.txt" % (article.title), 'w', encoding="utf8")
            nouns = hannanum.nouns(article.text)
            # nouns = mecab.nouns(article.text)
            for noun in nouns:
                file.write("%s\n" % noun)
            file.close()
            parse_time = time.time() - parse_time
            print("parse files from %s: %f" % (link, parse_time))
        start_time = time.time() - start_time
        print("Process time : %f" % (start_time))
def parse(df, _type: str):
    """Parse function"""
    # Parser
    korean_parser = Hannanum()

    neg = df[df['label'] == 0]['document'].tolist()
    pos = df[df['label'] == 1]['document'].tolist()

    nouns_doc_f = open('./input/nouns_{}_documents.txt'.format(_type), 'w')
    nouns_label_f = open('./input/nouns_{}_labels.txt'.format(_type), 'w')
    morphs_doc_f = open('./input/morphs_{}_documents.txt'.format(_type), 'w')
    morphs_label_f = open('./input/morphs_{}_labels.txt'.format(_type), 'w')

    logger.info("Starting parsing...")
    for doc in neg:
        try:
            nouns_doc_f.write(','.join(korean_parser.nouns(doc)) + '\n')
            nouns_label_f.write('{}\n'.format(0))
        except:
            pass
        try:
            morphs_doc_f.write(','.join(korean_parser.morphs(doc)) + '\n')
            morphs_label_f.write('{}\n'.format(0))
        except:
            pass
    logger.info('%s neg document parsing completed.' % _type)

    for doc in pos:
        try:
            nouns_doc_f.write(','.join(korean_parser.nouns(doc)) + '\n')
            nouns_label_f.write('{}\n'.format(1))
        except:
            pass
        try:
            morphs_doc_f.write(','.join(korean_parser.morphs(doc)) + '\n')
            morphs_label_f.write('{}\n'.format(1))
        except:
            pass
    logger.info('%s pos document parsing completed.' % _type)

    nouns_doc_f.close()
    nouns_label_f.close()
    morphs_doc_f.close()
    morphs_label_f.close()
def get_tags(text, ntags=50, multiplier=10):
    h = Hannanum()
    nouns = h.nouns(text)
    count = Counter(nouns)
    # for word, cnt in count.most_common(ntags):
    #     print(word, cnt)
    return count
def review_preprocessing(data):
    # Hannanum package
    pos_tagger = Hannanum()

    # Tokenize the text, then extract only the nouns
    pos_nouns = pos_tagger.nouns(data)
    return ' '.join(pos_nouns)
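# Usage sketch (an assumption, not from the original code): because review_preprocessing
# returns space-joined nouns, its output can be dropped straight into a bag-of-words
# vectorizer. scikit-learn is an extra dependency introduced only for this illustration.
from sklearn.feature_extraction.text import TfidfVectorizer

reviews = ["배송이 빨라서 만족스러운 구매였습니다.", "제품 품질이 기대보다 많이 아쉬웠어요."]
corpus = [review_preprocessing(review) for review in reviews]

vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(corpus)  # rows: reviews, columns: extracted nouns
print(vectorizer.get_feature_names_out())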
def text_mining(title_list, ntags=50, multiplier=1):
    h = Hannanum()
    data_nouns = []
    for title in title_list:
        data_nouns.extend(h.nouns(title))
    count = Counter(data_nouns)
    return [{'color': color(), 'tag': n, 'size': int(c * multiplier * 0.5)}
            for n, c in count.most_common(ntags)]
def comment_freq(youtube_data):
    # youtuber_csv_data = dm.GetData(url, con)
    # if youtuber_csv_data == None:
    #     print("No data")
    #     return None
    # video_num = int(input("Which video number should be analyzed? "))
    # youtube_data = dm.GetData(youtuber_csv_data[video_num][0], password)  # implemented in main.py
    if youtube_data == None:
        return None

    comment = []
    for i in range(len(youtube_data)):
        comment.append(youtube_data[i][2])

    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+", flags=re.UNICODE)
    han = re.compile(r'[ㄱ-ㅎㅏ-ㅣ!?~,".\n\r#\ufeff\u200d]')

    comment_noemot = []
    for i in comment:
        tokens = re.sub(emoji_pattern, "", i)
        tokens = re.sub(han, "", tokens)
        comment_noemot.append(tokens)

    nouns = []
    h = Hannanum()
    for i in comment_noemot:
        n = h.nouns(i)
        nouns.append(n)

    noun_list = []
    for i in range(len(nouns)):
        for j in range(len(nouns[i])):
            noun_list.append(nouns[i][j])

    counts = Counter(noun_list)
    tags = counts.most_common(30)

    wc = WordCloud(font_path='C:\\Windows\\Fonts\\gulim.ttc',
                   background_color='black', width=800, height=600)
    cloud = wc.generate_from_frequencies(dict(tags))

    plt.figure(figsize=(10, 8))
    plt.axis('off')
    plt.imshow(cloud)
    plt.show()
def tokenization(cleaned_docs):
    han = Hannanum()
    tokenized_docs = []

    while ' ' in cleaned_docs:
        cleaned_docs.remove(' ')

    for doc in cleaned_docs:
        nouns_in_doc = []
        for noun in han.nouns(doc):
            if len(noun) > 1:
                nouns_in_doc.append(noun)
        tokenized_docs.append(nouns_in_doc)

    return tokenized_docs
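# Usage sketch (illustrative only): tokenization above removes blank documents, keeps
# nouns longer than one character, and returns one token list per document, the shape
# a gensim-style Dictionary/corpus pipeline usually expects. The sample documents are
# made up; the exact tokens depend on the Hannanum dictionary.
sample_docs = [
    "한국어 문서에서 명사만 남기는 전처리 단계",
    " ",
    "추출된 명사 리스트는 토픽 모델의 입력으로 쓰인다",
]
for doc_tokens in tokenization(sample_docs):
    print(doc_tokens)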
def update_words_all():
    hannanum = Hannanum()
    db = db_connector.DbConnector()
    song_list = db.select_all()
    for song in song_list:
        if song['lyrics'] is not None and song['words'] is None:
            words = hannanum.nouns(song['lyrics'])
            words = sorted(set(words))
            update_words(song['song_id'], ' '.join(words))
    print('Words extraction done!')
def wordAnalysis(text):
    myHannanum = Hannanum()
    print("text : " + text)
    replace_text = re.sub("[!@#$%^&*()_+]", " ", text)
    print("replace_text : " + replace_text)
    analysis_text = " ".join(myHannanum.nouns(replace_text))
    return analysis_text
def reduceToWords(self):
    hannanum = Hannanum()
    words = ''
    # for word in hannanum.nouns(unicode(texts, 'UTF-8')):
    if self.result != '':
        for word in hannanum.nouns(self.result):
            word = re.sub("[(*&]", "", word)
            if len(word) > 1:
                words = word + '\n' + words
        self.result = words
        print(words)
    return self
def extract(self):
    # Extract nouns with Hannanum
    hnn = Hannanum()
    # merge = str(self.merge_sentence.encode('utf-8'), encoding='utf-8')  # encoding issue not resolved
    merge = self.merge_sentence
    nouns = hnn.nouns(merge)

    # Remove special characters from each noun
    nouns = [n.replace("'", "").replace(",", "") for n in nouns]

    processed = [n for n in nouns if len(n) >= 2]  # min length 2
    count = Counter(processed)
    self.tags = count.most_common(20)  # top 20 tags
def crawl():
    global hannanum
    if hannanum == None:
        hannanum = Hannanum()
    if jpype.isJVMStarted():
        jpype.attachThreadToJVM()
        hannanum = Hannanum()

    media = Media.objects.all()
    articles = Article.objects.all()
    count = 0
    all = 0
    for medium in media:
        links = get_URLs(medium.rss_list)
        # print(links)
        upper_bound = len(links)
        all += upper_bound
        for link in links:
            # print(link)
            if Article.objects.filter(article_url=link).exists():
                continue
            try:
                article = get_article(link)
            except:
                print("Fail:%s" % link)
                continue
            # print(link)
            title = article.title
            content = article.text
            nouns = hannanum.nouns(article.text)
            morphemed_content = " ".join(nouns)
            writer = ''
            if len(article.authors) == 0:
                writer = 'anonymous'
            else:
                writer = article.authors[0]
            try:
                articles.create(
                    title=title,
                    content=content,
                    morphemed_content=morphemed_content,
                    media=medium,
                    writer=writer,
                    article_url=link,
                )
                count += 1
            except:
                print("Fail:%s,title:%s" % (link, title))
                continue
    return (count, all)
def insert_summary():
    mongoDB = myMongoDB("CapstoneTest")
    # fasttext.util.download_model('ko', if_exists='ignore')
    ft = fasttext.load_model('./models/cc.ko.300.bin')

    total_clean_sentence = []
    string_id = []
    for content in mongoDB.collected.find({}, {"_id": 1, "content": 1}):
        cleaned_sentence = []
        clean_sentence = []
        string_id.append(list(content.values())[0])
        string = list(content.values())[1]
        string = string.replace(u'\xa0', u' ')
        string = string.replace(u'\n', u' ')
        string = string.replace(u'\r', u' ')
        clean_sentence.append(sent_tokenize(string))
        for i in clean_sentence:
            for j in i:
                cleaned_sentence.append(j)
        total_clean_sentence.append(cleaned_sentence)

    temp = []
    hannanum = Hannanum()
    for clean_sentence in total_clean_sentence:
        for s in clean_sentence:
            noun = hannanum.nouns(s)
            for i in noun:
                temp.append(i)
    for i in temp:
        word_vector_arr = np.asarray(ft[i], dtype='float32')
        word_dict[i] = word_vector_arr

    string_idx = 0
    for clean_sentence in total_clean_sentence:
        article_embedding = articles_to_vectors(clean_sentence)
        similar_matrix = similarity_matrix(article_embedding)
        score = calculate_score(similar_matrix)
        summaryShort_list = summaryShort(clean_sentence, score)
        summaryMed_list = summaryMed(clean_sentence, score)
        summaryLong_list = summaryLong(clean_sentence, score)
        mongoDB.collected.update_one({'_id': string_id[string_idx]}, {
            '$set': {
                'sum_short': summaryShort_list,
                'sum_mid': summaryMed_list,
                'sum_long': summaryLong_list
            }
        })
        string_idx += 1
def post(self, request, format=None):
    # JSON: "key": "value" --> "searchWord": "보온보냉팩 버리는 방법 좀 알려줘?"
    searchSentence = request.data['searchWord']  # the Android client must send searchWord

    if "캔" in searchSentence:
        print("Okt")
        okt = Okt()
        Nouns = okt.nouns(searchSentence)
    else:
        print("Hannanum")
        ha = Hannanum()
        Nouns = ha.nouns(searchSentence)
    print('nouns: ', Nouns)

    Idx = []
    temp = []
    small_list = list()
    for word in Nouns:
        smallIdx = WasteCategoryS.objects.filter(cg_name__contains=word)
        for val in smallIdx:
            small_list.append(val)
        if len(smallIdx) == 0:
            print('len 0')
            middleIdx = WasteCategoryM.objects.filter(cg_name__contains=word)
            for ob in middleIdx:
                Idx.append(ob.idx)
            continue
        for ob in smallIdx:
            Idx.append(ob.cg_middle_idx.idx)
    print(Idx)

    dischargeTipsList = []
    for idx in Idx:
        dischargeTipsList.append(DischargeTips.objects.get(category_m_idx=idx))
    serializer = DischargeTipsSerializer(dischargeTipsList, many=True)
    waste_serializer = WasteCategorySSerializer(small_list, many=True)

    return Response(
        {
            "matching_name": waste_serializer.data,
            "textVoiceDischargeTips": serializer.data
        },
        status=status.HTTP_201_CREATED)
def main(args=None):
    print("Loading pdf files...")
    outfp = extract_text(files=search("./sample_pdf"), outfile="temp.txt")
    outfp.close()

    print("Loading data...")
    f = open("temp.txt", encoding='UTF8')
    elements = f.readlines()
    elements = [x for x in elements if x != "\n"]
    elements = [x.rstrip() for x in elements]

    hannanum = Hannanum()
    korean_list = []
    korean_noun_list = []
    english_list = []

    korean = re.compile('[^ ㄱ-ㅣ가-힣]+')
    for element in elements:
        korean_list.append(korean.sub("", element))
    korean_list = [x.strip() for x in korean_list]
    korean_list = [x for x in korean_list if x != '']

    print("Parsing Korean words...")
    for korean in korean_list:
        korean_noun_list += hannanum.nouns(korean)
    korean_noun_list = [x for x in korean_noun_list if len(x) > 1]
    print("Korean list : ", korean_noun_list)

    print("Parsing English words...")
    for element in elements:
        english_list.append(re.sub('[^a-zA-Z]', '', element))
    english_list = [x.strip() for x in english_list]
    english_list = [x for x in english_list if x != '']
    print("English list : ", english_list)

    korean_counter = collections.Counter(korean_noun_list)
    print("Most 10 word in Korean words:", korean_counter.most_common(10))
    english_counter = collections.Counter(english_list)
    print("Most 10 word in English words:", english_counter.most_common(10))

    draw_word_cloud(korean_noun_list, "Korean")
    draw_word_cloud(english_list, "English")
    print("Done")
def draw_cloud(reviews):
    tags = {}
    # r = lambda: random.randint(0, 255)
    # color = lambda: (r(), r(), r())
    for review in reviews:
        h = Hannanum()
        nouns = h.nouns(review)
        count = dict(Counter(nouns))
        tags = {k: tags.get(k, 0) + count.get(k, 0) for k in set(tags) | set(count)}
    gen_stylecloud(text=tags,
                   output_name="wordcloud.png",
                   icon_name="fas fa-square-full",
                   background_color="white",
                   font_path="Jua-Regular.ttf",
                   size=1024)
def text_preprocessing(queryset):
    # print(queryset, len(queryset))
    hannanum = Hannanum()
    getNum = 5
    stopword = ['등', '코', '만', '속보', '최초', '4억', '월요일']

    df = pd.DataFrame.from_records(queryset)
    # print(df, type(df))

    # df['title_nouns'] = df['article_title'].apply(lambda x: hannanum.nouns(wordcloud01.clean_text(x))); print(df['title_nouns']); print(df['title_nouns'].sum())

    # print('starting apply')
    df['title_nouns'] = df['article_title'].apply(
        lambda x: Counter(hannanum.nouns(wordcloud01.clean_text(x))))
    # print('starting sum')
    total_counter = df['title_nouns'].sum()
    # print('removing stopwords')
    for word in stopword:
        del total_counter[word]
    # print(type(total_counter), total_counter.most_common(getNum))

    result = total_counter.most_common(getNum)
    return result
def rec_hashtag(request):
    """
    API used for hashtag recommendation
    ---
    """
    content = request.data.get('article')
    hannanum = Hannanum()
    keywords = hannanum.nouns(content)
    count = Counter(keywords)
    count = sorted(sorted(count.items(), key=operator.itemgetter(0)),
                   key=lambda x: x[1], reverse=True)
    data = []
    for k in count:
        data.append(k[0])
        if len(data) > 2:
            break
    return Response(data)
def morp_analysis(self):
    noun_text = re.sub('[-_=+,#/\?:^$.@*\"※~&%ㆍ·!』\\‘’|\(\)\[\]\<\>`\'…》]', '', str(self))
    noun_text = re.sub('\n', '', noun_text)

    hannanum = Hannanum()
    text_list = hannanum.nouns(noun_text)  # noun extraction

    word_list = pd.Series(text_list)
    result = word_list.value_counts().head(10)
    result_values = list(result.values)
    for i in range(len(result_values)):
        result_values[i] = np.int16(result_values[i]).item()

    freq_lst = []
    for i in range(len(result)):
        freq_lst.append({'word': result.keys()[i], 'freq': result_values[i]})
    return freq_lst
def get_tags(text, ntags=50, multiplier=2):
    h = Hannanum()
    nouns = h.nouns(text)

    long_nouns = list()
    for n in nouns:
        if len(n) >= 2:  # keep only nouns with length >= 2
            long_nouns.append(n)

    count = Counter(long_nouns)
    word_list = list()
    for w in count.most_common(ntags):
        if w[1] >= 10:  # keep only nouns that appear at least 10 times
            print(w)
            word_list.append(w)

    r = lambda: random.randint(0, 255)
    color = lambda: (r(), r(), r())
    return [{'color': color(), 'tag': n, 'size': c * multiplier}
            for n, c in word_list]
def wordcloud_textmining(text):
    H = Hannanum()
    twttierMS = np.array(Image.open('./static/images/동그라미.png'))
    a = " ".join(H.nouns(text))

    wc = WordCloud(font_path="./font/BMEULJIROTTF.ttf",
                   background_color="white",
                   width=1000, height=1000,
                   mask=twttierMS,
                   max_words=150,
                   max_font_size=200)
    wc.generate(a)

    fig = plt.figure()
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.savefig('./static/Wordcloud.png')
    return fig
def get_text():
    full_data = ""

    # Collect every JSON file in the folder
    files = glob.glob(file_dir + '/*.json')
    corpus = list()
    h = Hannanum()

    for fname in files:
        # print("fname")
        print(fname)
        file_name = fname.split('\\')[1]
        file_idx = file_name.split('.')[0]
        print(file_idx)  # this is just the file name
        idx_list.append(file_idx)  # remember the order in which the JSON files were read

        with open(fname, encoding='UTF8') as json_file:
            json_data = json.load(json_file)

            # Get the string stored under the "app_detail" key
            json_string = json_data["app_detail"]

            # corpus.append(json_string)
            # => building the corpus from the raw string mixes in tokens that are not nouns

            # Keep only nouns in the corpus (morphological split)
            nouns = h.nouns(json_string)
            # Convert the list to a string, separating the elements with spaces
            nouns = ' '.join(nouns)
            # print("nouns output")
            # print(nouns)
            corpus.append(nouns)  # only nouns go into the corpus

            app_category = json_data["app_category"]
            # print(json_string)

    return full_data, app_category, corpus
def preTexts(comments, videoId):
    hannanum = Hannanum()
    main = pd.Series()
    for text in comments:
        text = processText(text)
        text_list = hannanum.nouns(text)
        main = main.append(pd.Series(text_list))

    dir = os.path.dirname(os.path.abspath(__file__))
    main.to_csv(f'{dir}/data/{videoId}.csv')

    result = main.value_counts().to_list()
    idx_result = main.value_counts().index.to_list()

    loop = 30
    if len(result) < loop:
        loop = len(result)

    ret = []
    for i in range(loop):
        ret.append([idx_result[i], result[i]])
    # result.to_csv(f'{dir}/data/comm{videoId}.csv')
    return ret
def get_tags(self, text, ntags=10, multiplier=10):
    h = Hannanum()
    nouns = h.nouns(text)
    count = Counter(nouns)
    return [{'tag': n, 'size': c * multiplier}
            for n, c in count.most_common(ntags)]
def get_tags(text, ntags=50, multiplier=10):
    h = Hannanum()
    nouns = h.nouns(text)
    count = Counter(nouns)
    return [{"color": color(), "tag": n, "size": c * multiplier}
            for n, c in count.most_common(ntags)]
def WordCount(corpus):
    h = Hannanum()
    nouns = h.nouns(corpus)
    frequency = Counter(nouns)
    return frequency
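# Usage sketch (not from the original source): WordCount returns a collections.Counter
# keyed by noun, so frequent terms can be read off with most_common(). The corpus string
# is a made-up example.
if __name__ == "__main__":
    sample_corpus = "데이터 분석 과제에서 한국어 명사 빈도를 계산한다. 명사 빈도는 워드클라우드와 차트에 쓰인다."
    frequency = WordCount(sample_corpus)
    print(frequency.most_common(5))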
def get_tags(text, ntags=int(sys.argv[2]), multiplier=10):
    h = Hannanum()
    nouns = h.nouns(text)
    count = Counter(nouns)
    return [{'color': color(), 'tag': n, 'size': c * multiplier}
            for n, c in count.most_common(ntags)]
class AnalysisDiction:
    """
    This class analyzes Korean texts using the Hannanum, Twitter, and Mecab dictionaries.
    """
    def __init__(self, on_han=False, on_twitter=False, on_mecab=False):  # maybe move to init of analysis_app
        """
        Allocate a Hannanum, Twitter, or Mecab instance.
        :param on_han: enable the Hannanum instance
        :param on_twitter: enable the Twitter instance
        :param on_mecab: enable the Mecab instance
        """
        if on_han is True:
            self.han = Hannanum()
        if on_twitter is True:
            self.twitter = Twitter()
        # if on_mecab is True:
        #     self.mecab = Mecab()

    def analyzer_hannaum(self, string_data, mode):
        """
        This method is for Hannanum. It acts differently depending on its mode.
        :param string_data: string data for analysis
        :param mode: analyze the string data depending on this mode
        :return: the analysis result, or False if the mode is unknown
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._hannanum
        """
        if mode == 'morphs':
            return self.han.morphs(string_data)
        elif mode == 'nouns':
            return self.han.nouns(string_data)
        elif mode == 'pos':
            return self.han.pos(string_data)
        else:
            return False

    def analyzer_mecab(self, string_data, mode):
        """
        This method is for Mecab. It acts differently depending on its mode.
        :param string_data: string data for analysis
        :param mode: analyze the string data depending on this mode
        :return: the analysis result, or False if the mode is unknown
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#mecab-class
        """
        if mode == 'morphs':
            return self.mecab.morphs(string_data)
        elif mode == 'nouns':
            return self.mecab.nouns(string_data)
        elif mode == 'pos':
            return self.mecab.pos(string_data)
        else:
            return False

    def analyzer_twitter(self, string_data, mode):
        """
        This method is for Twitter. It acts differently depending on its mode.
        :param string_data: string data for analysis
        :param mode: analyze the string data depending on this mode
        :return: the analysis result, or False if the mode is unknown
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._twitter
        """
        if mode == 'morphs':
            return self.twitter.morphs(string_data)
        elif mode == 'nouns':
            return self.twitter.nouns(string_data)
        elif mode == 'pos':
            return self.twitter.pos(string_data)
        elif mode == 'posmore':
            return self.twitter.pos(string_data, True, True)
        else:
            return False
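# Usage sketch (illustrative): constructing AnalysisDiction with only the Hannanum
# backend enabled and running the three supported modes. The sample sentence is made up;
# unknown mode strings make the analyzer methods return False.
if __name__ == "__main__":
    analyzer = AnalysisDiction(on_han=True)
    text = "한국어 형태소 분석 예제 문장입니다."
    print(analyzer.analyzer_hannaum(text, 'nouns'))
    print(analyzer.analyzer_hannaum(text, 'morphs'))
    print(analyzer.analyzer_hannaum(text, 'pos'))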