Example no. 1
def tag_article():
    """
    Schedule: daily
    Task: tag posts based on the nouns they contain.
    """
    from konlpy.tag import Hannanum
    hannanum = Hannanum()

    for article in Article.objects.all():
        try:
            tags = [
                tag for tag in hannanum.nouns(article.title) if len(tag) > 1
            ]

            for tag in tags[:10]:
                splits = re.split(r',', tag)

                tags.remove(tag)

                if len(splits) > 1:
                    for split in splits:
                        tags.append(split.strip(SPLIT_CHAR))
                else:
                    tags.append(tag.strip(SPLIT_CHAR))

            article.do_tag(tags)

        except Exception as e:
            print(hannanum.nouns(article.title))
            print(e)
Example no. 2
    def max_similarity(self):
        konlpy = Hannanum()
        l = konlpy.nouns(self.lyrics_input)
        song_list = self.song_list
        song_id = 0
        max_similarity = 0.0

        result = self.compare_lyrics()
        if result > 0:
            return result

        print("Noun list extracted from the input lyrics: ", l)
        for song in song_list:
            if song['words'] is None:
                song['words'] = konlpy.nouns(song['lyrics'])

            print("song_id, title: ", song['song_id'], song['title'])
            temp = self.measure_similarity(l, song['words'])
            print("Cosine similarity: ", temp)
            print()
            if temp > max_similarity:
                song_id = song['song_id']
                # uncomment to also keep the title
                # title = song['title']
                max_similarity = temp

        # uncomment below to also return the title
        return song_id  # , title
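The method above relies on self.measure_similarity, which is not shown in this example. A minimal sketch of such a helper, assuming a simple bag-of-words cosine similarity over two noun lists (the name and approach are assumptions, not the original implementation):

import math
from collections import Counter

def measure_similarity(nouns_a, nouns_b):
    # Hypothetical helper: cosine similarity between two bag-of-words vectors
    vec_a, vec_b = Counter(nouns_a), Counter(nouns_b)
    dot = sum(vec_a[w] * vec_b[w] for w in set(vec_a) & set(vec_b))
    norm_a = math.sqrt(sum(c * c for c in vec_a.values()))
    norm_b = math.sqrt(sum(c * c for c in vec_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)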
Example no. 3
def lineAnalyzer(sentence, analyzeType):
    hannanum = Hannanum()
    wordList = list()
    if (analyzeType == 1):
        # Nouns
        wordList = hannanum.nouns(str(sentence))
    elif (analyzeType == 2):
        # Morphs
        wordList = hannanum.morphs(str(sentence))
    elif (analyzeType == 3):
        # Bi-grams
        bigram_measures = collocations.BigramAssocMeasures()
        pos = hannanum.pos(str(sentence))
        words = [s for s, t in pos]
        finder = collocations.BigramCollocationFinder.from_words(words)
        finder.apply_word_filter(lambda w: len(w) < 2)
        finder.apply_freq_filter(3)
        wordList = finder.nbest(bigram_measures.pmi, 10)
    elif (analyzeType == 4):
        # Tri-grams
        trigram_measures = collocations.TrigramAssocMeasures()
        pos = hannanum.pos(str(sentence))
        words = [s for s, t in pos]
        finder = collocations.TrigramCollocationFinder.from_words(words)
        finder.apply_word_filter(lambda w: len(w) < 2)
        finder.apply_freq_filter(3)
        wordList = finder.nbest(trigram_measures.pmi, 10)
    else:
        print("error on top!")
    return wordList
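lineAnalyzer assumes that Hannanum and NLTK's collocations module are already imported. A usage sketch under that assumption (the sample sentence is made up):

from konlpy.tag import Hannanum
from nltk import collocations

# analyzeType: 1 = nouns, 2 = morphemes, 3 = bigram collocations, 4 = trigram collocations
print(lineAnalyzer("한국어 형태소 분석은 재미있다.", 1))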
Example no. 4
def generate_wordCloud(text, font_path, extractNum = 15):
    hannanum = Hannanum()
    setFont(font_path)

    ## mask image
    image_mask = np.array(Image.open("./utils/visualize/만세_보노.jpg"))

    cleanText = clean_text(text)
    words = hannanum.nouns(cleanText)
    word_list = flatten(words)
    word_list = pd.Series([x for x in word_list if len(x)>1]) #; print( word_list.value_counts().head(20) )
    stopwordList = ['’','”','‘','·','…','"',"'"]
    wordcloud = WordCloud(font_path=font_path
                        , stopwords=stopwordList
                        , width=800, height=800
                        , mask=image_mask
                        , background_color='white')

    count = Counter(word_list)
    wordcloud = wordcloud.generate_from_frequencies(count)
    array = wordcloud.to_array()

    fig = plt.figure(figsize=(10,10))
    plt.imshow(array, interpolation='bilinear')
    plt.axis("off")
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    string = b64encode(buf.read())
    wcURI = 'data:image/png;base64,' + urllib.parse.quote(string)
    count = count.most_common(extractNum)
    barURI = generate_barchart(count)
    return wcURI, barURI, count
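generate_wordCloud depends on clean_text and flatten helpers that are not shown here. A hypothetical sketch of what they might look like (names taken from the call sites, implementations assumed):

import re

def clean_text(text):
    # Hypothetical cleaner: keep Hangul, ASCII letters/digits, and whitespace
    return re.sub(r"[^가-힣a-zA-Z0-9\s]", " ", text)

def flatten(items):
    # Hypothetical flatten: normalize possibly nested lists into one flat list
    flat = []
    for item in items:
        if isinstance(item, (list, tuple)):
            flat.extend(item)
        else:
            flat.append(item)
    return flat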
Example no. 5
    def generate_summary(self, file_name, index, top_n=5):
        stop_words = read_data(filename='korean_stopwords_list.txt')

        summarize_text = []

        # Step 1 - Read the text and split it
        sentences = self.read_article(file_name, index)

        # added tokenization (noun extraction)
        hannanum = Hannanum()
        temp = []
        for sentence in sentences:
            temp.append(hannanum.nouns(' '.join(sentence)))
        # print("temp:",temp)

        # Step 2 - Generate a similarity matrix across sentences
        sentence_similarity_martix = self.build_similarity_matrix(
            temp, stop_words)

        # Step 3 - Rank sentences in the similarity matrix
        sentence_similarity_graph = nx.from_numpy_array(
            sentence_similarity_martix)
        scores = nx.pagerank(sentence_similarity_graph)

        # Step 4 - Sort the rank and pick top sentences
        ranked_sentence = sorted(
            ((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
        for i in range(top_n):
            summarize_text.append(" ".join(ranked_sentence[i][1]))

        # Step 5 - Finally, output the summarized text
        print("\nSummarize Text: \n", ". ".join(summarize_text))
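The summarizer above calls self.build_similarity_matrix, which is not included in this example. A minimal sketch of such a helper as a standalone function, assuming a TextRank-style pairwise cosine similarity over bag-of-words vectors (an assumption, not the original implementation):

import numpy as np
from nltk.cluster.util import cosine_distance

def build_similarity_matrix(sentences, stop_words):
    # sentences: list of token lists; returns an n x n similarity matrix
    n = len(sentences)
    matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            words_i = [w for w in sentences[i] if w not in stop_words]
            words_j = [w for w in sentences[j] if w not in stop_words]
            vocab = list(set(words_i) | set(words_j))
            vec_i = [words_i.count(w) for w in vocab]
            vec_j = [words_j.count(w) for w in vocab]
            if any(vec_i) and any(vec_j):
                matrix[i][j] = 1 - cosine_distance(vec_i, vec_j)
    return matrix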
Example no. 6
def get_string(path):
    with open(path, "r", encoding="utf-8") as f:
        sample = f.read()
    h = Hannanum()
    list_nouns = h.nouns(sample)  # get list of nouns from sample
    return listToString(list_nouns)  # get string of list_nouns
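listToString is not defined in this example; under the obvious assumption that it simply joins the nouns with spaces, a one-line sketch:

def listToString(items):
    # Hypothetical helper: join the extracted nouns into a single space-separated string
    return ' '.join(items)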
Example no. 7
def get_tags(text, ntags=50, multiplier=10):
    h = Hannanum()
    nouns = h.nouns(text)
    count = Counter(nouns)
    print(count)
    return [{'color': color(), 'tag': n, 'size': c * multiplier} \
            for n, c in count.most_common(ntags)]
Example no. 8
def test():
    setFont()
    hannanum = Hannanum()
    # DB connection
    # conn = oci.connect("test/1234@192.168.0.52:32764/xe", charset='utf8')
    conn = oci.connect('test','1234','192.168.0.52:32764/xe', encoding='utf-8')
    df = pd.read_sql('select * from article_sample', conn )
    sample1 = df['ARTICLE_CONTENT'][0].read()
    word = hannanum.nouns(sample1)
    word_list = flatten(word)
    word_list = pd.Series([x for x in word_list if len(x)>1])
    print( word_list.value_counts().head(20) )
    stopwordList = ''
    wordcloud = WordCloud(font_path=setFontPath()
                        , stopwords=stopwordList
                        , width=800, height=800
                        , background_color='white')

    count = Counter(word_list)
    wordcloud = wordcloud.generate_from_frequencies(count)
    array = wordcloud.to_array()

    fig = plt.figure(figsize=(10,10))
    plt.imshow(array, interpolation='bilinear')
    plt.axis("off")
    plt.show()
    # plt.savefig('C:/Users/admin/Documents/IMG04.png', bbox_inches='tight')
Example no. 9
def text_preprocessing_after(lists):
    hannanum = Hannanum()
    getNum = 5
    stopword = ['등', '코', '만', '속보', '최초', '4억', '월요일']
    cleaning = lambda x: hannanum.nouns(wordcloud01.clean_text(x))
    nouns_list = list(map(cleaning, lists))

    # print(nouns_list)

    texts = [value for nouns in nouns_list for value in nouns]
    total_counter = Counter(texts)
    for word in stopword:
        del total_counter[word]
    result = total_counter.most_common(getNum)
    return result


## Noun frequency extraction. ##################################################
# def nouns_frequency(text):
#     print('creating Kkma object')
#     hannanum = Kkma()
#     print('cleaning text')
#     clean_text = wordcloud01.clean_text(text)
#     print('extracting nouns')
#     words = hannanum.nouns(clean_text)
#     print('flattening word list')
#     word_list = wordcloud01.flatten(words)
#     print('converting to pandas')
#     word_list = pd.Series([x for x in word_list if len(x)>1])
#     print('building the result Counter')
#     result = Counter(word_list)
#     return result
Example no. 10
def test():
    rss_list = [
        # "https://www.reddit.com/",
        "http://www.chosun.com/site/data/rss/politics.xml",
        "http://rss.joins.com/joins_politics_list.xml",
    ]

    hannanum = Hannanum()
    # mecab = Mecab()

    for rss_link in rss_list:
        print("Start get_URLs and read files from : " + rss_link)
        start_time = time.time()
        links = get_URLs(rss_link)
        for link in links:
            parse_time = time.time()
            article = get_article(link)
            file = open("./test/%s.txt" % (article.title),
                        'w',
                        encoding="utf8")
            nouns = hannanum.nouns(article.text)
            # nouns = mecab.nouns(article.text)

            for noun in nouns:
                file.write("%s\n" % noun)
            file.close()
            parse_time = time.time() - parse_time
            print("parse files from %s: %f" % (link, parse_time))
        start_time = time.time() - start_time
        print("Process time : %f" % (start_time))
Example no. 11
def parse(df, _type: str):
    """Parse function"""

    # Parser
    korean_parser = Hannanum()

    neg = df[df['label'] == 0]['document'].tolist()
    pos = df[df['label'] == 1]['document'].tolist()

    nouns_doc_f = open('./input/nouns_{}_documents.txt'.format(_type), 'w')
    nouns_label_f = open('./input/nouns_{}_labels.txt'.format(_type), 'w')
    morphs_doc_f = open('./input/morphs_{}_documents.txt'.format(_type), 'w')
    morphs_label_f = open('./input/morphs_{}_labels.txt'.format(_type), 'w')

    logger.info("Starting parsing...")
    for doc in neg:
        try:
            nouns_doc_f.write(','.join(korean_parser.nouns(doc)) + '\n')
            nouns_label_f.write('{}\n'.format(0))
        except:
            pass
        try:
            morphs_doc_f.write(','.join(korean_parser.morphs(doc)) + '\n')
            morphs_label_f.write('{}\n'.format(0))
        except:
            pass

    logger.info('%s neg document parsing completed.' % _type)

    for doc in pos:
        try:
            nouns_doc_f.write(','.join(korean_parser.nouns(doc)) + '\n')
            nouns_label_f.write('{}\n'.format(1))
        except:
            pass
        try:
            morphs_doc_f.write(','.join(korean_parser.morphs(doc)) + '\n')
            morphs_label_f.write('{}\n'.format(1))
        except:
            pass

    logger.info('%s pos document parsing completed.' % _type)

    nouns_doc_f.close()
    nouns_label_f.close()
    morphs_doc_f.close()
    morphs_label_f.close()
Example no. 12
def get_tags(text, ntags=50, multiplier=10):
	h = Hannanum()
	nouns = h.nouns(text)
	count = Counter(nouns)

	# for word,cnt in count.most_common(ntags):
	#	print(word,cnt)
	return count
Example no. 13
def review_preprocessing(data):
    # Hannanum package
    pos_tagger = Hannanum()

    # tokenize the news text and extract only the nouns
    pos_nouns = pos_tagger.nouns(data)

    return ' '.join(pos_nouns)
Example no. 14
def text_mining(title_list, ntags=50, multiplier=1):
    h = Hannanum()
    data_nouns = []
    for title in title_list:
        data_nouns.extend(h.nouns(title))
    
    count = Counter(data_nouns)
    
    return [{'color': color(),'tag':n,'size':int(c*multiplier*0.5)} for n,c in count.most_common(ntags)]
Example no. 15
def comment_freq(youtube_data):
    # youtuber_csv_data = dm.GetData(url, con)
    # if youtuber_csv_data is None:
    #     print("No data")
    #     return None
    # video_num = int(input("Which video should be analyzed? "))
    # youtube_data = dm.GetData(youtuber_csv_data[video_num][0], password)  >> implemented in main.py
    if youtube_data is None:
        return None
    comment = []
    for i in range(len(youtube_data)):
        comment.append(youtube_data[i][2])

    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+",
        flags=re.UNICODE)
    han = re.compile(r'[ㄱ-ㅎㅏ-ㅣ!?~,".\n\r#\ufeff\u200d]')

    comment_noemot = []
    for i in comment:
        tokens = re.sub(emoji_pattern, "", i)
        tokens = re.sub(han, "", tokens)
        comment_noemot.append(tokens)

    nouns = []
    h = Hannanum()

    for i in comment_noemot:
        n = h.nouns(i)
        nouns.append(n)

    noun_list = []
    for i in range(len(nouns)):
        for j in range(len(nouns[i])):
            noun_list.append(nouns[i][j])

    counts = Counter(noun_list)
    tags = counts.most_common(30)

    wc = WordCloud(font_path='C:\\Windows\\Fonts\\gulim.ttc',
                   background_color='black',
                   width=800,
                   height=600)

    cloud = wc.generate_from_frequencies(dict(tags))
    plt.figure(figsize=(10, 8))
    plt.axis('off')
    plt.imshow(cloud)
    plt.show()
Example no. 16
def tokenization(cleaned_docs):
    han = Hannanum()
    tokenized_docs = []
    while ' ' in cleaned_docs:
        cleaned_docs.remove(' ')
    for doc in cleaned_docs:
        nouns_in_doc = []
        for noun in han.nouns(doc):
            if len(noun) > 1: nouns_in_doc.append(noun)
        tokenized_docs.append(nouns_in_doc)
    return tokenized_docs
Example no. 17
def update_words_all():
    hannanum = Hannanum()
    db = db_connector.DbConnector()
    song_list = db.select_all()
    for song in song_list:
        if song['lyrics'] is not None and song['words'] is None:
            words = hannanum.nouns(song['lyrics'])
            words = sorted(set(words))
            update_words(song['song_id'], ' '.join(words))

    print('Words extraction done!')
Example no. 18
    def wordAnalysis(text):
        myHannanum = Hannanum()

        print("text : " + text)

        replace_text = re.sub("[!@#$%^&*()_+]", " ", text)

        print("replace_text : " + replace_text)

        analysis_text = (" ".join(myHannanum.nouns(replace_text)))

        return analysis_text
Example no. 19
 def reduceToWords(self):
     hannanum = Hannanum()
     words = ''
     #for word in hannanum.nouns(unicode(texts, 'UTF-8')):
     if (self.result != ''):
         for word in hannanum.nouns(self.result):
             word = re.sub("[(*&]", "", word)
             if (len(word) > 1): words = word + '\n' + words
         #for end
         self.result = words
         print(words)
     # if end
     return self
Example no. 20
    def extract(self):  # extract nouns with Hannanum
        hnn = Hannanum()
        # merge = str(self.merge_sentence.encode('utf-8'), encoding='utf-8')  # could not resolve the encoding issue *
        merge = self.merge_sentence
        nouns = hnn.nouns(merge)

        # remove special characters (rebuild the list; reassigning the loop variable would not modify it)
        nouns = [n.replace("'", "").replace(",", "") for n in nouns]

        processed = [n for n in nouns if len(n) >= 2]  # min length 2
        count = Counter(processed)

        self.tags = count.most_common(20)  # keep the 20 most common tags
Example no. 21
 def reduceToWords(self) :
   hannanum = Hannanum()
   words = ''
   #for word in hannanum.nouns(unicode(texts, 'UTF-8')):
   if(self.result != '') :
     for word in hannanum.nouns(self.result):
       word = re.sub("[(*&]", "", word)
       if(len(word) > 1): words = word + '\n' + words
     #for end
     self.result =  words
     print(words)
   # if end
   return self
Example no. 22
def crawl():
    global hannanum
    if hannanum is None:
        hannanum = Hannanum()
    if jpype.isJVMStarted():
        jpype.attachThreadToJVM()
    media = Media.objects.all()
    articles = Article.objects.all()
    count = 0
    all = 0
    for medium in media:
        links = get_URLs(medium.rss_list)
        #print(links)
        upper_bound = len(links)

        all += upper_bound

        for link in links:
            #print(link)
            if Article.objects.filter(article_url=link).exists():
                continue
            try:
                article = get_article(link)
            except:
                print("Fail:%s" % link)
                continue
            #print(link)
            title = article.title
            content = article.text
            nouns = hannanum.nouns(article.text)
            morphemed_content = " ".join(nouns)
            writer = ''
            if len(article.authors) == 0:
                writer = 'anonymous'
            else:
                writer = article.authors[0]
            try:
                articles.create(
                    title=title,
                    content=content,
                    morphemed_content=morphemed_content,
                    media=medium,
                    writer=writer,
                    article_url=link,
                )
                count += 1
            except:
                print("Fail:%s,title:%s" % (link, title))
                continue
    return (count, all)
Example no. 23
def insert_summary():
    mongoDB = myMongoDB("CapstoneTest")
    #fasttext.util.download_model('ko', if_exists='ignore')
    ft = fasttext.load_model('./models/cc.ko.300.bin')

    total_clean_sentence = []
    string_id = []

    for content in mongoDB.collected.find({}, {"_id": 1, "content": 1}):
        cleaned_sentence = []
        clean_sentence = []
        string_id.append(list(content.values())[0])
        string = list(content.values())[1]
        string = string.replace(u'\xa0', u' ')
        string = string.replace(u'\n', u' ')
        string = string.replace(u'\r', u' ')
        clean_sentence.append(sent_tokenize(string))
        for i in clean_sentence:
            for j in i:
                cleaned_sentence.append(j)
            total_clean_sentence.append(cleaned_sentence)

    temp = []
    hannanum = Hannanum()
    for clean_sentence in total_clean_sentence:
        for s in clean_sentence:
            noun = hannanum.nouns(s)
            for i in noun:
                temp.append(i)

    for i in temp:
        word_vector_arr = np.asarray(ft[i], dtype='float32')
        word_dict[i] = word_vector_arr

    string_idx = 0

    for clean_sentence in total_clean_sentence:
        article_embedding = articles_to_vectors(clean_sentence)
        similar_matrix = similarity_matrix(article_embedding)
        score = calculate_score(similar_matrix)
        summaryShort_list = summaryShort(clean_sentence, score)
        summaryMed_list = summaryMed(clean_sentence, score)
        summaryLong_list = summaryLong(clean_sentence, score)
        mongoDB.collected.update_one({'_id': string_id[string_idx]}, {
            '$set': {
                'sum_short': summaryShort_list,
                'sum_mid': summaryMed_list,
                'sum_long': summaryLong_list
            }
        })
        string_idx += 1
Example no. 24
    def post(
        self,
        request,
        format=None
    ):  #JSON: "key" : "value" --> "searchWord" : "보온보냉팩 버리는 방법 좀 알려줘?"

        searchSentence = request.data['searchWord']  # searchWord must be supplied by the Android client
        if "캔" in searchSentence:
            print("Okt")
            okt = Okt()
            Nouns = okt.nouns(searchSentence)
        else:
            print("Hannanum")
            ha = Hannanum()
            Nouns = ha.nouns(searchSentence)
        print('nouns: ', Nouns)
        Idx = []
        temp = []
        small_list = list()
        for word in Nouns:
            smallIdx = WasteCategoryS.objects.filter(cg_name__contains=word)
            for val in smallIdx:
                small_list.append(val)

            if len(smallIdx) == 0:
                print('len 0')
                middleIdx = WasteCategoryM.objects.filter(
                    cg_name__contains=word)
                for ob in middleIdx:
                    Idx.append(ob.idx)
                continue

            for ob in smallIdx:
                Idx.append(ob.cg_middle_idx.idx)

        print(Idx)
        dischargeTipsList = []
        for idx in Idx:
            dischargeTipsList.append(
                DischargeTips.objects.get(category_m_idx=idx))
        serializer = DischargeTipsSerializer(dischargeTipsList, many=True)

        waste_serializer = WasteCategorySSerializer(small_list, many=True)

        return Response(
            {
                "matching_name": waste_serializer.data,
                "textVoiceDischargeTips": serializer.data
            },
            status=status.HTTP_201_CREATED)
Example no. 25
def main(args=None):

    print("Loading pdf files...")

    outfp = extract_text(files=search("./sample_pdf"), outfile="temp.txt")
    outfp.close()

    print("Loading data...")
    f = open("temp.txt", encoding='UTF8')
    elements = f.readlines()
    elements = [x for x in elements if x != "\n"]
    elements = [x.rstrip() for x in elements]

    hannanum = Hannanum()
    korean_list = []
    korean_noun_list = []
    english_list = []

    korean = re.compile('[^ ㄱ-ㅣ가-힣]+')

    for element in elements:
        korean_list.append(korean.sub("", element))

    korean_list = [x.strip() for x in korean_list]
    korean_list = [x for x in korean_list if x != '']

    print("Parsing Korean words...")
    for korean in korean_list:
        korean_noun_list += hannanum.nouns(korean)

    korean_noun_list = [x for x in korean_noun_list if len(x) > 1]
    print("Korean list : ", korean_noun_list)

    print("Parsing English words...")
    for element in elements:
        english_list.append(re.sub('[^a-zA-Z]', '', element))

    english_list = [x.strip() for x in english_list]
    english_list = [x for x in english_list if x != '']
    print("English list : ", english_list)

    korean_counter = collections.Counter(korean_noun_list)
    print("10 most common Korean words:", korean_counter.most_common(10))

    english_counter = collections.Counter(english_list)
    print("10 most common English words:", english_counter.most_common(10))

    draw_word_cloud(korean_noun_list, "Korean")
    draw_word_cloud(english_list, "English")
    print("Done")
Example no. 26
def draw_cloud(reviews):
    tags = {}
    # r = lambda: random.randint(0,255)
    # color = lambda: (r(), r(), r())
    for review in reviews:
        h = Hannanum()
        nouns = h.nouns(review)
        count = dict(Counter(nouns))
        tags = {
            k: tags.get(k, 0) + count.get(k, 0)
            for k in set(tags) | set(count)
        }
    gen_stylecloud(text=tags,
                   output_name="wordcloud.png",
                   icon_name="fas fa-square-full",
                   background_color="white",
                   font_path="Jua-Regular.ttf",
                   size=1024)
Example no. 27
def text_preprocessing(queryset):
    # print(queryset,len(queryset))
    hannanum = Hannanum()
    getNum = 5
    stopword = ['등', '코', '만', '속보', '최초', '4억', '월요일']
    df = pd.DataFrame.from_records(queryset)
    # print(df, type(df))
    # df['title_nouns'] = df['article_title'].apply( lambda x : hannanum.nouns( wordcloud01.clean_text( x ) ) ); print(df['title_nouns']); print(df['title_nouns'].sum())
    # print('apply시작')
    df['title_nouns'] = df['article_title'].apply(
        lambda x: Counter(hannanum.nouns(wordcloud01.clean_text(x))))
    # print('sum시작')
    total_counter = df['title_nouns'].sum()
    # print('stopword')
    for word in stopword:
        del total_counter[word]
    # print(type(total_counter), total_counter.most_common( getNum ))
    result = total_counter.most_common(getNum)
    return result
Example no. 28
def rec_hashtag(request):
    """
        API used for hashtag recommendation.

        ---
    """
    content = request.data.get('article')
    hannanum = Hannanum()
    keywords = hannanum.nouns(content)
    count = Counter(keywords)
    count = sorted(sorted(count.items(), key=operator.itemgetter(0)),
                   key=lambda x: x[1],
                   reverse=True)
    data = []
    for k in count:
        data.append(k[0])
        if len(data) > 2:
            break
    return Response(data)
Example no. 29
    def morp_analysis(self):
        noun_text = re.sub('[-_=+,#/\?:^$.@*\"※~&%ㆍ·!』\\‘’|\(\)\[\]\<\>`\'…》]', '', str(self))

        noun_text = re.sub('\n', '', noun_text)

        hannanum = Hannanum()
        text_list = hannanum.nouns(noun_text)  # extract nouns

        word_list = pd.Series(text_list)
        result = word_list.value_counts().head(10)
        result_values = list(result.values)

        for i in range(len(result_values)):
            result_values[i] = np.int16(result_values[i]).item()

        freq_lst = []
        for i in range(len(result)):
            freq_lst.append({'word': result.keys()[i], 'freq': result_values[i]})

        return freq_lst
Example no. 30
def get_tags(text, ntags=50, multiplier=2):
    h = Hannanum()
    nouns = h.nouns(text)
    long_nouns = list()
    for n in nouns:
        if len(n) >= 2:  # keep only nouns of length 2 or more
            long_nouns.append(n)

    count = Counter(long_nouns)
    word_list = list()
    for w in count.most_common(ntags):
        if w[1] >= 10:  # keep only nouns that appear at least 10 times
            print(w)
            word_list.append(w)

    r = lambda: random.randint(0, 255)
    color = lambda: (r(), r(), r())

    return [{'color': color(), 'tag': n, 'size': c*multiplier}\
                for n, c in word_list]
Example no. 31
def wordcloud_textmining(text):

    H = Hannanum()
    twttierMS = np.array(Image.open('./static/images/동그라미.png'))

    a = " ".join(H.nouns(text))
    wc = WordCloud(font_path="./font/BMEULJIROTTF.ttf",
                   background_color="white",
                   width=1000,
                   height=1000,
                   mask=twttierMS,
                   max_words=150,
                   max_font_size=200)

    wc.generate(a)

    fig = plt.figure()
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')

    plt.savefig('./static/Wordcloud.png')
    return fig
Example no. 32
def get_text():
    full_data = ""

    # fetch every JSON file in the folder
    files = glob.glob(file_dir + '/*.json')
    corpus = list()
    h = Hannanum()

    for fname in files:
        # print("fname")
        print(fname)
        file_name = fname.split('\\')[1]
        file_idx = file_name.split('.')[0]
        print(file_idx)  # this is effectively the file name
        idx_list.append(file_idx)  # keep the order in which the JSON files were read

        with open(fname, encoding='UTF8') as json_file:
            json_data = json.load(json_file)
            # get the string stored under the "app_detail" key

            json_string = json_data["app_detail"]  # a single document
            # corpus.append(json_string)  # => building the corpus this way would include more than just nouns

            # keep only nouns in the corpus (morphological analysis)

            nouns = h.nouns(json_string)

            # convert the list to a string
            nouns = ' '.join(nouns)  # join the list elements with spaces

            # print("nouns:")
            # print(nouns)
            corpus.append(nouns)  # the corpus will contain nouns only
            ######

            app_category = json_data["app_category"]
            ### print(json_string)

    return full_data, app_category, corpus
Example no. 33
def preTexts(comments, videoId):
    hannanum = Hannanum()
    main = pd.Series(dtype=object)
    for text in comments:
        text = processText(text)
        text_list = hannanum.nouns(text)
        # Series.append was removed in pandas 2.0; use pd.concat instead
        main = pd.concat([main, pd.Series(text_list, dtype=object)], ignore_index=True)

    dir = os.path.dirname(os.path.abspath(__file__))
    main.to_csv(f'{dir}/data/{videoId}.csv')
    result = main.value_counts().to_list()
    idx_result = main.value_counts().index.to_list()
    loop = 30
    if len(result) < loop:  # result is a plain list, so use len() rather than .shape
        loop = len(result)

    ret = []
    for i in range(loop):
        ret.append([idx_result[i], result[i]])

    #result.to_csv(f'{dir}/data/comm{videoId}.csv')
    return ret
Example no. 34
 def get_tags(self,text, ntags=10, multiplier=10):
     h = Hannanum()
     nouns = h.nouns(text)
     count = Counter(nouns)
     return [{'tag': n, 'size': c*multiplier }\
                 for n, c in count.most_common(ntags)]
Example no. 35
def get_tags(text, ntags=50, multiplier=10):
    h = Hannanum()
    nouns = h.nouns(text)
    count = Counter(nouns)
    return [{"color": color(), "tag": n, "size": c * multiplier} for n, c in count.most_common(ntags)]
Example no. 36
def WordCount(corpus):
    h = Hannanum()
    nouns = h.nouns(corpus)
    frequency = Counter(nouns)
    return frequency
Example no. 37
def get_tags(text, ntags=int(sys.argv[2]), multiplier=10):
    h = Hannanum()
    nouns = h.nouns(text)
    count = Counter(nouns)
    return [{ 'color': color(), 'tag': n, 'size': c*multiplier }\
                for n, c in count.most_common(ntags)]
Example no. 38
class AnalysisDiction:
    """
    This class analyzes Korean text using the Hannanum, Twitter, and (optionally) Mecab taggers
    """
    def __init__(self, on_han=False, on_twitter=False, on_mecab=False):    # maybe move to init of analysis_app

        """
        Allocate Hannanum, Twitter, or Mecab tagger instances
        :param on_han: create a Hannanum instance
        :param on_twitter: create a Twitter instance
        :param on_mecab: create a Mecab instance
        """
        if on_han is True:
            self.han = Hannanum()
        if on_twitter is True:
            self.twitter = Twitter()
        # if on_mecab is True:
        #     self.mecab = Mecab()

    def analyzer_hannaum(self, string_data, mode):
        """
        This method is for Hannanum. It acts differently depending on the mode.
        :param string_data: string data to analyze
        :param mode: analysis mode ('morphs', 'nouns' or 'pos')
        :return: the analysis result, or False if the mode is not recognized
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._hannanum
        """
        if mode == 'morphs':
            return self.han.morphs(string_data)
        elif mode == 'nouns':
            return self.han.nouns(string_data)
        elif mode == 'pos':
            return self.han.pos(string_data)
        else:
            return False

    def analyzer_mecab(self, string_data, mode):
        """
        This method is for Mecab. It acts differently depending on the mode.
        :param string_data: string data to analyze
        :param mode: analysis mode ('morphs', 'nouns' or 'pos')
        :return: the analysis result, or False if the mode is not recognized
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#mecab-class
        """
        if mode == 'morphs':
            return self.mecab.morphs(string_data)
        elif mode == 'nouns':
            return self.mecab.nouns(string_data)
        elif mode == 'pos':
            return self.mecab.pos(string_data)
        else:
            return False

    def analyzer_twitter(self, string_data, mode):
        """
        This method is for Twitter. It acts differently depending on the mode.
        :param string_data: string data to analyze
        :param mode: analysis mode ('morphs', 'nouns', 'pos' or 'posmore')
        :return: the analysis result, or False if the mode is not recognized
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._twitter
        """
        if mode == 'morphs':
            return self.twitter.morphs(string_data)
        elif mode == 'nouns':
            return self.twitter.nouns(string_data)
        elif mode == 'pos':
            return self.twitter.pos(string_data)
        elif mode == 'posmore':
            return self.twitter.pos(string_data, True, True)
        else:
            return False
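A minimal usage sketch for the class above (the sample sentence is made up; analyzer_hannaum keeps its original spelling, and Twitter is a deprecated alias of Okt in recent KoNLPy releases):

from konlpy.tag import Hannanum, Twitter

analyzer = AnalysisDiction(on_han=True, on_twitter=True)
print(analyzer.analyzer_hannaum("한국어 형태소 분석기 예제입니다.", 'nouns'))
print(analyzer.analyzer_twitter("한국어 형태소 분석기 예제입니다.", 'pos'))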