Example #1
class AnalysisDiction:
    """
    This class analyzes Korean text using the KoNLPy Kkma and Twitter analyzers.
    """
    def __init__(self, on_kkma=False, on_twitter=False):    # maybe move to init of analysis_app
        """
        Allocate a Kkma or Twitter analyzer instance.
        :param on_kkma: if True, create a Kkma instance
        :param on_twitter: if True, create a Twitter instance
        """
        if on_kkma:
            self.kkma = Kkma()
        if on_twitter:
            self.twitter = Twitter()

    def analyzer_kkma(self, string_data, mode):
        """
        Analyze a string with Kkma. Behaviour depends on the given mode.
        :param string_data: string data to analyze
        :param mode: one of 'morphs', 'nouns' or 'pos'
        :return: the analysis result, or False if the mode is not recognized
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._kkma
        """
        if mode == 'morphs':
            return self.kkma.morphs(string_data)
        elif mode == 'nouns':
            return self.kkma.nouns(string_data)
        elif mode == 'pos':
            return self.kkma.pos(string_data)
        else:
            return False

    def analyzer_twitter(self, string_data, mode):
        """
        Analyze a string with Twitter. Behaviour depends on the given mode.
        :param string_data: string data to analyze
        :param mode: one of 'morphs', 'nouns', 'pos' or 'posmore'
        :return: the analysis result, or False if the mode is not recognized
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._twitter
        """
        if mode == 'morphs':
            return self.twitter.morphs(string_data)
        elif mode == 'nouns':
            return self.twitter.nouns(string_data)
        elif mode == 'pos':
            return self.twitter.pos(string_data)
        elif mode == 'posmore':
            return self.twitter.pos(string_data, True, True)
        else:
            return False
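A minimal usage sketch for the class above, assuming konlpy is installed; Kkma and Twitter come from konlpy.tag, and the sample sentence is arbitrary:

from konlpy.tag import Kkma, Twitter

diction = AnalysisDiction(on_kkma=True, on_twitter=True)
print(diction.analyzer_kkma("한국어 형태소 분석 예제입니다.", 'nouns'))     # list of nouns
print(diction.analyzer_twitter("한국어 형태소 분석 예제입니다.", 'posmore'))  # normalized, stemmed POS tags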
Example #2
	def get_noun(self):
		print("[*] 명사 추출 시작")
		start_time = time.time()
		twitter = Twitter()
		for s in self.word_list:
			temp = twitter.nouns(s)
			for t in temp:
				self.noun_list.append(str(t))

		end_time = time.time()
		print("[*] 명사 추출 완료(소요시간 : {0})".format(str((end_time-start_time))))
		print("[*] 추출된 명사 길이 : {0}".format(str(len(self.noun_list))))

		# frequency analysis
		count = Counter(self.noun_list)
		#tag = count.most_common( int(len(count)*(15/100)) )
		tag = count.most_common(50)
		taglist = pytagcloud.make_tags(tag, maxsize=100)
		pytagcloud.create_tag_image(taglist, 'wordcloud.jpg', size=(800, 600), fontname='Nanum Gothic Coding', rectangular=False)
Example #3
def keywords_function(result):
    # keyword extraction: https://dalulu.tistory.com/108
    try:
        nlpy = Twitter()
        nouns = nlpy.nouns(result)
        count = Counter(nouns)

        tag_count = []
        tags = []

        for n, c in count.most_common(200):
            dics = {'tag': n, 'count': c}
            if len(dics['tag']) >= 2 and len(tags) <= 49:
                tag_count.append(dics)
                tags.append((dics['tag'], dics['count']))

        return tags
    except Exception:
        return []
def get_tags(text, max_count, min_length):
    t = Twitter()

    if max_count is None or min_length is None:
        max_count = 20
        min_length = 1
        
    nouns = t.nouns(text)
    processed = [noun for noun in nouns if len(noun) >= min_length]
    count = Counter(processed)

    result = {}
    for n, c in count.most_common(max_count):
        result[n] = c

    if len(result) == 0:
        result["내용이 없습니다."] = 1

    return result
Example #5
def get_tags(text, max_count, min_length):
    t = Twitter()

    # extract only Korean nouns.
    nouns = t.nouns(text)

    # preprocess the data.
    processed = [n for n in nouns if len(n) >= min_length]
    count = Counter(processed)
    result = {}

    # take up to max_count of the most frequent words
    for n, c in count.most_common(max_count):
        result[n] = c

    # if not a single word was extracted
    if len(result) == 0:
        result["내용이 없습니다."] = 1

    return result
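A quick call sketch for the get_tags helper above; the imports are assumed to sit at the top of the same script, and the sample sentence is arbitrary:

from collections import Counter
from konlpy.tag import Twitter

tags = get_tags("한국어 명사 추출을 테스트하는 문장입니다. 명사 빈도를 셉니다.", max_count=10, min_length=2)
print(tags)  # e.g. {'명사': 2, '추출': 1, ...} depending on the analyzer output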
Example #6
def make_korean():
    global background, stopwords
    text = open(name, encoding='utf-8').read()
    t = Twitter()
    token = t.nouns(text)
    count_voca = nltk.Text(token, name="단어카운팅")
    count_voca.vocab()
    voca = count_voca.vocab().most_common(150)
    font = "/usr/share/fonts/NanumFont/NanumGothicBold.ttf"
    wordcloud = WordCloud(font_path=font,
                          max_words=2000,
                          relative_scaling=0.2,
                          background_color="white",
                          mask=background).generate_from_frequencies(
                              dict(voca))
    plt.figure(figsize=(12, 12))
    plt.imshow(wordcloud, interpolation='bilinear')
    # plt.imshow(wordcloud.recolor(color_func=grey_color, random_state=3), interpolation='bilinear')
    plt.axis('off')
    plt.show()
Example #7
def make_wordcloud(cmt_set):

    print("before nlp")
    nlp = Twitter()
    print("nlp = Twitter()")
    nouns = nlp.nouns(cmt_set)
    print("nouns = nlp.nouns(buff)")
    count = Counter(nouns)
    print("count = Counter(nouns)")
    word_cloud = WordCloud(
        font_path='./HoonWhitecatR.ttf',
        max_words=20,
        font_step=5,
        mode='RGBA',
        background_color=None).generate_from_frequencies(count)
    print(word_cloud.words_)
    del nlp, nouns, count  # to avoid a memory error: 'nlp' raises an error when it is used more than once
    return word_cloud  # TODO: save the image and return its location
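As a follow-up to the TODO above, the wordcloud package's to_file method could persist the image; this is a sketch, not part of the original code, and it assumes the imports below sit in the same script and that the font file referenced above exists:

from collections import Counter
from konlpy.tag import Twitter
from wordcloud import WordCloud

cloud = make_wordcloud("크롤링한 댓글 문장들을 하나의 문자열로 합친 것")
cloud.to_file("wordcloud.png")  # write the rendered cloud to disk; the path could then be returned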
Example #8
def main():

    df = pd.read_csv(r'C:\Users\Hajin_2\python\python\BigData_prj\news.csv')

    titles = ""
    for i in df.title:
        titles = titles + i

    nlp = Twitter()
    nouns = nlp.nouns(titles)
    count = Counter(nouns)

    wordInfo = dict()
    for tags, counts in count.most_common(50):
        if (len(str(tags)) > 1):
            wordInfo[tags] = counts
            print("%s : %d" % (tags, counts))

    showGraph(wordInfo)
    wordCloud(wordInfo)
def lexranker(text,code,date):

    text = text.replace('\\n','.')
    text2 = re.sub('[^가-힝0-9a-zA-Z\\s\\.]', '', text)

    lexrank =LexRank()
    #print(text2)
    lexrank.summarize(text2)
    summaries = lexrank.probe(3)
    word = Twitter()
    out = []
    print(summaries)
    for summary in summaries:
        out += word.nouns(summary)

    word = list(set(out))
    share = Share(code)
    startprice = share.get_open()
    endprice = share.get_price()
    for part in word:
        save_record(part, code, startprice, endprice, date)
Example #10
def main():
    openFileName = 'KakaoTalk'
    cloudImagePath = openFileName + '.jpg'

    rfile = open('KakaoTalk.txt', 'rt', encoding='UTF8')
    message = rfile.read()

    # [CODE 4]
    nlp = Twitter()
    nouns = nlp.nouns(message)
    count = Counter(nouns)

    # [CODE 5]
    wordInfo = dict()
    for tags, counts in count.most_common(50):
        if (len(str(tags)) > 1):
            wordInfo[tags] = counts
            print("%s : %d" % (tags, counts))

    showGraph(wordInfo)
    saveWordCloud(wordInfo, cloudImagePath)
Example #11
    def get_tags(self, text, date, ntags=100):
        spliter = Twitter()  # konlpy Twitter object
        frequency = {}
        nouns = spliter.nouns(text)  # use the nouns function to extract only the nouns from text
        match_pattern = re.findall(r'\b[a-zA-Z]{3,15}\b', text)
        count = Counter(nouns)  # create a Counter from the extracted nouns
        return_list = []  # list that will hold the noun frequencies

        for word in match_pattern:
            count1 = frequency.get(word, 0)
            frequency[word] = count1 + 1

        frequency_list = frequency.keys()
        for words in frequency_list:
            temp_en = {'tag': words, 'date': date}
            return_list.append(temp_en)

        for n, c in count.most_common(ntags):
            temp_ko = {'tag': n, 'date': date}
            return_list.append(temp_ko)
        return return_list
Example #12
def get_tags(open_text_file):
    nlp = Twitter()
    nouns_list = []
    token_list = []
    i = 0
    for line in open_text_file:
        # for line in tqdm(open_text_file):
        print(line)
        text = line
        text = regex.sub(u"[\n]", " ", text)
        n = nlp.nouns(text)
        token = nlp.morphs(text)
        for value in n:
            nouns_list.append(value)
        for j in token:
            token_list.append(j)
        # if i == 400:
        #     break
        # else:
        #     i+=1
    return nouns_list, token_list
Example #13
class SentenceTokenizer(object):
    def __init__(self):
        self.kkma = Kkma()
        self.twitter = Twitter()
        self.stopwords = ['중인' ,'만큼', '마찬가지', '꼬집었', "연합뉴스", "데일리", "동아일보", "중앙일보", "조선일보", "기자"
                          ,"아", "휴", "아이구", "아이쿠", "아이고", "어", "나", "우리", "저희", "따라", "의해", "을", "를", "에", "의", "가",
                         "JTBC", ]
        
    # take a URL, extract the article body, and return its sentences
    def url2sentences(self, url):
        article = Article(url, language='ko')
        article.download()
        article.parse()
        sentences = self.kkma.sentences(article.text)
        
        for idx in range(0, len(sentences)):
            if len(sentences[idx]) <= 10:
                sentences[idx-1] += (' ' + sentences[idx])
                sentences[idx] = ''

        return sentences
    
    # take raw text and return its sentences
    def text2sentences(self, text):
        sentences = self.kkma.sentences(text)
        for idx in range(0, len(sentences)):
            if len(sentences[idx]) <= 10:
                sentences[idx-1] += (' ' + sentences[idx])
                sentences[idx] = ''
                
        return sentences
    
    # take sentences and return their nouns
    def get_nouns(self, sentences):
        nouns = []
        for sentence in sentences:
            if sentence != '':
                nouns.append(' '.join([noun for noun in self.twitter.nouns(str(sentence))
                                       if noun not in self.stopwords and len(noun) > 1]))
        return nouns
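A hedged usage sketch for the tokenizer above; it assumes konlpy's Kkma and Twitter are importable (newspaper's Article is only needed for url2sentences), and the sample text is arbitrary:

from konlpy.tag import Kkma, Twitter

tokenizer = SentenceTokenizer()
sentences = tokenizer.text2sentences("첫 번째 문장입니다. 두 번째 문장입니다. 세 번째 문장은 조금 더 깁니다.")
print(tokenizer.get_nouns(sentences))  # one space-joined noun string per sentence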
Example #14
def main():
    # kkma = Kkma()
    # hannanum = Hannanum()
    # mecab = Mecab()
    twitter = Twitter()
    sentences = []
    words = []
    with open('result') as f:
        for line in f:
            sentences.append(line)

    for s in sentences:
        words.extend(twitter.nouns(s))
        # words.extend(mecab.nouns(s))
        # words.extend(kkma.nouns(s))
        # words.extend(hannanum.nouns(s))

    od = OrderedDict(Counter(words).most_common(30))
    for k, v in od.items():
        if k in IGNORE_WORDS:
            continue
        print(k, '\t', v)
Example #15
class MakeWordCloud:
    def __init__(self, data_list):
        self.category_list = data_list.keys()
        self.word_list = data_list
        self.nlp = Twitter()
        self.cnt = 0

    def make_Word_Cloud(self):
        for data in self.word_list:

            nouns = self.nlp.nouns(self.word_list.get(data))
            nouns = [x for x in nouns if len(x) != 1]
            count = Counter(nouns)
            tags = count.most_common(40)
            taglist = pytagcloud.make_tags(tags, minsize=15, maxsize=50)

            fileName = './img/wordcloud' + str(self.cnt) + '.jpg'
            pytagcloud.create_tag_image(taglist,
                                        fileName,
                                        size=(600, 600),
                                        fontname='korean',
                                        rectangular=True)
            self.cnt += 1
Example #16
def get_tags(text, ntags):
    spliter = Twitter()
    # konlpy Twitter object
    nouns = spliter.nouns(text)
    # use the nouns function to extract only the nouns from text
    count = Counter(nouns)
    # create a Counter from the extracted nouns
    return_list = []  # list that will hold the noun frequencies

    value_to_remove = ["곳","수","것","맛","때","정말","더","저","진짜","바로",
                       "이","여기","날","꼭","안","거","그","또","저희","정도",
                       "제","좀","타고","요","보고","그냥","중","때문","조금",
                       "아주","다른","듯","쪽","등","이번","내","총","전","나",
                       "속"] #삭제할 명사

    for n, c in count.most_common(ntags):
        if n not in value_to_remove:
            temp = {'tag': n, 'count': c}
            return_list.append(temp)
    # most_common(n) takes an integer and returns the n most frequent nouns,
    # ordered from the most frequent down; each noun and its count
    # are stored in return_list.
    return return_list
Example #17
def keywords_extract(filename):
    f = codecs.open('./reviews/' + filename, "r", "utf-8")
    data = f.read()

    nlp = Twitter()
    nouns = nlp.nouns(data)

    # drop unwanted words that appear in dumb
    nouns = [i for i in nouns if i not in dumb]

    count = Counter(nouns)
    words = count.most_common(40)

    keyword = words[0:3]

    for i in keyword:
        word_freq_bid = [i[0], i[1], filename[:-4]]
        keywords.append(word_freq_bid)

    f.close()
    return keywords
def result(request):
    text = parse(request.GET['keyword'])
    words = text.split()

    nlpy = Twitter()
    nouns = nlpy.nouns(text)
    count = Counter(nouns)

    tag_count = []
    tags = []

    for n, c in count.most_common(100):
        dics = {'tag': n, 'count': c}
        if len(dics['tag']) >= 2 and len(tags) <= 49:
            tag_count.append(dics)
            tags.append(dics['tag'])

    return render(
        request, 'result.html', {
            'keyword': request.GET['keyword'],
            'full': text,
            'wordCount': len(words),
            'tags': tag_count
        })
Example #19
def tag_counting():
    nlpy = Twitter()
    nouns = nlpy.nouns(
        u'대학에서 DB, 통계학, 이산수학 등을 배웠지만..., 대학, 대학, DB, DB, 통계학, 배웠지만, DB, Db, 바보, 바보'
    )
    count = Counter(nouns)
    print(nouns)

    tag_count = []
    tags = []

    for n, c in count.most_common(100):
        dics = {'tag': n, 'count': c}
        if len(dics['tag']) >= 2 and len(tags) <= 49:
            tag_count.append(dics)
            tags.append(dics['tag'])

    for tag in tag_count:
        print(" {:<14}".format(tag['tag']), end='\t')
        print("{}".format(tag['count']))

    print("\n---------------------------------")
    print("     명사 총  {}개".format(len(tags)))
    print("---------------------------------\n\n")
    return tags
def get_tags(text, ntags = 100, multiplier = 1):
	'''Generate the tags, their frequencies and a colour for drawing a word cloud.

	text.......: text data as a list or tuple
	ntags......: number of tags; the default is 100
	multiplier.: user-chosen constant for the relative size; the default is 1
	Return.....: a list of [color, tag, size] entries
	'''
	t = Twitter()						# create the Twitter morphological analyzer
	nouns = []							# list that will hold only the nouns from the analysis
	for sentence in text:				# the text is given as a list or tuple
		for noun in t.nouns(sentence):	# analyze each element (sentence) and keep only the nouns
			nouns.append(noun)			# append those nouns to the nouns list
#			if noun == '영화':			# use this block instead to skip '영화' if that word is expected too often
#				pass
#			else:
#				nouns.append(noun)		# append the remaining nouns to the nouns list
	count = Counter(nouns)				# count how often each distinct noun appears in the nouns list

	# n = tag (a word included in the word cloud)
	# c = how many times that tag occurred
	# the size is c multiplied by multiplier
	return [{'color':color(), 'tag':n, 'size': c * multiplier } \
					for n, c in count.most_common(ntags)]
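A hedged call sketch for the tag builder above; color() is not shown in this example, so a stand-in colour helper is defined here, and the other imports are assumed to sit in the same script:

import random
from collections import Counter
from konlpy.tag import Twitter

def color():  # stand-in for the original helper, which this example does not include
	return '#%06x' % random.randint(0, 0xFFFFFF)

tags = get_tags(['영화가 정말 재미있었다.', '배우들의 연기가 인상 깊었다.'], ntags=20, multiplier=2)
print(tags[:3])  # e.g. [{'color': '#a1b2c3', 'tag': '영화', 'size': 2}, ...]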
Example #21
class SentenceTokenizer(object):
    def __init__(self):
        self.kkma = Kkma()
        self.twitter = Twitter()
        self.stopwords = ['무단전재', '세계일보', '바로가기', '국민일보', '기자', '를', '본문']
        # stopwords

    def t2s(self, text):
        sentences = self.kkma.sentences(text)  # extract sentences from the text
        for idx in range(0, len(sentences)):
            if len(sentences[idx]) <= 10:
                sentences[idx - 1] += (' ' + sentences[idx])
                sentences[idx] = ''
        return sentences

    def get_nouns(self, sentences):  # extract nouns
        nouns = []
        for s in sentences:
            if s != '':
                nouns.append(''.join([
                    noun for noun in self.twitter.nouns(str(s))
                    if noun not in self.stopwords and len(noun) > 1
                ]))
        return nouns
def extract_lda_noun(tmp_lda, category):
    lda_list = []
    for x in tmp_lda:
        for i in x:
            lda_list.append(i)

    counter = collections.Counter(lda_list)
    print(counter.most_common())

    counts_k = counter.most_common()
    topicnouns_k = [counts_k[i][0] for i in range(len(counts_k))]

    tw = Twitter()
    hannanum = Hannanum()
    kkma = Kkma()

    cnouns = []

    for i in range(len(topicnouns_k)):
        t = tw.nouns(topicnouns_k[i])
        h = hannanum.nouns(topicnouns_k[i])
        k = kkma.nouns(topicnouns_k[i])
        if h != [] and k != [] and t != []:
            if set(h) == set(h).intersection(set(k), set(t)):
                cnouns += h
                print(h, k, t)
            else:
                print('not in list', h, k, t)

    df = pd.DataFrame(cnouns)
    df.columns = ['noun']
    df['label'] = np.zeros(len(cnouns))

    df.to_csv('{}_lda_noun_result_no_label.csv'.format(category),
              header=True,
              encoding='cp949')
Example #23
class SentenceTokenizer(object):
    def __init__(self):
        self.twitter = Twitter()
        self.stopwords = [
            '중인', '만큼', '마찬가지', '꼬집었', "연합뉴스", "데일리", "동아일보", "중앙일보", "조선일보",
            "기자", "아", "휴", "아이구", "아이쿠", "아이고", "어", "나", "우리", "저희", "따라",
            "의해", "을", "를", "에", "의", "가", "억원", "원장", "때문", "가", "@", "권혜민",
            "이유지", "인턴", "측은", "중앙", "대해", "누가", "지금", "수만", "반면"
        ]

    def url2sentences(self, url):
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'lxml')
        daum3 = soup.select("div > section")
        naver = soup.findAll("div", id="articleBodyContents")
        naver_enter = soup.findAll("div", id="articeBody")
        naver_sports = soup.findAll("div", id="newsEndContents")

        daum_t = soup.select(".head_view > .tit_view")
        naver_t = soup.select("div > #articleTitle")
        navere_t = soup.select(".end_tit")
        navers_t = soup.select(".news_headline > .title")

        self.origin_text = []
        text = ''
        sentences = []
        temp = []
        temp2 = []
        self.title = []

        for sent in daum_t:
            self.title = sent.text
        for sent in naver_t:
            self.title = sent.text
        for sent in navere_t:
            self.title = sent.text
        for sent in navers_t:
            self.title = sent.text

        for sent in daum3:
            for unused in soup.select("figcaption"):
                unused.decompose()
            text = sent.text
            temp2.extend(text.split(". "))
        for sent in temp2:
            temp.extend(sent.split("\n"))

        for sent in naver:
            for unused in soup.select("td > font"):
                unused.decompose()
            for unused in soup.findAll("a"):
                unused.decompose()
            for unused in soup.findAll("script"):
                unused.decompose()
            for unused in soup.findAll("span"):
                unused.replace_with('')
            for unused in soup.findAll("p"):
                unused.decompose()
            for unused in soup.findAll("strong"):
                unused.decompose()
            for unused in soup.findAll("br"):
                unused.replace_with('. ')

            text = sent.get_text()
            temp.extend(text.split('. '))

        for sent in naver_enter:
            for unused in soup.findAll("a"):
                unused.decompose()
            for unused in soup.findAll("script"):
                unused.decompose()
            for unused in soup.findAll("span"):
                unused.replace_with('')
            for unused in soup.findAll("p"):
                unused.decompose()
            for unused in soup.findAll("br"):
                unused.replace_with('. ')
            text = sent.get_text()
            temp.extend(text.split('. '))

        for sent in naver_sports:
            for unused in soup.findAll("a"):
                unused.decompose()
            for unused in soup.findAll("script"):
                unused.decompose()
            for unused in soup.findAll("span"):
                unused.replace_with('')
            for unused in soup.findAll("p"):
                unused.decompose()
            for unused in soup.findAll("br"):
                unused.replace_with('. ')
            text = sent.get_text()
            temp.extend(text.split('. '))

        sentences = self.makeSentences(temp)

        return sentences

    def text2sentences(self, text):
        self.origin_text = []
        self.title = ''
        jpype.attachThreadToJVM()
        temp_1 = text.split(". ")
        temp_2 = []
        temp_3 = []
        for sent in temp_1:
            temp_2 = sent.split("\r\n\r\n")
            temp_3.extend(temp_2)

        print(temp_3)

        sentences = self.makeSentences(temp_3)

        return sentences

    def makeSentences(self, new_temp):
        idx_r = []
        a = 0
        b = -1
        quotes = []
        temp = []
        new_string = ""

        for idx in range(len(new_temp)):
            if (new_temp[idx].count('\"') + new_temp[idx].count('“') +
                    new_temp[idx].count('”')) % 2 == 1:
                quotes.append(idx)

        for idx in range(len(new_temp)):
            if len(quotes) > 1:
                if idx < quotes[0]:
                    if len(new_temp[idx]) > 0 and new_temp[idx][-1] == '다':
                        new_temp[idx] += "."
                    temp.append(new_temp[idx])
                elif idx >= quotes[0] and idx < quotes[1]:
                    new_string += new_temp[idx]
                    new_string += ". "
                else:
                    new_string += new_temp[idx]
                    if new_string[-1] == '다':
                        new_string += "."
                    temp.append(new_string)
                    new_string = ""
                    quotes.pop(0)
                    quotes.pop(0)
            else:
                if len(new_temp[idx]) > 0 and new_temp[idx][-1] == '다':
                    new_temp[idx] += "."
                temp.append(new_temp[idx])

        for sent in temp:
            self.origin_text.append(sent)

        for idx in range(0, len(temp)):
            if not re.findall(regex, temp[idx]):
                idx_r.append(idx - a)
                a += 1

        for idx in idx_r:
            temp.pop(idx)

        sentences = temp

        for s in sentences[:]:
            if "@" in s:
                sentences.remove(s)

        for idx in sentences[:]:
            if len(idx) > 0:
                if idx[-1] != '.' or idx[len(idx) - 2] != '다':
                    sentences.remove(idx)

        return sentences

    def get_nouns(self, sentences):
        nouns = []
        jpype.attachThreadToJVM()
        for sentence in sentences:
            if sentence != '':
                nouns.append(' '.join([
                    noun for noun in self.twitter.nouns(str(sentence))
                    if noun not in self.stopwords and len(noun) > 1
                ]))
        return nouns
				more2 = 1
			else :
				more2 = 0
		
		# replace the stray periods in the middle with spaces
		first = document.split('.')
		document2=''
		for ff in first:
			document2 = document2 + ' ' + ff

		# split on whitespace
		split = document2.split()
		i = 0
		# after splitting, merge the nouns back together at the same positions
		for word in split:
			pos = t.nouns(split[i])
			for word2 in pos:
				Compound = Compound+word2
			more = 1
			while(more == 1):
				pos2 = Twitter().pos(Compound)
				for poss in pos2:
					if poss[1] != 'Noun':
						more = 2
						pos3 = Twitter().nouns(Compound)
						Compound=''
						for word3 in pos3:
							Compound = Compound + word3
				if more == 2:
					more =1
				else:
Example #25
# 1. open the comment file crawled in the previous post in read-only mode
file = open('./mujung.txt', 'r', encoding='utf-8')
lines = file.readlines()

# 2. store the whole set of comments again in the variable okja
okja = []
for line in lines:
    okja.append(line)
file.close()

twitter = Twitter()

# 4. run morphological analysis on each sentence
sentences_tag = []
for sentence in okja:
    morph = twitter.nouns(sentence)
    sentences_tag.append(morph)
    print(morph)
    print('-' * 30)

print(sentences_tag)
print(len(sentences_tag))
print('\n' * 3)

# 5. keep only the words tagged as nouns or adjectives and collect them in a list
# sentence = []
# for sentence1 in sentences_tag:
#     for word, tag in sentence1:
#         if tag in ['Noun','Adjective']:
#             sentence.append(word)
Example #26
            driver.implicitly_wait(2)
        except:
            print("no such element(2)")

    if more_cnt > 0:
        content = driver.find_elements_by_xpath('//*[@id="makingnotePhase"]')
        for i in content:
            lines += i.text
    else:
        driver.implicitly_wait(1)
    print(lines)

    # extract keywords
    nouns = []
    keywords = []
    for noun in twitter.nouns(lines):
        if (len(noun) == 1 and noun in dic):
            nouns.append(noun)
        elif (len(noun) > 1):
            nouns.append(noun)
        count = Counter(nouns)

    cnt_common = count.most_common(6)  # keep up to the 6 most frequent keywords
    print("--------------------------")
    cnt = 0
    for a in cnt_common:
        cnt = cnt + 1
        if cnt == 1:
            keyword1 = a[0]
        elif cnt == 2:
            keyword2 = a[0]
Example #27
cursor = db.cursor(MySQLdb.cursors.DictCursor)
cursor.execute("set names utf8")

db.query("set character_set_connection=utf8;")
db.query("set character_set_server=utf8;")
db.query("set character_set_client=utf8;")
db.query("set character_set_results=utf8;")
db.query("set character_set_database=utf8;")

cursor.execute("set names utf8")
sql = "select * from Text3 where ArticleNumber=10000"
cursor.execute(sql.encode('utf8'))

row = cursor.fetchone()
document = row['Content'].decode('utf8')

lda_model_path = "/home/ice-kms/LDAModel/iter_1000_Real_lda_10000_pass_100_topicNum_20.lda"
lda = LdaModel.load(lda_model_path)

dictionary_path= "/home/ice-kms/LDAModel/iter_1000_Real_articleDic_10000_compound_topicNum_20.dict"
dictionary = corpora.Dictionary.load(dictionary_path)

ldaModel = lda.show_topics(num_topics=20, num_words=5, formatted=False)

tokens_ko = t.nouns(document)
dicko = dictionary.doc2bow(tokens_ko)
documentTopic = lda[dicko]

print(documentTopic)
Example #28
# -*- coding: utf-8 -*-

import operator

import codecs
with codecs.open("대통령_anger.txt", encoding='utf-8') as f:
    data = [line.split('\t') for line in f.read().splitlines()]

from konlpy.tag import Twitter
twitter = Twitter()

frequency = {}

for line in data:
    text = line[0]
    words = twitter.nouns(text)
    for word in words:
        count = frequency.get(word, 0)
        frequency[word] = count + 1

sorted_freq = sorted(frequency.items(),
                     key=operator.itemgetter(1),
                     reverse=True)

anger_frequency = open("대통령_anger_frequency.txt", 'w', -1, "utf-8")

for t in sorted_freq:
    anger_frequency.write(t[0] + '\t' + str(t[1]) + '\n')

anger_frequency.close()
Example #29
word = []
for t in data:
    T = t.split(' ')
    for t2 in T:
        word.append(t2)

word
# nouns = [t.decode('utf-8') for t in word]
count = Counter(word)
count

hannanum = Hannanum()
nouns = hannanum.nouns(data)

nlp = Twitter()
nouns = nlp.nouns(data)

count = Counter(nouns)

tag2 = count.most_common(40)
taglist = pytagcloud.make_tags(tag2, maxsize=80)

pytagcloud.create_tag_image(taglist, 'wordcloud.jpg', size=(900, 600), fontname='Nobile', rectangular=False)


#! /usr/bin/python2.7
# -*- coding: utf-8 -*-

from collections import Counter
import urllib
import random
Example #30
def get_Noun(text, count=30):
    twit = Twitter()
    noun = twit.nouns(text)
    return noun
Example #31
def analyzer( messages ) :

	# store senders in chat room
	sender_list = set()

	send_ratio = {}
	msg_bytes = {}
	sent_time = {}
	for i in range (0, 7) :
		sent_time[ i ] = {}
		for j in range(0,24) :
			sent_time[ i ][ j ] = 0	

	kcount = {}
	keywords = {}
	sent_month = ""
	temp_keywords = []

	emoticons = 0
	total = 0
	last_sender = ""

	

	intimacy = {}

	twitter = Twitter()

	for msg in messages :
		
		sender_list.add(msg.sender)

		# to calculate intimacy between member
		if len(last_sender) == 0 :
			last_sender = msg.sender
		if last_sender != msg.sender :
			td_increment( intimacy, last_sender, msg.sender, 1)
			td_increment( intimacy, msg.sender, last_sender, 1)
			last_sender = msg.sender

		# check send ratio.
		td_increment(send_ratio, str(msg.datetime)[:7], msg.sender, 1)

		# calculate msg bytes by sender
		td_increment(msg_bytes, str(msg.datetime)[:7], msg.sender, len(msg.contents))
		
		# count k in msg.
		increment(kcount, msg.sender, msg.contents.count(unicode('ㅋ','utf-8')))

		# count emoticons
		if "(emoticon)" in msg.contents or unicode('(이모티콘)', 'utf-8') in msg.contents:
			emoticons = emoticons + 1

		# calculate active time
		td_increment(sent_time, msg.datetime.weekday() , msg.datetime.time().hour, 1)

		# analyze keyword
		if ( is_msg_content(msg.contents) ) :
			if len(sent_month) == 0 :
				sent_month = str(msg.datetime)[:7]
			elif sent_month == str(msg.datetime)[:7] :
				temp_keywords.append(msg.contents)
			elif sent_month != str(msg.datetime)[:7] :
				keywords_list = twitter.nouns(msg.contents)
				for keyword in keywords_list :
					if len(keyword) > 1:
						td_increment(keywords, sent_month, keyword, 1)
				sent_month = str(msg.datetime)[:7]
				del temp_keywords[:]
				temp_keywords.append(msg.contents)

	# in case of 1:1 chat room
	if len(sender_list) == 2 :
		response_time = {}
		last_sender = ""
		last_response_time = timedelta(0)

		for sender in sender_list :
			response_time[sender] = []
		for msg in messages :
			if len(last_sender) == 0 :
				last_sender = msg.sender
			if last_sender != msg.sender :
				last_sender = msg.sender
				response_time[msg.sender].append(msg.datetime - last_response_time)

			last_response_time = msg.datetime


	print "Who sent how much messages? "

	for date in send_ratio :
		print "in " + str(date)
		for sender in send_ratio[date] :
			print str(sender) + " sent " + str(send_ratio[date][sender]) + " messages"
			total = total + int(send_ratio[date][sender])

	print ""

	print "Msg bytes : "

	for date in msg_bytes :
		print "in " + str(date)
		for sender in msg_bytes[date] :
			print str(sender) + " sent " + str(msg_bytes[date][sender]) + " bytes"

	print ""

	for sender in kcount :
		print sender + " wrote " + unicode('ㅋ','utf-8').encode('utf-8') + " " + str(kcount[sender]) + " byte times"

	print ""

	print ""


	# sorted keywords has 'list' type. not dict.
	print "Top 20 most frequently used keywords in your chatroom."
	for date in keywords :
		print "in " + date
		sorted_keywords = sorted(keywords[date].items(), key=lambda x:x[1], reverse = True)
		for i in range(0,20) :
			try :
				print sorted_keywords[i][0] + " : " + str(sorted_keywords[i][1])
			except :
				pass

	print ""


	print "When is the most active moment in this chat room?"
	for week in sent_time :
		print week
		for hour in sorted(sent_time[week]):
			print str(sent_time[week][hour]) + " messages were sent at " + str(hour) + " o'clock"
		
	print ""

	print "you guys used emoticons " + str(emoticons) + " times"

	print ""

	print "intimacy between members"

	if len(sender_list) == 2 : 
		for sender in response_time : 
			print sender
			rt_average = sum(response_time[sender], timedelta()) / len(response_time[sender])
			print "responded in " + str(rt_average) + "in average"

	else : 
		for member in intimacy :
			print member + " : "
			for friends in intimacy[member] :
				print " - " + friends + " " + str(intimacy[member][friends])

	print ""

	print "totally, " + str(total) + " messages were sent"
Example #32
            for day in sad_date:
                if day[0] == (dt.month, dt.day):
                    day.append(line)
                    break

    n_dates = len(anger_date)
    for i in range(n_dates):
        EE = cEmotifyElement()
        day = anger_date[i]
        lines = day[1:]
        if len(lines) > 0:
            for line in lines:
                n_tweets = 1 + int(line[1])
                EE.modAnger(EE.Anger + n_tweets)
                text = line[0]
                words = twitter.nouns(text)
                for word in words:
                    if len(word) > 1:
                        count = EE.AngerKeyword.get(word, 0)
                        EE.addAnger(word, count + n_tweets)
                        lst = anger_tweets.get(word, [])
                        if len(lst) < 5:
                            if lst_name in text:
                                if not text in lst:
                                    lst.append(text)
                        anger_tweets[word] = lst

        day = fear_date[i]
        lines = day[1:]
        if len(lines) > 0:
            for line in lines:
Example #33
#f = open("polatics.txt", "r")
# current 500 articles
f = urllib2.urlopen("http://polatics.news/all").read().split('\n')
f.reverse()
f = f[0:400]

for i in f:
	print i 

print "line : %d" %(len(f))
f2 = open("polatics_out.txt", "w")

voca = {}

for line in f:
	for i in twitter.nouns(line):
		if i in voca:
			voca[i] += 1
		else:
			voca[i] = 1
	f2.write(str(twitter.nouns(line)))

#voca = sorted(voca.iteritems(), key=itemgetter(1), reverse=True)
voca =  sorted(voca.items(), key=operator.itemgetter(1), reverse=True) 

c = 0
ret = []
for k,v in voca:
	if len(k) > 1 and k != "단독":
		print k,v
		ret.append(k)
Example #34
def upload(request):
	if request.method == 'POST':
		if 'file' in request.FILES:
  			myUid = str(uuid.uuid4())

			dataChatroom = Chatroom(
				uid = myUid
			)
			dataChatroom.save()

			data = Chatroom.objects.get(uid=myUid) 
			chatroom_id = data.id

			file = request.FILES['file']
			filename = myUid		
			
			fp = open('%s/%s' % ("data", filename) , 'wb')
			for chunk in file.chunks():
				fp.write(chunk)
			fp.close()
			log_file = open('%s/%s' % ("data", filename) , 'r')
						
			messages = normalize( log_file )
			log_file.close()
			
			# delete the uploaded file
			os.remove('%s/%s' % ("data", filename))

			sender_list = set()
			send_ratio = {}
			msg_bytes = {}
			sent_time = {}
			sent_time = {}
			for i in range (0, 7) :
				sent_time[ i ] = {}
				for j in range(0,24) :
					sent_time[ i ][ j ] = 0	
			kcount = {}
			hcount = {}
			ucount = {}
			keywords = {}
			keywords_all = {}
			sent_month = ""
			temp_keywords = ""
			emoticons = 0
			total = 0
			last_sender = ""			
			intimacy = {}
			is_one_to_one = 0
			twitter = Twitter()
		
			for msg in messages :
				sender_list.add(msg.sender)

				# to calculate intimacy between member
				if len(last_sender) == 0 :
					last_sender = msg.sender
				if last_sender != msg.sender :
					td_increment( intimacy, last_sender, msg.sender, 1)
					td_increment( intimacy, msg.sender, last_sender, 1)
					last_sender = msg.sender
			
				# check send ratio.
				td_increment(send_ratio, str(msg.datetime)[:7], msg.sender, 1)
			
				# calculate msg bytes by sender
				td_increment(msg_bytes, str(msg.datetime)[:7], msg.sender, len(msg.contents))
				
				# count k in msg.
				increment(kcount, msg.sender, msg.contents.count(unicode('ㅋ','utf-8')))
				increment(hcount, msg.sender, msg.contents.count(unicode('ㅎ','utf-8')))
				increment(ucount, msg.sender, msg.contents.count(unicode('ㅠ','utf-8')))
			
				# count emoticons
				if "(emoticon)" in msg.contents or unicode('(이모티콘)', 'utf-8') in msg.contents:
					emoticons = emoticons + 1
			
				# calculate active time
				td_increment(sent_time, msg.datetime.weekday(), msg.datetime.time().hour, 1)
			
				# analyze keyword
				"""
				keywords_list = twitter.nouns(msg.contents)
				for keyword in keywords_list :
					if len(keyword) > 1:
						if ( is_msg_content(keyword) ):	
							td_increment(keywords_all, str(msg.datetime)[:7], keyword, 1)
							increment(keywords, keyword, 1)
				"""
				if len(sent_month) == 0 :
					sent_month = str(msg.datetime)[:7]
				
				if sent_month == str(msg.datetime)[:7] :
					temp_keywords = temp_keywords + " " + msg.contents 
				elif sent_month != str(msg.datetime)[:7] :
					keywords_list = twitter.nouns(temp_keywords)
					for keyword in keywords_list :
						if len(keyword) > 1:
							if ( is_msg_content(keyword) ) :
								td_increment(keywords_all, sent_month, keyword, 1)
								increment(keywords, keyword, 1)
					sent_month = str(msg.datetime)[:7]
					temp_keywords = msg.contents

			# the last month is not handled by the for loop above, so handle it once more here.
			keywords_list = twitter.nouns(temp_keywords)
			for keyword in keywords_list :
				if len(keyword) > 1:
					if ( is_msg_content(keyword) ) :
						td_increment(keywords_all, sent_month, keyword, 1)
						increment(keywords, keyword, 1)

			if len(sender_list) == 2 :
				response_time = {}
				last_sender = ""
				last_response_time = timedelta(0)

				for sender in sender_list :
					response_time[sender] = []
				for msg in messages :
					if len(last_sender) == 0 :
						last_sender = msg.sender
					if last_sender != msg.sender :
						last_sender = msg.sender
						response_time[msg.sender].append(msg.datetime - last_response_time)

					last_response_time = msg.datetime

			#insert frequency message & byte	
			for date in send_ratio :
				for sender in send_ratio[date] :
                                	dataMessage = FrequencyMessage(
                                		chatroom_id = chatroom_id,
						name = unicode(str(sender), 'utf-8').encode('utf-8'),
                                		date = date,
                                		count = int(send_ratio[date][sender]),
						bytes = int(msg_bytes[date][sender])
                        		)
                        		dataMessage.save()
			
			#insert all keywords
			cnt = 0
			for date in keywords_all :
				for keyword in keywords_all[date] :
					tasks.insert_keywords.delay(keyword, date, keywords_all[date][keyword])
					"""
					word = smart_str(keyword)
					cnt = cnt + 1
					getWordData = FrequencyWordAll.objects.filter(word=keyword, date=date)
					if getWordData.exists() :
						FrequencyWordAll.objects.filter(id=getWordData[0].id).update(count=F('count') + keywords_all[date][keyword])
					else :
						dataWordAll = FrequencyWordAll(
							date = date,
							word = word,
							count = int(keywords_all[date][keyword])
						)
						dataWordAll.save()
					"""
			#insert most keywords 20				
			sorted_keywords = sorted(keywords.items(), key=lambda x:x[1], reverse = True)
			for i in range(0,20) :
				try :
					word = smart_str(sorted_keywords[i][0])
					dataWord = FrequencyWord(
						chatroom_id = chatroom_id,
						word = word,
						count = int(sorted_keywords[i][1])
					)
					dataWord.save()
				except :
					pass
			
			#insert moment
			for week in sent_time :
				for hour in sent_time[week] :
					dateTime = FrequencyTime(
                                		chatroom_id = chatroom_id,
						week = int(week),
						hour = int(hour),
						count = int(sent_time[week][hour])
					)
					dateTime.save()
			if len(sender_list) == 2 :
				is_one_to_one = 1
				intimacy = {}
				for sender in response_time : 
					rt_average = sum(response_time[sender], timedelta()) / len(response_time[sender])
					td_increment( intimacy, sender, " ", rt_average.total_seconds())

			#insert intimacy
			for member in intimacy :
                                for friends in intimacy[member] :
					dataIntimacy = Intimacy(
                                		chatroom_id = chatroom_id,
						name = unicode(str(member), 'utf-8').encode('utf-8'),
						target = unicode(str(friends), 'utf-8').encode('utf-8'),
						count = int(intimacy[member][friends])
					)
                                	dataIntimacy.save()


			#insert each char count
			for sender in kcount :
				dataChar = FrequencyChars(
                                	chatroom_id = chatroom_id,
					name = unicode(str(sender), 'utf-8').encode('utf-8')
                                )
				try :
					dataChar.count_char_1 = int(kcount[sender])
				except :
					pass
				try :
                                        dataChar.count_char_2 = int(hcount[sender])
                                except :
                                        pass
				try :
                                        dataChar.count_char_3 = int(ucount[sender])
                                except :
                                        pass

                                dataChar.save()

			Chatroom.objects.filter(id=chatroom_id).update(complete_datetime=datetime.datetime.now(), is_one_to_one=is_one_to_one)
			return HttpResponse(myUid)
	return HttpResponse('Failed to Upload File')
Example #35
#plt.rc('font', family='AppleGothic')
import seaborn as sns

# first commit
# second commit
# third commit
# fourth commit

x_data = np.array([
    '영희가 사랑하는 강아지 백구를 산책시키고 있다.', '철수가 사랑하는 소 누렁이를 운동시키고 있다.',
    '영희와 철수는 소와 강아지를 산책 및 운동시키고 있다.'
])

twitter = Twitter()
for i, document in enumerate(x_data):
    nouns = twitter.nouns(document)
    x_data[i] = ' '.join(nouns)
print(x_data)  #['영희 사랑 강아지 백구 산책', '철수 사랑 소 누렁이 운동', '영희 철수 소 강아지 산책 및 운동']

vect = TfidfVectorizer()

x_data = vect.fit_transform(x_data)

cosine_similarity_matrix = (x_data * x_data.T)
print(cosine_similarity_matrix.shape)  #(3, 3)
print(cosine_similarity_matrix)
'''
  (0, 1)    0.19212485958220318
  (0, 2)    0.5605318467638107
  (0, 0)    0.9999999999999999
  (1, 2)    0.4113054999991637
Example #36
tw = Twitter()
# use the Twitter morphological analyzer
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

# top n words (nouns) to look for
_toFind_ = 30

# read the document
doc_ko = open('./k_tex2.txt').read()

# print(doc_ko)

# extract only the nouns
token_ko = tw.nouns(doc_ko)

# for use with nltk
res_ko = nltk.Text(token_ko, name=u'sutuk1')



print(len(res_ko.tokens))       # returns number of tokens (document length)
print(len(set(res_ko.tokens)))  # returns number of unique tokens
on_list = res_ko.vocab().most_common(_toFind_)

# on_list is a list: most_common returns a list of tuples, where each tuple holds the unicode string as its first element (index 0) and how many times it occurred as its second (index 1)

# print(list(on_list[1])[0])  test code