from konlpy.tag import Kkma, Twitter


class AnalysisDiction:
    """
    This class analyzes Korean text using the Kkma and Twitter dictionaries.
    """
    def __init__(self, on_kkma=False, on_twitter=False):  # maybe move to init of analysis_app
        """
        Allocate a Kkma or Twitter analyzer instance.
        :param on_kkma: if True, create a Kkma instance
        :param on_twitter: if True, create a Twitter instance
        """
        if on_kkma is True:
            self.kkma = Kkma()
        if on_twitter is True:
            self.twitter = Twitter()

    def analyzer_kkma(self, string_data, mode):
        """
        Analyze a string with Kkma; behavior depends on the mode.
        :param string_data: string data to analyze
        :param mode: one of 'morphs', 'nouns', or 'pos'
        :return: the analysis result, or False if the mode is not recognized
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._kkma
        """
        if mode == 'morphs':
            return self.kkma.morphs(string_data)
        elif mode == 'nouns':
            return self.kkma.nouns(string_data)
        elif mode == 'pos':
            return self.kkma.pos(string_data)
        else:
            return False

    def analyzer_twitter(self, string_data, mode):
        """
        Analyze a string with Twitter; behavior depends on the mode.
        :param string_data: string data to analyze
        :param mode: one of 'morphs', 'nouns', 'pos', or 'posmore'
        :return: the analysis result, or False if the mode is not recognized
        ref: http://konlpy.org/ko/v0.4.4/api/konlpy.tag/#module-konlpy.tag._twitter
        """
        if mode == 'morphs':
            return self.twitter.morphs(string_data)
        elif mode == 'nouns':
            return self.twitter.nouns(string_data)
        elif mode == 'pos':
            return self.twitter.pos(string_data)
        elif mode == 'posmore':
            return self.twitter.pos(string_data, True, True)
        else:
            return False
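# A minimal usage sketch for the class above (not part of the original snippet;
# assumes konlpy and a JVM are available, and the sample sentence is illustrative only).
if __name__ == '__main__':
    analyzer = AnalysisDiction(on_kkma=True, on_twitter=True)
    sample = '한국어 형태소 분석기를 테스트합니다.'
    print(analyzer.analyzer_kkma(sample, 'nouns'))       # noun extraction via Kkma
    print(analyzer.analyzer_twitter(sample, 'pos'))      # POS tagging via Twitter
    print(analyzer.analyzer_twitter(sample, 'unknown'))  # unrecognized mode -> False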
def get_noun(self):
    print("[*] 명사 추출 시작")
    start_time = time.time()

    twitter = Twitter()
    for s in self.word_list:
        temp = twitter.nouns(s)
        for t in temp:
            self.noun_list.append(str(t))

    end_time = time.time()
    print("[*] 명사 추출 완료(소요시간 : {0})".format(str(end_time - start_time)))
    print("[*] 추출된 명사 길이 : {0}".format(str(len(self.noun_list))))

    # frequency analysis
    count = Counter(self.noun_list)
    # tag = count.most_common(int(len(count) * (15 / 100)))
    tag = count.most_common(50)

    taglist = pytagcloud.make_tags(tag, maxsize=100)
    pytagcloud.create_tag_image(taglist, 'wordcloud.jpg', size=(800, 600),
                                fontname='Nanum Gothic Coding', rectangular=False)
def keywords_function(result):
    # keyword extraction: https://dalulu.tistory.com/108
    try:
        nlpy = Twitter()
        nouns = nlpy.nouns(result)
        count = Counter(nouns)

        tag_count = []
        tags = []

        for n, c in count.most_common(200):
            dics = {'tag': n, 'count': c}
            if len(dics['tag']) >= 2 and len(tags) <= 49:
                tag_count.append(dics)
                tags.append((dics['tag'], dics['count']))
        return tags
    except Exception:
        return []
def get_tags(text, max_count, min_length):
    t = Twitter()
    if max_count is None or min_length is None:
        max_count = 20
        min_length = 1
    nouns = t.nouns(text)
    processed = [noun for noun in nouns if len(noun) >= min_length]
    count = Counter(processed)
    result = {}
    for n, c in count.most_common(max_count):
        result[n] = c
    if len(result) == 0:
        result["내용이 없습니다."] = 1
    return result
def get_tags(text, max_count, min_length):
    t = Twitter()
    # Extract only the Korean nouns.
    nouns = t.nouns(text)
    # Preprocess the data.
    processed = [n for n in nouns if len(n) >= min_length]
    count = Counter(processed)
    result = {}
    # Keep up to max_count of the most frequent words.
    for n, c in count.most_common(max_count):
        result[n] = c
    # If not a single word was extracted:
    if len(result) == 0:
        result["내용이 없습니다."] = 1
    return result
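# A minimal usage sketch for the get_tags variant above (not part of the original
# snippet; the sample text is illustrative and assumes konlpy and a JVM are available).
if __name__ == '__main__':
    sample_text = '영희가 사랑하는 강아지 백구를 산책시키고 있다. 철수가 사랑하는 소 누렁이를 운동시키고 있다.'
    tags = get_tags(sample_text, max_count=10, min_length=2)
    for word, freq in tags.items():
        print(word, freq)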
def make_korean():
    global background, stopwords
    text = open(name, encoding='utf-8').read()
    t = Twitter()
    token = t.nouns(text)
    count_voca = nltk.Text(token, name="단어카운팅")
    count_voca.vocab()
    voca = count_voca.vocab().most_common(150)
    font = "/usr/share/fonts/NanumFont/NanumGothicBold.ttf"
    wordcloud = WordCloud(font_path=font,
                          max_words=2000,
                          relative_scaling=0.2,
                          background_color="white",
                          mask=background).generate_from_frequencies(dict(voca))
    plt.figure(figsize=(12, 12))
    plt.imshow(wordcloud, interpolation='bilinear')
    # plt.imshow(wordcloud.recolor(color_func=grey_color, random_state=3), interpolation='bilinear')
    plt.axis('off')
    plt.show()
def make_wordcloud(cmt_set):
    print("before nlp")
    nlp = Twitter()
    print("nlp = Twitter()")
    nouns = nlp.nouns(cmt_set)
    print("nouns = nlp.nouns(buff)")
    count = Counter(nouns)
    print("count = Counter(nouns)")

    word_cloud = WordCloud(
        font_path='./HoonWhitecatR.ttf',
        max_words=20,
        font_step=5,
        mode='RGBA',
        background_color=None).generate_from_frequencies(count)
    print(word_cloud.words_)

    # Work around a memory error: 'nlp' fails when it is reused more than once.
    del (nlp, nouns, count)

    return word_cloud  # TODO: save the image and return its location
def main():
    df = pd.read_csv(r'C:\Users\Hajin_2\python\python\BigData_prj\news.csv')
    titles = ""
    for i in df.title:
        titles = titles + i

    nlp = Twitter()
    nouns = nlp.nouns(titles)
    count = Counter(nouns)

    wordInfo = dict()
    for tags, counts in count.most_common(50):
        if len(str(tags)) > 1:
            wordInfo[tags] = counts
            print("%s : %d" % (tags, counts))

    showGraph(wordInfo)
    wordCloud(wordInfo)
def lexranker(text, code, date):
    text = text.replace('\\n', '.')
    text2 = re.sub('[^가-힣0-9a-zA-Z\\s\\.]', '', text)
    lexrank = LexRank()
    # print(text2)
    lexrank.summarize(text2)
    summaries = lexrank.probe(3)

    word = Twitter()
    out = []
    print(summaries)
    for summary in summaries:
        out += word.nouns(summary)
    word = list(set(out))

    share = Share(code)
    startprice = share.get_open()
    endprice = share.get_price()
    for part in word:
        save_record(part, code, startprice, endprice, date)
def main():
    openFileName = 'KakaoTalk'
    cloudImagePath = openFileName + '.jpg'

    rfile = open('KakaoTalk.txt', 'rt', encoding='UTF8')
    message = rfile.read()

    # [CODE 4]
    nlp = Twitter()
    nouns = nlp.nouns(message)
    count = Counter(nouns)

    # [CODE 5]
    wordInfo = dict()
    for tags, counts in count.most_common(50):
        if len(str(tags)) > 1:
            wordInfo[tags] = counts
            print("%s : %d" % (tags, counts))

    showGraph(wordInfo)
    saveWordCloud(wordInfo, cloudImagePath)
def get_tags(self, text, date, ntags=100):
    spliter = Twitter()          # konlpy Twitter object
    frequency = {}
    nouns = spliter.nouns(text)  # extract only the nouns from the text
    match_pattern = re.findall(r'\b[a-zA-Z]{3,15}\b', text)
    count = Counter(nouns)       # build a Counter over the extracted nouns
    return_list = []             # holds the noun frequencies

    for word in match_pattern:
        count1 = frequency.get(word, 0)
        frequency[word] = count1 + 1
    frequency_list = frequency.keys()
    for words in frequency_list:
        temp_en = {'tag': words, 'date': date}
        return_list.append(temp_en)

    for n, c in count.most_common(ntags):
        temp_ko = {'tag': n, 'date': date}
        return_list.append(temp_ko)
    return return_list
def get_tags(open_text_file):
    nlp = Twitter()
    nouns_list = []
    token_list = []
    i = 0
    for line in open_text_file:
        # for line in tqdm(open_text_file):
        print(line)
        text = line
        text = regex.sub(u"[\n]", " ", text)
        n = nlp.nouns(text)
        token = nlp.morphs(text)
        for value in n:
            nouns_list.append(value)
        for j in token:
            token_list.append(j)
        # if i == 400:
        #     break
        # else:
        #     i += 1
    return nouns_list, token_list
class SentenceTokenizer(object):
    def __init__(self):
        self.kkma = Kkma()
        self.twitter = Twitter()
        self.stopwords = ['중인', '만큼', '마찬가지', '꼬집었', "연합뉴스", "데일리", "동아일보",
                          "중앙일보", "조선일보", "기자", "아", "휴", "아이구", "아이쿠", "아이고",
                          "어", "나", "우리", "저희", "따라", "의해", "을", "를", "에", "의", "가",
                          "JTBC"]

    # Take a URL, extract the article body, and return its sentences.
    def url2sentences(self, url):
        article = Article(url, language='ko')
        article.download()
        article.parse()
        sentences = self.kkma.sentences(article.text)

        for idx in range(0, len(sentences)):
            if len(sentences[idx]) <= 10:
                sentences[idx - 1] += (' ' + sentences[idx])
                sentences[idx] = ''

        return sentences

    # Take raw text and return its sentences.
    def text2sentences(self, text):
        sentences = self.kkma.sentences(text)
        for idx in range(0, len(sentences)):
            if len(sentences[idx]) <= 10:
                sentences[idx - 1] += (' ' + sentences[idx])
                sentences[idx] = ''

        return sentences

    # Take sentences and return their nouns.
    def get_nouns(self, sentences):
        nouns = []
        for sentence in sentences:
            if sentence != '':
                nouns.append(' '.join([noun for noun in self.twitter.nouns(str(sentence))
                                       if noun not in self.stopwords and len(noun) > 1]))

        return nouns
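# A minimal usage sketch for the SentenceTokenizer above (not part of the original
# snippet; the sample text is illustrative and assumes konlpy is available).
if __name__ == '__main__':
    tokenizer = SentenceTokenizer()
    sentences = tokenizer.text2sentences(
        '영희가 사랑하는 강아지 백구를 산책시키고 있다. 철수가 사랑하는 소 누렁이를 운동시키고 있다.')
    print(tokenizer.get_nouns(sentences))  # space-joined nouns per sentence, stop words removed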
def main():
    # kkma = Kkma()
    # hannanum = Hannanum()
    # mecab = Mecab()
    twitter = Twitter()

    sentences = []
    words = []
    with open('result') as f:
        for line in f:
            sentences.append(line)

    for s in sentences:
        words.extend(twitter.nouns(s))
        # words.extend(mecab.nouns(s))
        # words.extend(kkma.nouns(s))
        # words.extend(hannanum.nouns(s))

    od = OrderedDict(Counter(words).most_common(30))
    for k, v in od.items():
        if k in IGNORE_WORDS:
            continue
        print(k, '\t', v)
class MakeWordCloud:
    def __init__(self, data_list):
        self.category_list = data_list.keys()
        self.word_list = data_list
        self.nlp = Twitter()
        self.cnt = 0

    def make_Word_Cloud(self):
        for data in self.word_list:
            nouns = self.nlp.nouns(self.word_list.get(data))
            nouns = [x for x in nouns if len(x) != 1]
            count = Counter(nouns)
            tags = count.most_common(40)
            taglist = pytagcloud.make_tags(tags, minsize=15, maxsize=50)
            fileName = './img/wordcloud' + str(self.cnt) + '.jpg'
            pytagcloud.create_tag_image(taglist, fileName, size=(600, 600),
                                        fontname='korean', rectangular=True)
            self.cnt += 1
def get_tags(text, ntags):
    spliter = Twitter()          # konlpy Twitter object
    nouns = spliter.nouns(text)  # extract only the nouns from the text
    count = Counter(nouns)       # build a Counter over the extracted nouns
    return_list = []             # holds the noun frequencies
    # nouns to drop
    value_to_remove = ["곳", "수", "것", "맛", "때", "정말", "더", "저", "진짜", "바로",
                       "이", "여기", "날", "꼭", "안", "거", "그", "또", "저희", "정도",
                       "제", "좀", "타고", "요", "보고", "그냥", "중", "때문", "조금",
                       "아주", "다른", "듯", "쪽", "등", "이번", "내", "총", "전", "나",
                       "속"]

    # most_common(n) returns the n most frequent nouns in descending order of
    # frequency; each noun and its count is appended to return_list.
    for n, c in count.most_common(ntags):
        if n not in value_to_remove:
            temp = {'tag': n, 'count': c}
            return_list.append(temp)
    return return_list
def keywords_extract(filename):
    f = codecs.open('./reviews/' + filename, "r", "utf-8")
    data = f.read()
    nlp = Twitter()
    nouns = nlp.nouns(data)

    # Filter out stop words with a comprehension instead of removing items
    # while iterating, which would skip elements.
    nouns = [n for n in nouns if n not in dumb]

    count = Counter(nouns)
    words = count.most_common(40)
    keyword = words[0:3]
    for i in keyword:
        word_freq_bid = [i[0], i[1], filename[:-4]]
        keywords.append(word_freq_bid)
    f.close()
    return keywords
def result(request):
    text = parse(request.GET['keyword'])
    words = text.split()

    nlpy = Twitter()
    nouns = nlpy.nouns(text)
    count = Counter(nouns)

    tag_count = []
    tags = []

    for n, c in count.most_common(100):
        dics = {'tag': n, 'count': c}
        if len(dics['tag']) >= 2 and len(tags) <= 49:
            tag_count.append(dics)
            tags.append(dics['tag'])

    return render(request, 'result.html', {
        'keyword': request.GET['keyword'],
        'full': text,
        'wordCount': len(words),
        'tags': tag_count
    })
def tag_counting():
    nlpy = Twitter()
    nouns = nlpy.nouns(
        u'대학에서 DB, 통계학, 이산수학 등을 배웠지만..., 대학, 대학, DB, DB, 통계학, 배웠지만, DB, Db, 바보, 바보'
    )
    count = Counter(nouns)
    print(nouns)

    tag_count = []
    tags = []

    for n, c in count.most_common(100):
        dics = {'tag': n, 'count': c}
        if len(dics['tag']) >= 2 and len(tags) <= 49:
            tag_count.append(dics)
            tags.append(dics['tag'])

    for tag in tag_count:
        print(" {:<14}".format(tag['tag']), end='\t')
        print("{}".format(tag['count']))

    print("\n---------------------------------")
    print(" 명사 총 {}개".format(len(tags)))
    print("---------------------------------\n\n")
    return tags
def get_tags(text, ntags=100, multiplier=1):
    '''Build the tags, their frequencies, and their colors for drawing a word cloud.

    text.......: text data as a list or tuple
    ntags......: number of tags; defaults to 100
    multiplier.: user-chosen constant for relative size; defaults to 1
    Return.....: list of [color, tag, size] entries
    '''
    t = Twitter()  # Twitter morphological analyzer
    nouns = []     # list that will hold only the nouns from the analysis
    for sentence in text:               # the text comes in as a list or tuple
        for noun in t.nouns(sentence):  # analyze each sentence and keep only the nouns
            nouns.append(noun)
            # if noun == '영화':  # use this branch to drop a word expected to occur
            #     pass            # too often, e.g. '영화'
            # else:
            #     nouns.append(noun)
    count = Counter(nouns)  # count how often each distinct noun occurs in nouns
    # n = tag (a word included in the word cloud)
    # c = frequency of that tag
    # the size is c multiplied by multiplier
    return [{'color': color(), 'tag': n, 'size': c * multiplier}
            for n, c in count.most_common(ntags)]
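# A minimal usage sketch for the word-cloud get_tags above (not part of the original
# snippet; assumes the color() helper is defined elsewhere in the original script,
# and the sentences are illustrative only).
if __name__ == '__main__':
    reviews = ['영희가 사랑하는 강아지 백구를 산책시키고 있다.',
               '철수가 사랑하는 소 누렁이를 운동시키고 있다.']
    tags = get_tags(reviews, ntags=20, multiplier=2)
    for item in tags:
        print(item['tag'], item['size'])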
class SentenceTokenizer(object):
    def __init__(self):
        self.kkma = Kkma()
        self.twitter = Twitter()
        # stop words
        self.stopwords = ['무단전재', '세계일보', '바로가기', '국민일보', '기자', '를', '본문']

    def t2s(self, text):
        sentences = self.kkma.sentences(text)  # extract sentences from the text
        for idx in range(0, len(sentences)):
            if len(sentences[idx]) <= 10:
                sentences[idx - 1] += (' ' + sentences[idx])
                sentences[idx] = ''
        return sentences

    def get_nouns(self, sentences):  # extract the nouns
        nouns = []
        for s in sentences:
            if s != '':
                nouns.append(''.join([
                    noun for noun in self.twitter.nouns(str(s))
                    if noun not in self.stopwords and len(noun) > 1
                ]))
        return nouns
def extract_lda_noun(tmp_lda, category):
    lda_list = []
    for x in tmp_lda:
        for i in x:
            lda_list.append(i)
    counter = collections.Counter(lda_list)
    print(counter.most_common())
    counts_k = counter.most_common()
    topicnouns_k = [counts_k[i][0] for i in range(len(counts_k))]

    tw = Twitter()
    hannanum = Hannanum()
    kkma = Kkma()
    cnouns = []
    for i in range(len(topicnouns_k)):
        t = tw.nouns(topicnouns_k[i])
        h = hannanum.nouns(topicnouns_k[i])
        k = kkma.nouns(topicnouns_k[i])
        if h != [] and k != [] and t != []:
            if set(h) == set(h).intersection(set(k), set(t)):
                cnouns += h
                print(h, k, t)
            else:
                print('not in list', h, k, t)

    df = pd.DataFrame(cnouns)
    df.columns = ['noun']
    df['label'] = np.zeros(len(cnouns))
    df.to_csv('{}_lda_noun_result_no_label.csv'.format(category), header=True, encoding='cp949')
class SentenceTokenizer(object):
    def __init__(self):
        self.twitter = Twitter()
        self.stopwords = [
            '중인', '만큼', '마찬가지', '꼬집었', "연합뉴스", "데일리", "동아일보", "중앙일보",
            "조선일보", "기자", "아", "휴", "아이구", "아이쿠", "아이고", "어", "나", "우리",
            "저희", "따라", "의해", "을", "를", "에", "의", "가", "억원", "원장", "때문", "가",
            "@", "권혜민", "이유지", "인턴", "측은", "중앙", "대해", "누가", "지금", "수만", "반면"
        ]

    def url2sentences(self, url):
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text, 'lxml')
        daum3 = soup.select("div > section")
        naver = soup.findAll("div", id="articleBodyContents")
        naver_enter = soup.findAll("div", id="articeBody")
        naver_sports = soup.findAll("div", id="newsEndContents")
        daum_t = soup.select(".head_view > .tit_view")
        naver_t = soup.select("div > #articleTitle")
        navere_t = soup.select(".end_tit")
        navers_t = soup.select(".news_headline > .title")

        self.origin_text = []
        text = ''
        sentences = []
        temp = []
        temp2 = []
        self.title = []

        for sent in daum_t:
            self.title = sent.text
        for sent in naver_t:
            self.title = sent.text
        for sent in navere_t:
            self.title = sent.text
        for sent in navers_t:
            self.title = sent.text

        for sent in daum3:
            for unused in soup.select("figcaption"):
                unused.decompose()
            text = sent.text
            temp2.extend(text.split(". "))
        for sent in temp2:
            temp.extend(sent.split("\n"))

        for sent in naver:
            for unused in soup.select("td > font"):
                unused.decompose()
            for unused in soup.findAll("a"):
                unused.decompose()
            for unused in soup.findAll("script"):
                unused.decompose()
            for unused in soup.findAll("span"):
                unused.replace_with('')
            for unused in soup.findAll("p"):
                unused.decompose()
            for unused in soup.findAll("strong"):
                unused.decompose()
            for unused in soup.findAll("br"):
                unused.replace_with('. ')
            text = sent.get_text()
            temp.extend(text.split('. '))

        for sent in naver_enter:
            for unused in soup.findAll("a"):
                unused.decompose()
            for unused in soup.findAll("script"):
                unused.decompose()
            for unused in soup.findAll("span"):
                unused.replace_with('')
            for unused in soup.findAll("p"):
                unused.decompose()
            for unused in soup.findAll("br"):
                unused.replace_with('. ')
            text = sent.get_text()
            temp.extend(text.split('. '))

        for sent in naver_sports:
            for unused in soup.findAll("a"):
                unused.decompose()
            for unused in soup.findAll("script"):
                unused.decompose()
            for unused in soup.findAll("span"):
                unused.replace_with('')
            for unused in soup.findAll("p"):
                unused.decompose()
            for unused in soup.findAll("br"):
                unused.replace_with('. ')
            text = sent.get_text()
            temp.extend(text.split('. '))

        sentences = self.makeSentences(temp)
        return sentences

    def text2sentences(self, text):
        self.origin_text = []
        self.title = ''
        jpype.attachThreadToJVM()
        temp_1 = text.split(". ")
        temp_2 = []
        temp_3 = []
        for sent in temp_1:
            temp_2 = sent.split("\r\n\r\n")
            temp_3.extend(temp_2)
        print(temp_3)
        sentences = self.makeSentences(temp_3)
        return sentences

    def makeSentences(self, new_temp):
        idx_r = []
        a = 0
        b = -1
        quotes = []
        temp = []
        new_string = ""

        for idx in range(len(new_temp)):
            if (new_temp[idx].count('\"') + new_temp[idx].count('“') + new_temp[idx].count('”')) % 2 == 1:
                quotes.append(idx)

        for idx in range(len(new_temp)):
            if len(quotes) > 1:
                if idx < quotes[0]:
                    if len(new_temp[idx]) > 0 and new_temp[idx][-1] == '다':
                        new_temp[idx] += "."
                    temp.append(new_temp[idx])
                elif idx >= quotes[0] and idx < quotes[1]:
                    new_string += new_temp[idx]
                    new_string += ". "
                else:
                    new_string += new_temp[idx]
                    if new_string[-1] == '다':
                        new_string += "."
                    temp.append(new_string)
                    new_string = ""
                    quotes.pop(0)
                    quotes.pop(0)
            else:
                if len(new_temp[idx]) > 0 and new_temp[idx][-1] == '다':
                    new_temp[idx] += "."
                temp.append(new_temp[idx])

        for sent in temp:
            self.origin_text.append(sent)

        for idx in range(0, len(temp)):
            if not re.findall(regex, temp[idx]):
                idx_r.append(idx - a)
                a += 1
        for idx in idx_r:
            temp.pop(idx)

        sentences = temp
        for s in sentences[:]:
            if "@" in s:
                sentences.remove(s)
        for idx in sentences[:]:
            if len(idx) > 0:
                if idx[-1] != '.' or idx[len(idx) - 2] != '다':
                    sentences.remove(idx)

        return sentences

    def get_nouns(self, sentences):
        nouns = []
        jpype.attachThreadToJVM()
        for sentence in sentences:
            if sentence != '':
                nouns.append(' '.join([
                    noun for noun in self.twitter.nouns(str(sentence))
                    if noun not in self.stopwords and len(noun) > 1
                ]))
        return nouns
    more2 = 1
else:
    more2 = 0

# replace the periods that appear in the middle with spaces
first = document.split('.')
document2 = ''
for ff in first:
    document2 = document2 + ' ' + ff

# split on whitespace
split = document2.split()
i = 0
# after splitting, merge the nouns found at the same position
for word in split:
    pos = t.nouns(split[i])
    for word2 in pos:
        Compound = Compound + word2
    more = 1
    while more == 1:
        pos2 = Twitter().pos(Compound)
        for poss in pos2:
            if poss[1] != 'Noun':
                more = 2
        pos3 = Twitter().nouns(Compound)
        Compound = ''
        for word3 in pos3:
            Compound = Compound + word3
        if more == 2:
            more = 1
        else:
# 1. Open the comment file crawled in the previous post, read-only
file = open('./mujung.txt', 'r', encoding='utf-8')
lines = file.readlines()

# 2. Store the full set of comments in the variable okja
okja = []
for line in lines:
    okja.append(line)
file.close()

twitter = Twitter()

# 4. Run morphological analysis on each sentence
sentences_tag = []
for sentence in okja:
    morph = twitter.nouns(sentence)
    sentences_tag.append(morph)
    print(morph)
    print('-' * 30)

print(sentences_tag)
print(len(sentences_tag))
print('\n' * 3)

# 5. Keep only the nouns and adjectives in a list
# sentence = []
# for sentence1 in sentences_tag:
#     for word, tag in sentence1:
#         if tag in ['Noun', 'Adjective']:
#             sentence.append(word)
    driver.implicitly_wait(2)
except:
    print("no such element(2)")

if more_cnt > 0:
    content = driver.find_elements_by_xpath('//*[@id="makingnotePhase"]')
    for i in content:
        lines += i.text
else:
    driver.implicitly_wait(1)

print(lines)

# extract keywords
nouns = []
keywords = []
for noun in twitter.nouns(lines):
    if len(noun) == 1 and noun in dic:
        nouns.append(noun)
    elif len(noun) > 1:
        nouns.append(noun)

count = Counter(nouns)
cnt_common = count.most_common(6)  # keep up to the 6 most frequent keywords

print("--------------------------")
cnt = 0
for a in cnt_common:
    cnt = cnt + 1
    if cnt == 1:
        keyword1 = a[0]
    elif cnt == 2:
        keyword2 = a[0]
cursor = db.cursor(MySQLdb.cursors.DictCursor)
cursor.execute("set names utf8")
db.query("set character_set_connection=utf8;")
db.query("set character_set_server=utf8;")
db.query("set character_set_client=utf8;")
db.query("set character_set_results=utf8;")
db.query("set character_set_database=utf8;")
cursor.execute("set names utf8")

sql = "select * from Text3 where ArticleNumber=10000"
cursor.execute(sql.encode('utf8'))
row = cursor.fetchone()
document = row['Content'].decode('utf8')

lda_model_path = "/home/ice-kms/LDAModel/iter_1000_Real_lda_10000_pass_100_topicNum_20.lda"
lda = LdaModel.load(lda_model_path)
dictionary_path = "/home/ice-kms/LDAModel/iter_1000_Real_articleDic_10000_compound_topicNum_20.dict"
dictionary = corpora.Dictionary.load(dictionary_path)
ldaModel = lda.show_topics(num_topics=20, num_words=5, formatted=False)

tokens_ko = t.nouns(document)
dicko = dictionary.doc2bow(tokens_ko)
documentTopic = lda[dicko]
print(documentTopic)
# -*- coding: utf-8 -*-
import operator
import codecs

with codecs.open("대통령_anger.txt", encoding='utf-8') as f:
    data = [line.split('\t') for line in f.read().splitlines()]

from konlpy.tag import Twitter
twitter = Twitter()

frequency = {}
for line in data:
    text = line[0]
    words = twitter.nouns(text)
    for word in words:
        count = frequency.get(word, 0)
        frequency[word] = count + 1

sorted_freq = sorted(frequency.items(), key=operator.itemgetter(1), reverse=True)

anger_frequency = open("대통령_anger_frequency.txt", 'w', -1, "utf-8")
for t in sorted_freq:
    anger_frequency.write(t[0] + '\t' + str(t[1]) + '\n')
anger_frequency.close()
word = []
for t in data:
    T = t.split(' ')
    for t2 in T:
        word.append(t2)
word

# nouns = [t.decode('utf-8') for t in word]
count = Counter(word)
count

hannanum = Hannanum()
nouns = hannanum.nouns(data)

nlp = Twitter()
nouns = nlp.nouns(data)

count = Counter(nouns)
tag2 = count.most_common(40)
taglist = pytagcloud.make_tags(tag2, maxsize=80)
pytagcloud.create_tag_image(taglist, 'wordcloud.jpg', size=(900, 600),
                            fontname='Nobile', rectangular=False)

#! /usr/bin/python2.7
# -*- coding: utf-8 -*-

from collections import Counter
import urllib
import random
def get_Noun(text, count=30):
    twit = Twitter()
    noun = twit.nouns(text)
    return noun
def analyzer(messages):
    # store senders in chat room
    sender_list = set()
    send_ratio = {}
    msg_bytes = {}
    sent_time = {}
    for i in range(0, 7):
        sent_time[i] = {}
        for j in range(0, 24):
            sent_time[i][j] = 0
    kcount = {}
    keywords = {}
    sent_month = ""
    temp_keywords = []
    emoticons = 0
    total = 0
    last_sender = ""
    intimacy = {}
    twitter = Twitter()

    for msg in messages:
        sender_list.add(msg.sender)

        # to calculate intimacy between members
        if len(last_sender) == 0:
            last_sender = msg.sender
        if last_sender != msg.sender:
            td_increment(intimacy, last_sender, msg.sender, 1)
            td_increment(intimacy, msg.sender, last_sender, 1)
            last_sender = msg.sender

        # check send ratio.
        td_increment(send_ratio, str(msg.datetime)[:7], msg.sender, 1)

        # calculate msg bytes by sender
        td_increment(msg_bytes, str(msg.datetime)[:7], msg.sender, len(msg.contents))

        # count k in msg.
        increment(kcount, msg.sender, msg.contents.count(unicode('ㅋ', 'utf-8')))

        # count emoticons
        if "(emoticon)" in msg.contents or unicode('(이모티콘)', 'utf-8') in msg.contents:
            emoticons = emoticons + 1

        # calculate active time
        td_increment(sent_time, msg.datetime.weekday(), msg.datetime.time().hour, 1)

        # analyze keywords
        if is_msg_content(msg.contents):
            if len(sent_month) == 0:
                sent_month = str(msg.datetime)[:7]
            elif sent_month == str(msg.datetime)[:7]:
                temp_keywords.append(msg.contents)
            elif sent_month != str(msg.datetime)[:7]:
                keywords_list = twitter.nouns(msg.contents)
                for keyword in keywords_list:
                    if len(keyword) > 1:
                        td_increment(keywords, sent_month, keyword, 1)
                sent_month = str(msg.datetime)[:7]
                del temp_keywords[:]
                temp_keywords.append(msg.contents)

    # in case of a 1:1 chat room
    if len(sender_list) == 2:
        response_time = {}
        last_sender = ""
        last_response_time = timedelta(0)
        for sender in sender_list:
            response_time[sender] = []
        for msg in messages:
            if len(last_sender) == 0:
                last_sender = msg.sender
            if last_sender != msg.sender:
                last_sender = msg.sender
                response_time[msg.sender].append(msg.datetime - last_response_time)
            last_response_time = msg.datetime

    print "Who sent how many messages?"
    for date in send_ratio:
        print "in " + str(date)
        for sender in send_ratio[date]:
            print str(sender) + " sent " + str(send_ratio[date][sender]) + " messages"
            total = total + int(send_ratio[date][sender])
        print ""

    print "Msg bytes : "
    for date in msg_bytes:
        print "in " + str(date)
        for sender in msg_bytes[date]:
            print str(sender) + " sent " + str(msg_bytes[date][sender]) + " bytes"
        print ""

    for sender in kcount:
        print sender + " wrote " + unicode('ㅋ', 'utf-8').encode('utf-8') + " " + str(kcount[sender]) + " byte times"
        print ""
    print ""

    # sorted keywords has 'list' type, not dict.
    print "Top 20 most frequently used keywords in your chatroom."
    for date in keywords:
        print "in " + date
        sorted_keywords = sorted(keywords[date].items(), key=lambda x: x[1], reverse=True)
        for i in range(0, 20):
            try:
                print sorted_keywords[i][0] + " : " + str(sorted_keywords[i][1])
            except:
                pass
        print ""

    print "When is the most active moment in this chat room?"
    for week in sent_time:
        print week
        for hour in sorted(sent_time[week]):
            print str(sent_time[week][hour]) + " messages were sent at " + str(hour) + " o'clock"
        print ""

    print "you guys used emoticons " + str(emoticons) + " times"
    print ""

    print "intimacy between members"
    if len(sender_list) == 2:
        for sender in response_time:
            print sender
            rt_average = sum(response_time[sender], timedelta()) / len(response_time[sender])
            print "responded in " + str(rt_average) + " in average"
    else:
        for member in intimacy:
            print member + " : "
            for friends in intimacy[member]:
                print " - " + friends + " " + str(intimacy[member][friends])
    print ""
    print "totally, " + str(total) + " messages were sent"
for day in sad_date:
    if day[0] == (dt.month, dt.day):
        day.append(line)
        break

n_dates = len(anger_date)
for i in range(n_dates):
    EE = cEmotifyElement()

    day = anger_date[i]
    lines = day[1:]
    if len(lines) > 0:
        for line in lines:
            n_tweets = 1 + int(line[1])
            EE.modAnger(EE.Anger + n_tweets)
            text = line[0]
            words = twitter.nouns(text)
            for word in words:
                if len(word) > 1:
                    count = EE.AngerKeyword.get(word, 0)
                    EE.addAnger(word, count + n_tweets)
                    lst = anger_tweets.get(word, [])
                    if len(lst) < 5:
                        if lst_name in text:
                            if text not in lst:
                                lst.append(text)
                                anger_tweets[word] = lst

    day = fear_date[i]
    lines = day[1:]
    if len(lines) > 0:
        for line in lines:
# f = open("polatics.txt", "r")
# current 500 articles
f = urllib2.urlopen("http://polatics.news/all").read().split('\n')
f.reverse()
f = f[0:400]

for i in f:
    print i
print "line : %d" % (len(f))

f2 = open("polatics_out.txt", "w")

voca = {}
for line in f:
    for i in twitter.nouns(line):
        if i in voca:
            voca[i] += 1
        else:
            voca[i] = 0
    f2.write(str(twitter.nouns(line)))

# voca = sorted(voca.iteritems(), key=itemgetter(1), reverse=True)
voca = sorted(voca.items(), key=operator.itemgetter(1), reverse=True)

c = 0
ret = []
for k, v in voca:
    if len(k) > 1 and k != "단독":
        print k, v
        ret.append(k)
def upload(request):
    if request.method == 'POST':
        if 'file' in request.FILES:
            myUid = str(uuid.uuid4())
            dataChatroom = Chatroom(uid=myUid)
            dataChatroom.save()
            data = Chatroom.objects.get(uid=myUid)
            chatroom_id = data.id

            file = request.FILES['file']
            filename = myUid

            fp = open('%s/%s' % ("data", filename), 'wb')
            for chunk in file.chunks():
                fp.write(chunk)
            fp.close()

            log_file = open('%s/%s' % ("data", filename), 'r')
            messages = normalize(log_file)
            log_file.close()

            # delete the temporary file
            os.remove('%s/%s' % ("data", filename))

            sender_list = set()
            send_ratio = {}
            msg_bytes = {}
            sent_time = {}
            for i in range(0, 7):
                sent_time[i] = {}
                for j in range(0, 24):
                    sent_time[i][j] = 0
            kcount = {}
            hcount = {}
            ucount = {}
            keywords = {}
            keywords_all = {}
            sent_month = ""
            temp_keywords = ""
            emoticons = 0
            total = 0
            last_sender = ""
            intimacy = {}
            is_one_to_one = 0
            twitter = Twitter()

            for msg in messages:
                sender_list.add(msg.sender)

                # to calculate intimacy between members
                if len(last_sender) == 0:
                    last_sender = msg.sender
                if last_sender != msg.sender:
                    td_increment(intimacy, last_sender, msg.sender, 1)
                    td_increment(intimacy, msg.sender, last_sender, 1)
                    last_sender = msg.sender

                # check send ratio.
                td_increment(send_ratio, str(msg.datetime)[:7], msg.sender, 1)

                # calculate msg bytes by sender
                td_increment(msg_bytes, str(msg.datetime)[:7], msg.sender, len(msg.contents))

                # count k in msg.
                increment(kcount, msg.sender, msg.contents.count(unicode('ㅋ', 'utf-8')))
                increment(hcount, msg.sender, msg.contents.count(unicode('ㅎ', 'utf-8')))
                increment(ucount, msg.sender, msg.contents.count(unicode('ㅠ', 'utf-8')))

                # count emoticons
                if "(emoticon)" in msg.contents or unicode('(이모티콘)', 'utf-8') in msg.contents:
                    emoticons = emoticons + 1

                # calculate active time
                td_increment(sent_time, msg.datetime.weekday(), msg.datetime.time().hour, 1)

                # analyze keywords
                """
                keywords_list = twitter.nouns(msg.contents)
                for keyword in keywords_list :
                    if len(keyword) > 1:
                        if ( is_msg_content(keyword) ):
                            td_increment(keywords_all, str(msg.datetime)[:7], keyword, 1)
                            increment(keywords, keyword, 1)
                """
                if len(sent_month) == 0:
                    sent_month = str(msg.datetime)[:7]
                if sent_month == str(msg.datetime)[:7]:
                    temp_keywords = temp_keywords + " " + msg.contents
                elif sent_month != str(msg.datetime)[:7]:
                    keywords_list = twitter.nouns(temp_keywords)
                    for keyword in keywords_list:
                        if len(keyword) > 1:
                            if is_msg_content(keyword):
                                td_increment(keywords_all, sent_month, keyword, 1)
                                increment(keywords, keyword, 1)
                    sent_month = str(msg.datetime)[:7]
                    temp_keywords = msg.contents

            # The last month is not covered by the loop above, so handle it once more here.
            keywords_list = twitter.nouns(temp_keywords)
            for keyword in keywords_list:
                if len(keyword) > 1:
                    if is_msg_content(keyword):
                        td_increment(keywords_all, sent_month, keyword, 1)
                        increment(keywords, keyword, 1)

            if len(sender_list) == 2:
                response_time = {}
                last_sender = ""
                last_response_time = timedelta(0)
                for sender in sender_list:
                    response_time[sender] = []
                for msg in messages:
                    if len(last_sender) == 0:
                        last_sender = msg.sender
                    if last_sender != msg.sender:
                        last_sender = msg.sender
                        response_time[msg.sender].append(msg.datetime - last_response_time)
                    last_response_time = msg.datetime

            # insert frequency message & byte
            for date in send_ratio:
                for sender in send_ratio[date]:
                    dataMessage = FrequencyMessage(
                        chatroom_id=chatroom_id,
                        name=unicode(str(sender), 'utf-8').encode('utf-8'),
                        date=date,
                        count=int(send_ratio[date][sender]),
                        bytes=int(msg_bytes[date][sender])
                    )
                    dataMessage.save()

            # insert all keywords
            cnt = 0
            for date in keywords_all:
                for keyword in keywords_all[date]:
                    tasks.insert_keywords.delay(keyword, date, keywords_all[date][keyword])
                    """
                    word = smart_str(keyword)
                    cnt = cnt + 1
                    getWordData = FrequencyWordAll.objects.filter(word=keyword, date=date)
                    if getWordData.exists() :
                        FrequencyWordAll.objects.filter(id=getWordData[0].id).update(count=F('count') + keywords_all[date][keyword])
                    else :
                        dataWordAll = FrequencyWordAll(
                            date = date,
                            word = word,
                            count = int(keywords_all[date][keyword])
                        )
                        dataWordAll.save()
                    """

            # insert the 20 most frequent keywords
            sorted_keywords = sorted(keywords.items(), key=lambda x: x[1], reverse=True)
            for i in range(0, 20):
                try:
                    word = smart_str(sorted_keywords[i][0])
                    dataWord = FrequencyWord(
                        chatroom_id=chatroom_id,
                        word=word,
                        count=int(sorted_keywords[i][1])
                    )
                    dataWord.save()
                except:
                    pass

            # insert moment
            for week in sent_time:
                for hour in sent_time[week]:
                    dateTime = FrequencyTime(
                        chatroom_id=chatroom_id,
                        week=int(week),
                        hour=int(hour),
                        count=int(sent_time[week][hour])
                    )
                    dateTime.save()

            if len(sender_list) == 2:
                is_one_to_one = 1
                intimacy = {}
                for sender in response_time:
                    rt_average = sum(response_time[sender], timedelta()) / len(response_time[sender])
                    td_increment(intimacy, sender, " ", rt_average.total_seconds())

            # insert intimacy
            for member in intimacy:
                for friends in intimacy[member]:
                    dataIntimacy = Intimacy(
                        chatroom_id=chatroom_id,
                        name=unicode(str(member), 'utf-8').encode('utf-8'),
                        target=unicode(str(friends), 'utf-8').encode('utf-8'),
                        count=int(intimacy[member][friends])
                    )
                    dataIntimacy.save()

            # insert each char count
            for sender in kcount:
                dataChar = FrequencyChars(
                    chatroom_id=chatroom_id,
                    name=unicode(str(sender), 'utf-8').encode('utf-8')
                )
                try:
                    dataChar.count_char_1 = int(kcount[sender])
                except:
                    pass
                try:
                    dataChar.count_char_2 = int(hcount[sender])
                except:
                    pass
                try:
                    dataChar.count_char_3 = int(ucount[sender])
                except:
                    pass
                dataChar.save()

            Chatroom.objects.filter(id=chatroom_id).update(
                complete_datetime=datetime.datetime.now(),
                is_one_to_one=is_one_to_one
            )

            return HttpResponse(myUid)

    return HttpResponse('Failed to Upload File')
# plt.rc('font', family='AppleGothic')
import seaborn as sns

# first commit
# second commit
# third commit
# fourth commit

x_data = np.array([
    '영희가 사랑하는 강아지 백구를 산책시키고 있다.',
    '철수가 사랑하는 소 누렁이를 운동시키고 있다.',
    '영희와 철수는 소와 강아지를 산책 및 운동시키고 있다.'
])

twitter = Twitter()
for i, document in enumerate(x_data):
    nouns = twitter.nouns(document)
    x_data[i] = ' '.join(nouns)
print(x_data)
# ['영희 사랑 강아지 백구 산책', '철수 사랑 소 누렁이 운동', '영희 철수 소 강아지 산책 및 운동']

vect = TfidfVectorizer()
x_data = vect.fit_transform(x_data)

cosine_similarity_matrix = (x_data * x_data.T)
print(cosine_similarity_matrix.shape)  # (3, 3)
print(cosine_similarity_matrix)
'''
(0, 1)	0.19212485958220318
(0, 2)	0.5605318467638107
(0, 0)	0.9999999999999999
(1, 2)	0.4113054999991637
tw = Twitter()  # use the Twitter morphological analyzer

import sys
reload(sys)
sys.setdefaultencoding('utf-8')

# number of top words (nouns) to find
_toFind_ = 30

# read the document
doc_ko = open('./k_tex2.txt').read()
# print(doc_ko)

# extract only the nouns
token_ko = tw.nouns(doc_ko)

# wrap the tokens for use with nltk
res_ko = nltk.Text(token_ko, name=u'sutuk1')
print(len(res_ko.tokens))       # returns number of tokens (document length)
print(len(set(res_ko.tokens)))  # returns number of unique tokens

# on_list is a list; most_common returns a list of tuples, where each tuple holds
# a unicode string as its first element and the frequency count as its second.
on_list = res_ko.vocab().most_common(_toFind_)

# print(list(on_list[1])[0])  # test code