# NOTE(review): the four statements below look like the tail of an analysis
# loop whose header lies outside this view; restore the enclosing indentation
# when merging.
endTime = time.time() - startTime
print('Running analyze : ' + str(endTime))
data = analyze.json_to_str(item.get('resultfile'), 'message')
item['count_wordfreq'] = analyze.count_wordfreq(data)

# Data visualization: draw a word cloud and a bar chart for every item.
for item in items:
    # Report the time elapsed since startTime on each iteration.
    endTime = time.time() - startTime
    print('Running visualize : ' + str(endTime))

    # Keep only the 50 most frequent of the analyzed words.
    top_words = dict(item['count_wordfreq'].most_common(50))

    # Word cloud and bar chart share the same file name.
    filename = "%s_%s_%s" % (item['pagename'], item['since'], item['until'])
    visualize.wordcloud(filename, top_words)

    # The counts live in a dict, so both axes are converted to lists.
    frequencies = list(top_words.values())
    words = list(top_words)
    visualize.graph_bar(
        title='%s 빈도 분석' % item['pagename'],
        xlabel='단어',
        ylabel='빈도 수',
        values=frequencies,
        ticks=words,  # labels for the category axis
        showgrid=True,  # draw grid lines
        filename=filename,  # save the chart to a file
        showgraph=False,  # do not show the chart right away
    )
# Data analysis: build a word-frequency result for every configured item,
# printing both the extracted text and the resulting counts.
for item in CONFIG['items']:
    message_text = analyze.json_to_str(item['resultfile'], 'message')
    print(message_text)
    word_freq = analyze.count_wordfreq(message_text)
    item['count_wordfreq'] = word_freq
    print(word_freq)

# Data visualization: word cloud plus bar chart per item.
for item in CONFIG['items']:
    # Restrict the charts to the 50 most common words.
    top_words = dict(item['count_wordfreq'].most_common(50))
    filename = '%s_%s_%s' % (item['page_name'], item['since'], item['until'])

    output_dir = CONFIG['common']['result_vidualization_dir']
    visualize.wordcloud(filename, top_words, output_dir)

    # The counts are held in a dict, so convert them to lists for plotting.
    frequencies = list(top_words.values())  # bar heights
    words = list(top_words)  # axis labels come from the keys
    visualize.graph_bar(
        title='%s 빈도 분석' % item['page_name'],
        xlabel='단어',
        ylabel='빈도수',
        values=frequencies,
        ticks=words,
        showgrid=True,  # whether to draw grid lines on the chart
        filename=filename,
        showgraph=False,  # whether to open a pop-up window
        result_vidualization_dir=output_dir,
    )
'pagename': 'jtbcnews', 'since': '2017-01-01', 'until': '2017-10-16' }, { 'pagename': 'chosun', 'since': '2017-01-01', 'until': '2017-10-16' }] # collection for item in items: resultfile = collection.crawling(**item) item['resultfile'] = resultfile # analysis for item in items: data = analyze.json_to_str(item['resultfile'], 'message') item['count'] = analyze.count_wordfreq(data) # visualization for item in items: count = item['count'] count_t50 = dict(count.most_common(50)) filename = '%s_%s_%s.png' % (item['pagename'], item['since'], item['until']) visualize.graph_bar(values=list(count_t50.values()), ticks=list(count_t50.keys()), showgrid=True, filename=filename, showgraph=False)
# The string literal below is a no-op expression holding a disabled earlier
# version of the crawl loop; it is kept verbatim.
""" for country in CONFIG['countries']: collect.crawlling_foreigner_visitor(country, **CONFIG['common']) """

# Crawl once per configured country and collect each call's return value
# under the 'foreign_visitor' key.
foreign_files = []
resultfiles['foreign_visitor'] = foreign_files
for country in CONFIG['countries']:
    foreign_files.append(
        collect.crawlling_foreigner_visitor(country, **CONFIG['common']))
# print(resultfiles)

# 1. analysis and visualize (disabled)
# result_analysis = analyze.analysis_correlation(resultfiles)
# print(len(resultfiles))
# print(result_analysis)
# visualize.graph_scatter(result_analysis)

# 2. analysis and visualize
result_analysis = analyze.analysis_correlation_by_tourspot(resultfiles)
# print(type(result_analysis))
visualize.graph_bar(result_analysis)

# Disabled inline plotting alternative:
# graph_table = pd.DataFrame(result_analysis, colums=['tourspot','r_중국','r_일본','r_미국'])
# graph_table = graph_table.set_index('tourspot')
# graph_table.plot(kind='bar')
# plt.show()
# Data analysis (analyze): convert each item's crawled JSON into a string and
# store its word-frequency result on the item for the visualization step.
for item in CONFIG['items']:
    # Convert the 'message' body text to a single string.
    body_text = analyze.json_to_str(item['resultfile'], 'message')
    # NOTE(review): `count_wordfteq` looks like a misspelling of
    # `count_wordfreq`; confirm against the analyze module before renaming.
    item['count_wordfreq'] = analyze.count_wordfteq(body_text)

# Data visualization (visualize): word cloud plus bar chart per item.
for item in CONFIG['items']:
    top_words = dict(item['count_wordfreq'].most_common(50))
    filename = '%s_%s_%s' % (item['pagename'], item['since'], item['until'])
    visualize.wordcloud(filename, top_words, CONFIG['result_directory_v'])

    chart_options = dict(
        title='%s 빈도 분석' % item['pagename'],
        result_directory_v=CONFIG['result_directory_v'],
        xlabel='단어',
        ylabel='빈도 수',
        values=list(top_words.values()),
        ticks=list(top_words),
        showgrid=False,
        filename=filename,
        showgraph=False,
    )
    visualize.graph_bar(**chart_options)
# Data visualization (visualize): for every item, draw a word cloud and a bar
# chart of its 50 most frequent words.
for item in items:
    # item['count_wordfreq'] holds the word counts, e.g.
    # Counter({'빵': 28, '문재인': 27, '년': 20, '편의점': 20, ...}).
    count = item['count_wordfreq']

    # most_common(50) returns the top 50 (word, count) pairs in rank order;
    # dict() turns them into a mapping such as {'오늘': 126, '일': 110, ...}.
    # It is called once here; the original also evaluated it a second time
    # and threw the result away.
    count_m50 = dict(count.most_common(50))

    filename = '%s_%s_%s' % (item['pagename'], item['since'], item['until'])
    output_dir = CONFIG['common']['result_visual_dir']

    # Sample of the tag data noted by the original author for the word cloud
    # (color is an RGB value):
    #   [{'color': (16, 176, 94), 'size': 92, 'tag': '오늘'},
    #    {'color': (100, 64, 176), 'size': 82, 'tag': '일'}, ...]
    visualize.wordcloud(filename, count_m50, output_dir)

    visualize.graph_bar(
        output_dir,
        title='%s 빈도분석 ' % (item['pagename']),
        xlabel='단어',
        ylabel='빈도수',
        values=list(count_m50.values()),  # y values: dict values as a list
        ticks=list(count_m50.keys()),  # x-axis labels
        showgrid=True,  # whether to draw the grid
        filename=filename,  # image file the chart is saved to
        showgraph=False,  # do not display the chart; only save it
    )
from config import CONFIG

if __name__ == '__main__':
    # Values returned by the crawl calls, grouped by data set.
    resultfiles = {'tourspot_visitor': [], 'foreign_visitor': []}

    # collection
    tourspot_file = collect.crawling_tourspot_visitor(
        district=CONFIG['district'], **CONFIG['common'])
    resultfiles['tourspot_visitor'].append(tourspot_file)

    for country in CONFIG['countries']:
        foreign_file = collect.crawling_foreign_visitor(
            country=country, **CONFIG['common'])
        resultfiles['foreign_visitor'].append(foreign_file)

    # 1. analysis
    results = analyze.analysis_correlation(resultfiles=resultfiles)

    # visualize: print every result, then draw the scatter plot.
    for result in results:
        print(result)
    # NOTE(review): indentation was lost in this view; graph_scatter is
    # assumed to run once after the loop because it receives all results.
    visualize.graph_scatter(results, showgraph=False)

    # 2. analysis & visualization
    result_analysis = analyze.analysis_correlation_by_tourspot(
        resultfiles=resultfiles)
    print(result_analysis)
    visualize.graph_bar(result_analysis, showgraph=True)
}] # 데이터 수집(collection) for item in items: resultfile = collect.crawling(**item, fetch=False) item['resultfile'] = resultfile # 데이터 분석(analyze) for item in items: data = analyze.json_to_str(item['resultfile'], 'message') item['count_wordfreq'] = analyze.count_wordfreq(data) print(item['count_wordfreq']) # 데이터 시각화(visualize) for item in items: count = item['count_wordfreq'] count_m50 = dict(count.most_common(50)) filename = "%s_%s_%s" % (item['pagename'], item['since'], item['until']) visualize.wordcloud(filename, count_m50) visualize.graph_bar( title='%s 빈도분석' % (item['pagename']), xlabel='단어', ylabel='빈도', values=list(count_m50.values()), ticks=list(count_m50.keys()), showgrid=False, # 그리드 그리기 filename=filename, showgraph=False)
# Reference signature noted by the original author:
#   crawling(pagename, since, until, fetch=True, result_directory='')
if __name__ == '__main__':
    # Data collection (collect): crawl every configured item, passing the
    # shared settings from CONFIG['common'] along with the item's own fields.
    for item in CONFIG['items']:
        item['resultfile'] = collect.crawling(**item, **CONFIG['common'])

    # Data analysis (analyze): word-frequency count of the 'message' text.
    for item in CONFIG['items']:
        message_text = analyze.json_to_str(item['resultfile'], 'message')
        item['count_wordfreq'] = analyze.count_wordfreq(message_text)

    # Data visualization (visualize): word cloud plus bar chart per item.
    for item in CONFIG['items']:
        # Top 50 words as a plain dict.
        top_words = dict(item['count_wordfreq'].most_common(50))

        # Name of the image file to save.
        filename = '%s_%s_%s' % (item['pagename'], item['since'],
                                 item['until'])
        # Reference signature noted by the original author:
        #   wordcloud(filename, wordfreq), with a mention of pytagcloud.
        visualize.wordcloud(filename, top_words)

        frequencies = list(top_words.values())  # y values: the dict's values
        words = list(top_words)  # x values: the dict's keys
        visualize.graph_bar(
            title='%s 빈도 분석' % item['pagename'],
            xlabel='단어',
            ylabel='빈도수',
            values=frequencies,
            ticks=words,
            showgrid=False,  # whether to draw grid lines
            filename=filename,  # save to a file
            showgraph=False,  # whether to open a pop-up window
        )
# NOTE(review): the statements below look like the tail of an analysis loop
# whose header lies outside this view; restore the enclosing indentation when
# merging.
item['count_wordfreq'] = analyze.count_wordfreq(data)
# print(data)
print(item['count_wordfreq'])
# item['count_wordfreq'] = analyze.count_wordfreq(data)
# The count_wordfreq call above performs the frequency analysis.

# Disabled manual crawl call:
# collect.crawling(
#     "jtbcnews",
#     "2017-01-01",
#     "2018-12-31")

# Data visualization (visualize): word cloud plus bar chart per item.
for item in items:
    top_words = dict(item['count_wordfreq'].most_common(50))
    filename = '%s_%s_%s' % (item['pagename'], item['since'], item['until'])
    visualize.wordcloud(filename, top_words)

    frequencies = list(top_words.values())
    words = list(top_words)
    visualize.graph_bar(
        title='%s 빈도 분석' % item['pagename'],
        xlabel='단어',
        ylabel='빈도수',
        values=frequencies,
        ticks=words,
        showgrid=False,  # leave False if the grid is unwanted
        filename=filename,  # passed so the chart is saved
        showgraph=False,  # whether to open a window
    )
# 6/18: finish the bar graph