def crawling_pelicana():
    results = []

    # Unlike range, count() with no end runs forever, so we break explicitly below.
    for page in count(start=1):  # range(1, 6):
        url = f'http://pelicana.co.kr/store/stroe_search.html?page={page}&branch_name=&gu=&si='
        print(url)

        html = collection.crawling(url)
        bs = BeautifulSoup(html, 'html.parser')
        tag_table = bs.find('table', attrs={'class': 'table mt20'})
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.findAll('tr')

        # An empty page means we have crawled past the last one.
        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            # print(strings)
            name = strings[1]
            address = strings[3]
            sidogu = address.split()[:2]
            # print(sidogu)
            results.append((name, address) + tuple(sidogu))

    # store
    for t in results:
        print(t)

    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    print(table['sido'], table['gungu'])

    table['sido'] = table.sido.apply(lambda v: collection.sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: collection.gungu_dict.get(v, v))

    # table = table.reset_index().set_index('index')
    table.to_csv('{0}/table-pelicana.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_pelicana():
    results = []

    # collection
    for page in count(start=1):  # crawl page 1, 2, ... until an empty page appears
        url = 'http://www.pelicana.co.kr/store/stroe_search.html?branch_name=&gu=&si=&page=%d' % page

        html = collection.crawling(url)
        bs = BeautifulSoup(html, 'html.parser')

        # Narrow down the tags step by step
        tag_table = bs.find('table', attrs={'class': 'table mt20'})  # the table
        tag_tbody = tag_table.find('tbody')                          # the table's body
        tags_tr = tag_tbody.findAll('tr')                            # the rows

        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            name = strings[1]
            address = strings[3]
            sidogu = address.split()[:2]
            results.append((name, address) + tuple(sidogu))

    # store
    for t in results:
        print(t)

    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: collection.sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: collection.gungu_dict.get(v, v))

    table = table.reset_index().set_index('index')
    table.to_csv('{0}/table_pelicana.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)
def crawling_pelicana():
    # collection
    results = []

    for page in range(1, 118):
        url = f'http://pelicana.co.kr/store/stroe_search.html?page={page}&branch_name=&gu=&si='

        html = collection.crawling(url)
        soup = BeautifulSoup(html, 'html.parser')

        tag_table = soup.find('table', attrs={'class': 'table mt20'})
        tag_tbody = tag_table.find('tbody')
        tags_tr = tag_tbody.find_all('tr')

        if len(tags_tr) == 0:
            break

        for tag_tr in tags_tr:
            strings = list(tag_tr.strings)
            name = strings[1]
            address = strings[3]
            sidogu = address.split()[:2]
            results.append((name, address) + tuple(sidogu))

    # store
    table = pd.DataFrame(results, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: collection.sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: collection.gungu_dict.get(v, v))

    # table = table.reset_index(drop=True).set_index('no')
    table.to_csv('{0}/table-pelicana.csv'.format(RESULT_DIRECTORY),
                 encoding='UTF-8', mode='w')
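All three variants normalize the parsed 시/도 and 군/구 tokens with the dict.get(v, v) idiom, which maps known names through collection.sido_dict / collection.gungu_dict and leaves unmapped values unchanged. A minimal sketch of that pattern, with hypothetical mapping entries:

import pandas as pd

# Hypothetical mapping entries; the real ones live in collection.sido_dict.
sido_dict = {'서울': '서울특별시', '경기': '경기도'}

table = pd.DataFrame([('서울', '강남구'), ('부산', '해운대구')],
                     columns=['sido', 'gungu'])

# dict.get(v, v): return the mapped name if v is a key, otherwise v itself.
table['sido'] = table.sido.apply(lambda v: sido_dict.get(v, v))
print(table.sido.tolist())  # ['서울특별시', '부산'] - unknown '부산' passes through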
import collection
import analyze
import visualization

if __name__ == '__main__':
    items = [{
        'pagename': 'jtbcnews',
        'since': '2018-05-01',
        'until': '2018-05-31'
    }, {
        'pagename': 'chosun',
        'since': '2018-05-01',
        'until': '2018-05-31'
    }]

    # collection
    for item in items:
        result_file = collection.crawling(**item)
        item['resultfile'] = result_file

    # analyze
    for item in items:
        print(item['resultfile'])

    # visualization
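Each item dict is unpacked into keyword arguments via **item. A toy stand-in for collection.crawling (hypothetical body, real return value is a result-file path) makes the equivalence explicit:

item = {'pagename': 'jtbcnews', 'since': '2018-05-01', 'until': '2018-05-31'}

def crawling(pagename, since, until):
    # Toy stand-in for collection.crawling, just to show the call shape.
    return '%s_%s_%s.json' % (pagename, since, until)

# crawling(**item) unpacks the dict into keyword arguments, i.e. the same as
# crawling(pagename='jtbcnews', since='2018-05-01', until='2018-05-31')
print(crawling(**item))  # jtbcnews_2018-05-01_2018-05-31.json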
def store_nene(data):
    table = pd.DataFrame(data, columns=['name', 'address', 'sido', 'gungu'])
    table['sido'] = table.sido.apply(lambda v: collection.sido_dict.get(v, v))
    table['gungu'] = table.gungu.apply(lambda v: collection.gungu_dict.get(v, v))

    table = table.reset_index().set_index('index')
    table.to_csv('{0}/table_nene.csv'.format(RESULT_DIRECTORY),
                 encoding='utf-8', mode='w', index=True)


if __name__ == '__main__':
    # pelicana collection
    crawling_pelicana()

    # nene collection
    collection.crawling(
        # The Korean query values are URL-encoded automatically:
        # http://nenechicken.com/subpage/where_list.asp?target_step2=전체&proc_type=step1&target_step1=전체
        # http://nenechicken.com/subpage/where_list.asp?target_step2=%EC%A0%84%EC%B2%B4&proc_type=step1&target_step1=%EC%A0%84%EC%B2%B4
        # The encoded parts are substituted via %s. Why? For readability.
        url='http://nenechicken.com/subpage/where_list.asp?target_step2=%s&proc_type=step1&target_step1=%s'
            % (urllib.parse.quote('전체'), urllib.parse.quote('전체')),
        proc=proc_nene,
        store=store_nene)
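urllib.parse.quote performs exactly the percent-encoding shown in the comments above; a quick check with the standard library:

import urllib.parse

# '전체' ("all") percent-encodes to the byte sequence seen in the URL above.
encoded = urllib.parse.quote('전체')
print(encoded)                        # %EC%A0%84%EC%B2%B4
print(urllib.parse.unquote(encoded))  # 전체 - it round-trips back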
if __name__ == '__main__':
    items = [{
        "pagename": "jtbcnews",
        "since": '2017-01-01',
        "until": '2017-10-17'
    }, {
        "pagename": "chosun",
        "since": '2017-01-01',
        "until": '2017-10-17'
    }]

    # HTTP Error 500: Internal Server Error -> presumably means there is no data
    # collection
    for item in items:
        result_file = collection.crawling(**item, fetch=True)
        item['result_file'] = result_file

    # analysis
    for item in items:
        # print(item['result_file'])
        data = analyze.json_to_str(item['result_file'], 'message')
        item['count'] = analyze.count_word_freq(data)

    # visualization
    for item in items:
        count = item['count']
        count_top50 = dict(count.most_common(50))
        file_name = '%s_%s_%s' % (item['pagename'], item['since'], item['until'])
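analyze.count_word_freq evidently returns a collections.Counter, since most_common(50) is called on it; most_common(n) yields at most n (word, frequency) pairs in descending frequency order:

from collections import Counter

# Frequencies as a tokenizer might produce them from the crawled messages.
count = Counter(['뉴스', '정치', '뉴스', '경제', '뉴스', '정치'])

count_top50 = dict(count.most_common(50))  # at most 50 pairs, most frequent first
print(count_top50)  # {'뉴스': 3, '정치': 2, '경제': 1}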
until = '2017-10-01'

if __name__ == '__main__':
    items = [{
        'pagename': 'jtbcnews',
        'since': '2017-10-01',
        'until': '2017-10-17'
    }, {
        'pagename': 'chosun',
        'since': '2017-10-01',
        'until': '2017-10-17'
    }]

    # collection
    for item in items:
        resultfile = collection.crawling(**item, fetch=False)
        item['resultfile'] = resultfile

    # analysis
    for item in items:
        data = analyze.json_to_str(item['resultfile'], 'message')
        item['count'] = analyze.count_wordfreq(data)

    # visualization
    for item in items:
        count = item['count']
        count_t50 = dict(count.most_common(50))
        filename = '%s_%s_%s' % (item['pagename'], item['since'], item['until'])

        visualize.wordcloud(count_t50, filename)
        visualize.graph_bar(values=list(count_t50.values()),
                            # the original snippet was truncated here; the
                            # remaining arguments below are assumed, not sourced
                            labels=list(count_t50.keys()),
                            title=filename)
import urllib.parse

import collection


def proc_nene(xml):
    pass


def store_nene(data):
    pass


if __name__ == '__main__':
    # nene collection
    collection.crawling(
        url='http://nenechicken.com/subpage/where_list.asp?target_step2=%s&proc_type=step1&target_step1=%s'
            % (urllib.parse.quote('전체'), urllib.parse.quote('전체')),
        proc=proc_nene,
        store=store_nene)
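proc_nene is still a stub here. A minimal sketch of what the filled-in callback might look like, assuming collection.crawling hands the fetched body to proc and forwards its return value to store, and assuming each store sits in an <item> element with <aname1>/<addr> children (element names are guesses, not confirmed against the live where_list.asp response):

from bs4 import BeautifulSoup

def proc_nene(xml):
    # Sketch only: the <item>/<aname1>/<addr> element names are assumptions
    # and must be checked against the actual response.
    results = []
    soup = BeautifulSoup(xml, 'html.parser')
    for tag in soup.find_all('item'):
        name = tag.find('aname1').get_text(strip=True)
        address = tag.find('addr').get_text(strip=True)
        sidogu = address.split()[:2]  # leading 시/도 and 군/구 tokens
        results.append((name, address) + tuple(sidogu))
    return results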