def crawling_track(url):
    """Scrape a single Mnet track page into Music/Album/Artist VO objects.

    NOTE(review): this block looks truncated — `attrs` is collected but
    never consumed, Music_ID is never assigned (see TODO comment below),
    and nothing is persisted or returned. Confirm against the original file.
    """
    # VO objects that will receive the scraped values
    musicVO = Music_VO()
    albumVO = Album_VO()
    artistVO = Artist_VO()
    # Music_ID must be filled in from the link!
    # Music_VO.Music_ID =
    # bs from html response....
    html = cw.crawling(url=url)
    bs = BeautifulSoup(html, 'html.parser')
    tag_music_info = bs.find('div', attrs={'class': 'music_info_view'})
    # song-summary table
    summary = tag_music_info.find('div', attrs={'class': 'music_info_cont'})
    album_tag = summary.find('tbody').find('a')
    if album_tag is not None:
        # href looks like ".../album/<id>"; the trailing path segment is the id.
        # NOTE(review): kept as str here, while other crawlers in this file
        # cast the same value to int — confirm which type the DB expects.
        albumVO.Album_Node = album_tag['href'].strip(" ")
        albumVO.Album_ID = albumVO.Album_Node.rsplit('/', 1)[1]
        musicVO.Album_ID = albumVO.Album_ID
    artist_tag = bs.find('span', attrs={'class': 'artist_txt'}).find('a')
    if artist_tag != None:
        artistVO.Artist_Node = artist_tag['href'].strip(" ")
        artistVO.Artist_ID = artistVO.Artist_Node.rsplit('/', 1)[1]
        artistVO.Artist_Name = artist_tag.get_text()
        albumVO.Singer_ID = artistVO.Artist_ID
    # <p class="right"> cells of the left_con <li>; currently unused (truncated?)
    attrs = summary.find('li', attrs={
        'class': 'left_con'
    }).findAll('p', attrs={'class': 'right'})
def crawling_Artist(um):
    """Crawl the artist page described by *um* and upsert an Artist_VO.

    The page at um.URL is parsed for the artist's name, gender and
    group/solo flag; the populated VO is merged and committed.
    """
    vo = Artist_VO()
    vo.Artist_ID = um.END_POINT
    vo.Artist_Node = '/'.join([um.NODE, str(um.END_POINT)])
    vo.Group = False

    soup = BeautifulSoup(crawler.crawling(url=um.URL), 'html.parser')
    info = soup.find('div', attrs={'class': 'artist_info'})
    if info is not None:
        name_anchor = info.find('a', attrs={'class': 'song_name'})
        if name_anchor is None:
            # Fallback: plain-text name in the top-left cell.
            vo.Artist_Name = info.find('li', attrs={
                'class': 'top_left'
            }).find('p').get_text().strip()
            print("############# strip 결과 #############\n",
                  vo.Artist_Name,
                  "\n############# strip 결과 #############\n")
        else:
            vo.Artist_Name = name_anchor.get_text()
        a = info.find('div', attrs={'class': 'a_info_cont'})
        for span in info.findAll('span', attrs={'class': 'right'}):
            if span is None:
                continue
            # Strip every whitespace-like character, then split on '|'.
            cleaned = span.get_text().strip()
            for ws in (' ', '\r', '\n', '\t', '\xa0'):
                cleaned = cleaned.replace(ws, '')
            # print(text_list)
            for token in cleaned.split('|'):
                if token == '남성' or token == '여성' or token == '혼성':
                    vo.Gender = token
                if token == '그룹':
                    vo.Group = True
    db_session.merge(vo)
    db_session.commit()
    print(vo)
def crawling_artist(id):
    """Crawl http://www.mnet.com/artist/<id> and upsert an Artist_VO.

    :param id: numeric artist id. (NOTE: shadows the builtin ``id``;
        the name is kept so keyword callers don't break.)

    Side effects: merges + commits the VO via ``db_session`` and sleeps
    0.5 s to stay under Mnet's rate limit.
    """
    artistVO = Artist_VO()
    artistVO.Artist_ID = id
    artistVO.Artist_Node = '/artist/{0}'.format(id)
    artistVO.Group = False
    url = ''.join(['http://www.mnet.com', artistVO.Artist_Node])
    html = cw.crawling(url)
    bs = BeautifulSoup(html, 'html.parser')
    tag_artist_info = bs.find('div', attrs={'class': 'artist_info'})
    if tag_artist_info is not None:
        singer = tag_artist_info.find('a', attrs={'class': 'song_name'})
        if singer is not None:
            artistVO.Artist_Name = singer.get_text()
        else:
            # Fallback: plain-text name in the top-left cell.
            artistVO.Artist_Name = tag_artist_info.find(
                'li', attrs={
                    'class': 'top_left'
                }).find('p').get_text().strip()
        # One C-level pass removes all whitespace-ish chars, replacing the
        # original chain of five .replace() calls.
        _ws = str.maketrans('', '', ' \r\n\t\xa0')
        # findAll never yields None, so no per-tag None check is needed.
        for tag in tag_artist_info.findAll('span', attrs={'class': 'right'}):
            for text in tag.get_text().strip().translate(_ws).split('|'):
                if text in ('남성', '여성', '혼성'):
                    artistVO.Gender = text
                elif text == '그룹':
                    artistVO.Group = True
    db_session.merge(artistVO)
    db_session.commit()
    time.sleep(0.5)  # without the sleep, Mnet blocks us after ~200 requests
def crawling_mnet_month_chart(url):
    """Scrape every track row of an Mnet monthly chart page and upsert
    the corresponding Music / Album / Artist rows.

    Bug fixed vs. the original: the three VO objects were created once
    and reused across loop iterations, so a row whose album/artist
    anchor was missing silently inherited — and committed — the previous
    row's values. Fresh VOs are now built for every row.
    """
    html = cw.crawling(url=url)
    bs = BeautifulSoup(html, 'html.parser')
    tag_music_list = bs.find('div', attrs={'class': 'MMLTable jQMMLTable'})
    tag_tbody = tag_music_list.find('tbody')
    for tag_tr in tag_tbody.findAll('tr'):
        # fresh VOs per row — prevents stale carry-over between rows
        artistVO = Artist_VO()
        albumVO = Album_VO()
        musicVO = Music_VO()
        item_title_tag_td = tag_tr.find('td', attrs={'class': 'MMLItemTitle'})
        # unique id of the track (value of the row's checkbox input)
        musicVO.Music_ID = tag_tr.find('td', attrs={
            'class': 'MMLItemCheck'
        }).find('input')["value"]
        musicVO.Music_Title = item_title_tag_td.find('a', attrs={
            'class': 'MMLI_Song'
        }).get_text()
        album_tag = item_title_tag_td.find(
            'a', attrs={'class': 'MMLIInfo_Album'})
        artist_tag = item_title_tag_td.find(
            'a', attrs={'class': 'MMLIInfo_Artist'})
        if album_tag is not None:
            albumVO.Album_Title = album_tag.get_text()
            albumVO.Album_Node = album_tag["href"].strip(" ")
            # href looks like ".../album/<id>"
            albumVO.Album_ID = int(albumVO.Album_Node.rsplit('/', 1)[1])
            musicVO.Album_ID = albumVO.Album_ID
        if artist_tag is not None:
            artistVO.Artist_Name = artist_tag.get_text()
            artistVO.Artist_Node = artist_tag["href"].strip(" ")
            artistVO.Artist_ID = int(
                artistVO.Artist_Node.rsplit('/', 1)[1])
            albumVO.Singer_ID = artistVO.Artist_ID
        # one commit per row instead of three (the original TODO asked for
        # exactly this consolidation)
        db_session.merge(artistVO)
        db_session.merge(albumVO)
        db_session.merge(musicVO)
        db_session.commit()
from bs4 import BeautifulSoup from db_accessing.VO import Album_VO, Music_VO, Artist_VO from modules.collection import crawler url = 'http://www.mnet.com/track/33801010' html = crawler.crawling(url) bs = BeautifulSoup(html, 'html.parser') tag_music_info = bs.find('div', attrs={'class': 'music_info_view'}) albumVO = Album_VO() musicVO = Music_VO() artistVO = Artist_VO() # 곡 소개 테이블 summary = tag_music_info.find('div', attrs={'class': 'music_info_cont'}) album_tag = summary.find('table').find('a') musicVO.Music_ID = 33801010 if album_tag is not None: albumVO.Album_Node = album_tag['href'].strip(" ") albumVO.Album_ID = int(albumVO.Album_Node.rsplit('/', 1)[1]) musicVO.Album_ID = albumVO.Album_ID artist_tag = bs.find('span', attrs={'class': 'artist_txt'}).find('a') if artist_tag != None: