def crawling_track(url):
    # Create the VO objects to be populated
    musicVO = Music_VO()
    albumVO = Album_VO()
    artistVO = Artist_VO()
    # Music_ID is expected to be filled in from the link!
    # Music_VO.Music_ID =

    # bs from html response....
    html = cw.crawling(url=url)
    bs = BeautifulSoup(html, 'html.parser')
    tag_music_info = bs.find('div', attrs={'class': 'music_info_view'})

    # Track summary table
    summary = tag_music_info.find('div', attrs={'class': 'music_info_cont'})
    album_tag = summary.find('tbody').find('a')
    if album_tag is not None:
        albumVO.Album_Node = album_tag['href'].strip(" ")
        albumVO.Album_ID = albumVO.Album_Node.rsplit('/', 1)[1]
        musicVO.Album_ID = albumVO.Album_ID

    artist_tag = bs.find('span', attrs={'class': 'artist_txt'}).find('a')
    if artist_tag is not None:
        artistVO.Artist_Node = artist_tag['href'].strip(" ")
        artistVO.Artist_ID = artistVO.Artist_Node.rsplit('/', 1)[1]
        artistVO.Artist_Name = artist_tag.get_text()
        albumVO.Singer_ID = artistVO.Artist_ID

    attrs = summary.find('li', attrs={'class': 'left_con'}).findAll('p', attrs={'class': 'right'})
def crawling_Artist(um):
    artistVO = Artist_VO()
    artistVO.Artist_ID = um.END_POINT
    artistVO.Artist_Node = '/'.join([um.NODE, str(um.END_POINT)])
    artistVO.Group = False

    html = crawler.crawling(url=um.URL)
    bs = BeautifulSoup(html, 'html.parser')
    tag_artist_info = bs.find('div', attrs={'class': 'artist_info'})
    if tag_artist_info is not None:
        singer = tag_artist_info.find('a', attrs={'class': 'song_name'})
        if singer is not None:
            artistVO.Artist_Name = singer.get_text()
        else:
            artistVO.Artist_Name = tag_artist_info.find(
                'li', attrs={'class': 'top_left'}).find('p').get_text().strip()
            print("############# strip result #############\n",
                  artistVO.Artist_Name,
                  "\n############# strip result #############\n")

        tags = tag_artist_info.findAll('span', attrs={'class': 'right'})
        for tag in tags:
            if tag is not None:
                # Collapse whitespace, then split the "gender | type" fields
                text_list = tag.get_text().strip().replace(' ', '').replace(
                    '\r', '').replace('\n', '').replace('\t', '').replace(
                        '\xa0', '').split('|')
                for text in text_list:
                    if text in ('남성', '여성', '혼성'):
                        artistVO.Gender = text
                    if text == '그룹':
                        artistVO.Group = True

        db_session.merge(artistVO)
        db_session.commit()
        print(artistVO)
def crawling_artist(id):
    artistVO = Artist_VO()
    artistVO.Artist_ID = id
    artistVO.Artist_Node = '/artist/{0}'.format(id)
    artistVO.Group = False

    url = ''.join(['http://www.mnet.com', artistVO.Artist_Node])
    html = cw.crawling(url)
    bs = BeautifulSoup(html, 'html.parser')
    tag_artist_info = bs.find('div', attrs={'class': 'artist_info'})
    if tag_artist_info is not None:
        singer = tag_artist_info.find('a', attrs={'class': 'song_name'})
        if singer is not None:
            artistVO.Artist_Name = singer.get_text()
        else:
            artistVO.Artist_Name = tag_artist_info.find(
                'li', attrs={'class': 'top_left'}).find('p').get_text().strip()
            print("############# strip result #############\n",
                  artistVO.Artist_Name,
                  "\n############# strip result #############\n")

        tags = tag_artist_info.findAll('span', attrs={'class': 'right'})
        for tag in tags:
            if tag is not None:
                # Collapse whitespace, then split the "gender | type" fields
                text_list = tag.get_text().strip().replace(' ', '').replace(
                    '\r', '').replace('\n', '').replace('\t', '').replace(
                        '\xa0', '').split('|')
                print(text_list)
                for text in text_list:
                    if text in ('남성', '여성', '혼성'):
                        artistVO.Gender = text
                    if text == '그룹':
                        artistVO.Group = True

        db_session.merge(artistVO)
        db_session.commit()

    time.sleep(0.5)  # without this sleep, requests start getting blocked after roughly the 200th one...
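# The fixed 0.5 s sleep above is a blunt rate limit. Below is a minimal sketch of a
# politer alternative with retries and exponential backoff; `throttled_crawl` and its
# parameters are hypothetical, and it assumes the same `cw.crawling` helper used above.
import time


def throttled_crawl(url, retries=3, base_delay=0.5):
    """Fetch a page with a politeness delay, backing off exponentially on errors."""
    for attempt in range(retries):
        try:
            html = cw.crawling(url)  # same crawler entry point as the functions above
            time.sleep(base_delay)   # constant delay between successful requests
            return html
        except Exception:
            time.sleep(base_delay * (2 ** attempt))  # back off 0.5s, 1s, 2s, ...
    return None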
def crawling_album(um=UrlMaker()):
    # Receives a UrlMaker object and crawls a single album page
    albumVO = Album_VO()
    albumVO.Album_ID = um.END_POINT
    albumVO.Album_Node = '/'.join([um.NODE, str(um.END_POINT)])

    # bs from html response....
    html = cw.crawling(url=um.URL)
    bs = BeautifulSoup(html, 'html.parser')
    tag_Album_info = bs.find('div', attrs={'class': 'album_info'})
    if tag_Album_info is not None:
        # Careful: the tag object itself could end up stored as the album title,
        # hence the extra None check below!
        albumVO.Album_Title = tag_Album_info.find('li', attrs={'class': 'top_left'})
        if albumVO.Album_Title is not None:
            albumVO.Album_Title = albumVO.Album_Title.find('p').get_text().strip()

        album_info = tag_Album_info.find('div', {'class': 'a_info_cont'})
        summary = album_info.find('dl').find('dd').findAll('p')
        for tag in summary:
            left_span = tag.find('span', attrs={'class': 'left'}).get_text()
            right_span = tag.find('span', attrs={'class': 'right'})
            if left_span == '아티스트':
                right_span_a_tag = right_span.find('a')
                if right_span_a_tag is not None:
                    albumVO.Singer_ID = int(
                        right_span_a_tag['href'].strip().rsplit('/', 1)[1])
                else:
                    # No artist link: fall back to the 'Various Artists' row
                    albumVO.Singer_ID = Artist_VO.query.filter_by(
                        Artist_Name='Various Artists').first().Artist_ID
            if left_span == '발매일':
                # Default to 1-1-1, then keep each date part only if it is in range
                ymd = [1, 1, 1]
                ymd_data = list(map(int, right_span.get_text().split('.')))
                for i in range(len(ymd_data)):  # iterate over as many parts as the page provides
                    if i == 0 and 0 < ymd_data[i]:
                        ymd[i] = ymd_data[i]
                    elif i == 1 and 0 < ymd_data[i] < 13:
                        ymd[i] = ymd_data[i]
                    elif i == 2 and 0 < ymd_data[i] < 32:
                        ymd[i] = ymd_data[i]
                        try:
                            datetime(ymd[0], ymd[1], ymd[2])
                        except ValueError:  # e.g. February 30th: fall back to day 1
                            ymd[i] = 1
                albumVO.Release_Date = datetime(ymd[0], ymd[1], ymd[2])
            if left_span == '기획사' or left_span == '레이블':
                albumVO.Agency = right_span.get_text().strip()
            if left_span == '유통사':
                albumVO.Distributor = right_span.get_text().strip()

        descriptions = album_info.find('div', attrs={'class': 'text_slider'})
        if descriptions is not None:
            descriptions = descriptions.findAll('p')
            desc = str(descriptions[-1].get_text())
            # Store the description only if it fits the DB column
            if len(desc.encode('utf-8')) <= Album_VO.Description.type.length:
                albumVO.Description = preprocessing_string(desc)
                print("Album description: ", albumVO.Description)

    try:
        db_session.merge(albumVO)
        db_session.commit()
    except InternalError:
        # The description may contain characters the DB rejects (e.g. 4-byte
        # emoji in a utf8 MySQL column); strip to a whitelist and retry.
        db_session.rollback()
        try:
            import re
            pattern = re.compile(
                u'[^ ~`!@#$%^&*()_\-+={\[}\]:<.>/?\'\"\n\ta-zA-Z0-9\u3131-\u3163\uac00-\ud7a3]+'
            )  # whitelist: Hangul, keyboard punctuation, English letters, digits
            albumVO.Description = re.sub(pattern, ' ', albumVO.Description)
            print("Album description: ", albumVO.Description)
            db_session.merge(albumVO)
            db_session.commit()
            cw_log({albumVO.Album_ID: 'SUCCESS[RE.Compile] - Desc'})
        except Exception:
            # Give up on the description entirely
            print("full rollback", file=sys.stderr)
            cw_log({albumVO.Album_ID: 'FAILURE - Desc'})
            db_session.rollback()
            albumVO.Description = None
            db_session.merge(albumVO)
            db_session.commit()
    finally:
        print(albumVO)
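# A quick way to sanity-check the whitelist pattern above in isolation; the sample
# string below is made up. Anything outside Hangul / keyboard punctuation / ASCII
# letters and digits (stars, emoji, ...) collapses to a single space.
import re

_pattern = re.compile(
    u'[^ ~`!@#$%^&*()_\-+={\[}\]:<.>/?\'\"\n\ta-zA-Z0-9\u3131-\u3163\uac00-\ud7a3]+'
)

sample = '정규 1집 ★두근두근★ 발매 🎵'
print(re.sub(_pattern, ' ', sample))  # -> '정규 1집  두근두근  발매  '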
def crawling_mnet_month_chart(url):  # crawling_from_chart
    # Scrapes track data from an Mnet monthly chart page

    # VO objects
    artistVO = Artist_VO()
    albumVO = Album_VO()
    musicVO = Music_VO()

    html = cw.crawling(url=url)
    bs = BeautifulSoup(html, 'html.parser')

    # Populate the VOs from the chart table
    tag_music_list = bs.find('div', attrs={'class': 'MMLTable jQMMLTable'})
    tag_tbody = tag_music_list.find('tbody')
    tags_tr = tag_tbody.findAll('tr')
    for tag_tr in tags_tr:
        # Fields inside the item_title cell (eight fields in total to fill......)
        item_title_tag_td = tag_tr.find('td', attrs={'class': 'MMLItemTitle'})

        # The track's unique ID
        musicVO.Music_ID = tag_tr.find('td', attrs={
            'class': 'MMLItemCheck'
        }).find('input')['value']
        musicVO.Music_Title = item_title_tag_td.find('a', attrs={
            'class': 'MMLI_Song'
        }).get_text()

        album_tag = item_title_tag_td.find('a', attrs={'class': 'MMLIInfo_Album'})
        artist_tag = item_title_tag_td.find('a', attrs={'class': 'MMLIInfo_Artist'})

        if album_tag is not None:
            albumVO.Album_Title = album_tag.get_text()
            albumVO.Album_Node = album_tag['href'].strip(' ')
            albumVO.Album_ID = int(albumVO.Album_Node.rsplit('/', 1)[1])
            musicVO.Album_ID = albumVO.Album_ID
        if artist_tag is not None:
            artistVO.Artist_Name = artist_tag.get_text()
            # TODO: also add the node to the object and table!
            artistVO.Artist_Node = artist_tag['href'].strip(' ')
            artistVO.Artist_ID = int(artistVO.Artist_Node.rsplit('/', 1)[1])
            albumVO.Singer_ID = artistVO.Artist_ID

        # TODO: check whether these commits can be batched instead of issued per
        # row, and whether the VOs can be nested as related ORM objects
        # (sounds like bidirectional relationships...)
        db_session.merge(artistVO)
        db_session.commit()
        db_session.merge(albumVO)
        db_session.commit()
        db_session.merge(musicVO)
        db_session.commit()
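# The TODO above asks whether the per-row commits can be batched. A minimal
# sketch, assuming the same db_session: merge every row of a page, commit once
# at the end, and roll the whole page back on failure. `save_chart_page` is a
# hypothetical helper, not part of the existing code.
def save_chart_page(rows):
    """Persist a page of (artistVO, albumVO, musicVO) triples with one commit."""
    try:
        for artistVO, albumVO, musicVO in rows:
            db_session.merge(artistVO)
            db_session.merge(albumVO)
            db_session.merge(musicVO)
        db_session.commit()    # single commit for the whole page
    except Exception:
        db_session.rollback()  # one bad row discards the page
        raise

# Note this requires fresh VO instances per row; the loop above reuses a single
# VO triple and relies on merging immediately after each row is filled in.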
from datetime import datetime

from bs4 import BeautifulSoup

from db_accessing.VO import Album_VO, Music_VO, Artist_VO
from modules.collection import crawler

url = 'http://www.mnet.com/track/33801010'
html = crawler.crawling(url)
bs = BeautifulSoup(html, 'html.parser')
tag_music_info = bs.find('div', attrs={'class': 'music_info_view'})

albumVO = Album_VO()
musicVO = Music_VO()
artistVO = Artist_VO()

# Track summary table
summary = tag_music_info.find('div', attrs={'class': 'music_info_cont'})
album_tag = summary.find('table').find('a')

musicVO.Music_ID = 33801010
if album_tag is not None:
    albumVO.Album_Node = album_tag['href'].strip(" ")
    albumVO.Album_ID = int(albumVO.Album_Node.rsplit('/', 1)[1])
    musicVO.Album_ID = albumVO.Album_ID

artist_tag = bs.find('span', attrs={'class': 'artist_txt'}).find('a')
def crawling_album(um=UrlMaker()):
    # Receives a UrlMaker object and crawls a single album page
    albumVO = Album_VO()
    albumVO.Album_ID = um.END_POINT
    albumVO.Album_Node = '/'.join([um.NODE, str(um.END_POINT)])

    # bs from html response....
    html = cw.crawling(url=um.URL)
    bs = BeautifulSoup(html, 'html.parser')
    tag_Album_info = bs.find('div', attrs={'class': 'album_info'})
    if tag_Album_info is not None:
        # Careful: the tag object itself could end up stored as the album title,
        # hence the extra None check below!
        albumVO.Album_Title = tag_Album_info.find('li', attrs={'class': 'top_left'})
        if albumVO.Album_Title is not None:
            albumVO.Album_Title = albumVO.Album_Title.find('p').get_text().strip()

        album_info = tag_Album_info.find('div', {'class': 'a_info_cont'})
        summary = album_info.find('dl').find('dd').findAll('p')
        for tag in summary:
            left_span = tag.find('span', attrs={'class': 'left'}).get_text()
            right_span = tag.find('span', attrs={'class': 'right'})
            if left_span == '아티스트':
                right_span_a_tag = right_span.find('a')
                if right_span_a_tag is not None:
                    albumVO.Singer_ID = int(
                        right_span_a_tag['href'].strip().rsplit('/', 1)[1])
                else:
                    # No artist link: fall back to the 'Various Artists' row
                    albumVO.Singer_ID = Artist_VO.query.filter_by(
                        Artist_Name='Various Artists').first().Artist_ID
            if left_span == '발매일':
                # Default to 1-1-1, then keep each date part only if it is in range
                ymd = [1, 1, 1]
                ymd_data = list(map(int, right_span.get_text().split('.')))
                for i in range(len(ymd_data)):
                    if i == 0 and 0 < ymd_data[i]:
                        ymd[i] = ymd_data[i]
                    elif i == 1 and 0 < ymd_data[i] < 13:
                        ymd[i] = ymd_data[i]
                    elif i == 2 and 0 < ymd_data[i] < 32:
                        ymd[i] = ymd_data[i]
                albumVO.Release_Date = datetime(ymd[0], ymd[1], ymd[2])
            if left_span == '기획사' or left_span == '레이블':
                albumVO.Agency = right_span.get_text().strip()
            if left_span == '유통사':
                albumVO.Distributor = right_span.get_text().strip()

        descriptions = album_info.find('div', attrs={'class': 'text_slider'})
        if descriptions is not None:
            descriptions = descriptions.findAll('p')
            if len(descriptions) == 2:
                # Keep the raw markup so <br/> can be turned into newlines below
                desc = str(descriptions[1])
            else:
                # The ResultSet itself has no get_text(); take the last paragraph
                desc = descriptions[-1].get_text().replace('\n', '').replace('\t', '')
            desc = desc.replace('<p class="txt">', '').replace(
                '<br/>', '\n').replace('</p>', '').strip()
            albumVO.Description = desc

    db_session.merge(albumVO)
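# The branch above relies on the difference between str(tag), which keeps the raw
# markup (so <br/> survives and can be turned into '\n'), and get_text(), which
# strips tags outright. A tiny standalone illustration with made-up markup:
from bs4 import BeautifulSoup

_p = BeautifulSoup('<p class="txt">line one<br/>line two</p>', 'html.parser').find('p')
print(str(_p))        # <p class="txt">line one<br/>line two</p>
print(_p.get_text())  # line oneline two  (the <br/> is gone, with no newline)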
    if id % 5 == 0:
        db_session.commit()


collecting_album()

for id in range(33801010, 33801015):
    musicVO = Music_VO()
    musicVO.Music_ID = id
    musicVO.Music_Node = '/track/{0}'.format(id)

    # bs from html response....
    html = cw.crawling(url="http://www.mnet.com%s" % musicVO.Music_Node)
    bs = BeautifulSoup(html, 'html.parser')
    tag_music_info = bs.find('div', attrs={'class': 'music_info_view'})
    if tag_music_info is not None:
        # Track summary table
        summary = tag_music_info.find('div', attrs={'class': 'music_info_cont'})
        album_tag = summary.find('table').find('a')
        if album_tag is not None:
            albumVO.Album_Node = album_tag['href'].strip(" ")
            musicVO.Album_ID = int(albumVO.Album_Node.rsplit('/', 1)[1])
            albumVO.Album_ID = int(albumVO.Album_Node.rsplit('/', 1)[1])
def crawling_track(um):
    musicVO = Music_VO()
    musicVO.Music_ID = um.END_POINT
    musicVO.Music_Node = '/'.join([um.NODE, str(um.END_POINT)])

    html = cw.crawling(url=um.URL)
    bs = BeautifulSoup(html, 'html.parser')
    tag_music_info = bs.find('div', attrs={'class': 'music_info_view'})

    # Only proceed when this is not a "no such track" page......
    if tag_music_info is not None:
        # Track summary table
        summary = tag_music_info.find('div', attrs={'class': 'music_info_cont'})
        album_tag = summary.find('table').find('a')
        if album_tag is not None:
            musicVO.Album_Node = album_tag['href'].strip(" ")
            musicVO.Album_ID = int(musicVO.Album_Node.rsplit('/', 1)[1])

        musicVO.Music_Title = tag_music_info.find('li', attrs={'class': 'top_left'})
        if musicVO.Music_Title is not None:
            musicVO.Music_Title = musicVO.Music_Title.find('p').get_text().strip()

        # Some pages nest the attribute list differently, hence the fallback
        try:
            left_attrs = summary.find('li', attrs={
                'class': 'left_con'
            }).findAll('p', attrs={'class': 'left'})
            right_attrs = summary.find('li', attrs={
                'class': 'left_con'
            }).findAll('p', attrs={'class': 'right'})
        except AttributeError:
            attrs_list = bs.find('dd', attrs={'class': 'con'})
            left_attrs = attrs_list.find('li', attrs={
                'class': 'left_con'
            }).findAll('p', attrs={'class': 'left'})
            right_attrs = attrs_list.find('li', attrs={
                'class': 'left_con'
            }).findAll('p', attrs={'class': 'right'})

        for i in range(len(left_attrs)):
            if left_attrs[i].get_text().strip() == '음악장르':
                musicVO.Genre = right_attrs[i].get_text().strip()

        line_info = bs.findAll('div', attrs={'class': 'line_info'})
        lyric = line_info[0].find('li', attrs={'id': 'lyricsText'})
        if lyric is not None:
            buffer = lyric.get_text().replace('\n', '').replace('\t', '').replace(
                '<br/>', '\n').strip()
            # Included because of track 54187... should special characters be
            # stripped entirely, or kept but limited to a small allowed set??
            if '</li>' in buffer:
                buffer = buffer.split('</li>', 1)[0]
            print('Buffer: ', buffer, len(buffer.encode('utf-8')), file=sys.stderr)
            # Store the lyrics only if they fit the DB column
            if len(buffer.encode('utf-8')) <= Music_VO.Lyrics.type.length:
                musicVO.Lyrics = buffer

        if len(line_info) > 1:
            staffs = line_info[1].findAll('ul', attrs={'class': 'con2'})
        else:
            staffs = None
        if staffs is not None:
            for staff in staffs:
                title = staff.find('li', attrs={'class': 'title'}).get_text().strip()
                if title == '작사':
                    lyricists = staff.findAll('a')
                    if len(lyricists) != 0:
                        res = ''
                        for lyricist in lyricists:
                            res = ','.join(
                                [res, lyricist['href'].strip().rsplit('/', 1)[1]])
                        musicVO.Lyricist_ID = res.split(',', 1)[1]  # drop the leading comma
                if title == '작곡':
                    composers = staff.findAll('a')
                    if len(composers) != 0:
                        res = ''
                        for composer in composers:
                            res = ','.join(
                                [res, composer['href'].strip().rsplit('/', 1)[1]])
                        musicVO.Composer_ID = res.split(',', 1)[1]  # drop the leading comma

        try:
            db_session.merge(musicVO)
            db_session.commit()
        except InternalError:
            # The lyrics may contain characters the DB rejects; strip to a
            # whitelist and retry.
            db_session.rollback()
            try:
                import re
                pattern = re.compile(
                    u'[^ ~`!@#$%^&*()_\-+={\[}\]:<.>/?\'\"\n\ta-zA-Z0-9\u3131-\u3163\uac00-\ud7a3]+'
                )  # whitelist: Hangul, keyboard punctuation, English letters, digits
                musicVO.Lyrics = re.sub(pattern, ' ', musicVO.Lyrics)
                db_session.merge(musicVO)
                db_session.commit()
                cw_log({musicVO.Music_ID: 'SUCCESS[RE.Compile] - Lyrics'})
            except Exception:
                # Give up on the lyrics entirely
                print("full rollback", file=sys.stderr)
                db_session.rollback()
                musicVO.Lyrics = None
                db_session.merge(musicVO)
                db_session.commit()
                cw_log({musicVO.Music_ID: 'FAILURE - Lyrics'})
        print('Stored lyrics: ', musicVO.Lyrics, file=sys.stderr)
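# For reference, a minimal driver sketch for crawling_track. It assumes UrlMaker
# instances expose the NODE / END_POINT / URL attributes read above and that they
# can simply be assigned; the real constructor may differ, so treat this as
# hypothetical glue code.
for track_id in range(33801010, 33801015):
    um = UrlMaker()
    um.NODE = '/track'
    um.END_POINT = track_id
    um.URL = 'http://www.mnet.com/track/{0}'.format(track_id)
    crawling_track(um)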