# The helpers below assume these imports in their home modules (`debug`,
# `http`, and `music_reporter` are project-local modules):
#   import csv, json, os, sys, time, urllib.parse, urllib.request
#   import requests
#   from bs4 import BeautifulSoup
#   from datetime import date, timedelta
#   from pytube import YouTube
#   from tqdm import tqdm

def dump_csv(d, fn, headers=None):
    debug.log("writing csv to file: " + fn)
    # newline='' stops the csv module from writing blank rows on Windows.
    with open(fn, "w", newline='') as file:
        writer = csv.writer(file, delimiter=',')
        if headers:
            writer.writerow(headers)
        for k, v in tqdm(d.items()):
            writer.writerow([k, v])

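# A minimal usage sketch for dump_csv (the dict, filename, and headers are
# made-up illustrations, not values from this project):
#
#     counts = {'IU': 12, 'BTS': 9}
#     dump_csv(counts, 'artist_counts.csv', headers=['artist', 'plays'])
#
# Each dict entry becomes one two-column row; tqdm draws a progress bar
# while the rows are written.
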
def find_youtube_by_api(query, youtube_api_key):
    base_url = "https://www.googleapis.com/youtube/v3/search?"
    req_url = (base_url + "part=snippet&q=" + urllib.parse.quote(query)
               + "&key=" + youtube_api_key + "&maxResults=10")
    debug.log(req_url)
    json_string = requests.get(req_url).text
    data_list = json.loads(json_string)['items']
    watch_urls = []
    videoIds = []
    for content in data_list:
        # Skip channel and playlist results; keep only plain videos.
        if content['id']['kind'] == 'youtube#video':
            videoId = content['id']['videoId']
            videoIds.append(videoId)
            watch_urls.append('https://www.youtube.com/watch?v=' + videoId)
    return watch_urls, videoIds

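# A hedged usage sketch for find_youtube_by_api ('YOUR_API_KEY' and the query
# are placeholders, not real values from this project):
#
#     urls, ids = find_youtube_by_api('IU eight audio', 'YOUR_API_KEY')
#     if urls:
#         debug.log('top hit: {} (videoId={})'.format(urls[0], ids[0]))
#
# The request asks for up to 10 results, but channel and playlist hits are
# filtered out, so both returned lists can be shorter than that.
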
def getSongInfoOfMelon(music_record):
    soupArtist = music_record.find('div', {'class': 'ellipsis rank02'})
    soupTitle = music_record.find('div', {'class': 'ellipsis rank01'})
    soupSongInfo = music_record.find('a', {'class': 'btn btn_icon_detail'})
    soupAlbumInfo = music_record.find('div', {'class': 'ellipsis rank03'})

    # A track can credit several artists; join them with commas.
    artist = ','.join(
        art.contents[0]
        for art in soupArtist.find('span', {'class': 'checkEllipsis'}).find_all('a'))

    # Grayed-out (unavailable) titles are rendered as a <span>, not an <a>.
    if soupTitle.find('a') is None:
        title = soupTitle.find('span', {'class': 'fc_lgray'}).contents[0]
    else:
        title = soupTitle.find('a').contents[0]

    # The song and album IDs are embedded inside javascript: links.
    songID = soupSongInfo['href'].replace(
        "javascript:melon.link.goSongDetail('", '').replace("');", '')
    albumID = soupAlbumInfo.find('a')['href'].replace(
        "javascript:melon.link.goAlbumDetail('", '').replace("');", '')
    debug.log('parsed the music detail (artist: {}, title: {}, songID: {}, albumID: {})'.format(
        artist, title, songID, albumID))

    image = music_record.find('img')
    coverImageURL = image['src'].split('.jpg')[0] + '.jpg'
    coverImgFile = downloadImageFromMelon(coverImageURL, songID)
    lyric = getLyricFromMelon(songID)
    return artist, title, songID, coverImgFile, lyric, albumID

def download_audio_from_youtube(url, output_dir, strQuery, music_reporter):
    debug.log("'{}' is downloading from '{}'...".format(strQuery, url))
    if music_reporter is not None:
        music_reporter.updateMusic(strQuery, url)
    yt = YouTube(url)
    filename = convertQueryToFilename(strQuery)
    # Prefer audio-only streams; fall back to any stream if none exist.
    audio_list = yt.streams.filter(only_audio=True).all()
    if not audio_list:
        audio_list = yt.streams.filter().all()
    for stream in audio_list:
        # Download the first mp4-compatible stream and stop.
        if 'mp4' in stream.mime_type:
            stream.download(output_dir, filename)
            break
    return filename

def find_youtube(query):
    base_url = 'https://www.youtube.co.kr'
    req_url = base_url + '/results?search_query=' + urllib.parse.quote(query)
    debug.log(req_url)
    response = http.getHTMLDocument(req_url)
    soup = BeautifulSoup(response, "html.parser")
    watch_urls = []
    for link in soup.find_all('h3', {'class': 'yt-lockup-title'}):
        watch_urls.append(base_url + link.find('a').attrs['href'])
    return watch_urls

def getSearchList(artist, title):
    query = 'q={}+-+{}'.format(urllib.parse.quote(artist), urllib.parse.quote(title))
    url = 'http://www.melon.com/search/total/index.htm?{}'.format(query)
    debug.log("send the request to melon: [{}]".format(url))
    content = http.getHTMLDocument(url)
    soup = BeautifulSoup(content, "html.parser")
    if soup.find('div', {'class': 'section_no_data'}) is not None:
        print("There is no data ({}-{} couldn't be found in Melon.)".format(artist, title))
        return None

    song_table = soup.find_all('div', {'class': 'tb_list d_song_list songTypeOne'})[-1]
    song_list = song_table.find_all('tr')
    music_list = []
    for song_record in song_list:
        soupTitle = song_record.find('a', {'class': 'fc_gray'})
        if soupTitle is not None:
            artist, title, songID, albumInfo = getSongInfoFromMelonOfSearch(song_record)
            music_list.append({'artist': artist, 'title': title,
                               'songID': songID, 'albumInfo': albumInfo})

    count = 0
    for item in music_list:
        count += 1
        print('[{}] artist: {} | title: {} | album: {}'.format(
            count, item['artist'], item['title'], item['albumInfo']))

    # Prompt until the user picks a valid entry, or 0 to quit.
    selected_num = -1
    while selected_num < 0 or selected_num > count:
        try:
            selected_num = int(input(
                "Please choose the number(1<=NUM<={}) of music to download "
                "(input '0' if you want to exit): ".format(count)))
        except ValueError:
            selected_num = -1
            continue
        if selected_num == 0:
            return None
        if selected_num < 0 or selected_num > count:
            print('Input number is out of range (0<=NUM<={}). Try to input again.'.format(count))

    print("\n[{}]({}-{}<{}>) is selected.".format(
        selected_num,
        music_list[selected_num - 1]['artist'],
        music_list[selected_num - 1]['title'],
        music_list[selected_num - 1]['albumInfo']))
    return music_list[selected_num - 1]['songID']

def getAlbumInfoFromMelon(melon_albumID):
    if melon_albumID is None:
        return None
    base_url = 'http://www.melon.com/album/detail.htm?albumId='
    url = base_url + melon_albumID
    content = http.getHTMLDocument(url)
    soup = BeautifulSoup(content, "html.parser")
    soupAlbumName = soup.find('div', {'class': 'song_name'})
    # The album title is the last text node, padded with layout whitespace.
    albumName = soupAlbumName.contents[-1].strip()
    debug.log("Getting album information of '{}'(id:{})".format(albumName, melon_albumID))
    info = soup.find('dl', {'class': 'list'})
    if info is None:
        return None
    dd_list = info.find_all('dd')
    pub_date = dd_list[0].contents[0]
    genre = dd_list[1].contents[0]
    publisher = dd_list[2].contents[0]
    copyright = dd_list[3].contents[0]
    return {'album_name': albumName, 'pub_date': pub_date, 'genre': genre,
            'publisher': publisher, 'copyright': copyright}

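# Usage sketch for getAlbumInfoFromMelon (the album ID below is illustrative,
# not a verified Melon ID):
#
#     info = getAlbumInfoFromMelon('10094757')
#     if info:
#         debug.log('{} / {} / {}'.format(
#             info['album_name'], info['pub_date'], info['genre']))
#
# Returns None when the ID is None or the detail page has no <dl class="list">.
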
def getHTMLDocument(url, autoRetry=True):
    listAgent = [
        'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
    ]
    # Back-off schedule (seconds) used when the server resets the connection.
    retryDelay = [0.1, 0.5, 1, 2, 5, 10, 30, 60, 180, 300]
    conLoop = True
    agentCount = 0
    delayCount = 0
    while conLoop:
        opener = urllib.request.build_opener()
        opener.addheaders = [
            ('Host', 'www.melon.com'),
            ('Connection', 'Keep-Alive'),
            ('User-Agent', listAgent[agentCount]),
            ('X-Requested-With', 'XMLHttpRequest'),
            ('Accept', '*/*'),
            ('Accept-Language', 'en-US,en;q=0.9,ko-KR;q=0.8,ko;q=0.7'),
            ("Content-Type", "application/x-www-form-urlencoded;charset=utf-8"),
            ('Cookie', 'SCOUTER=z39vkmg7pd9j91; PCID=15250529259645155634188; '
                       'WMONID=GxTfcjDmib7; POC=WP10'),
        ]
        try:
            html = opener.open(url)
        except ConnectionResetError:
            if autoRetry:
                debug.log("Connection denied from '{}'".format(url))
                debug.log('Try again using another header after {}sec...'.format(
                    retryDelay[delayCount]))
                agentCount = (agentCount + 1) % len(listAgent)
                time.sleep(retryDelay[delayCount])
                delayCount = (delayCount + 1) % len(retryDelay)
            else:
                debug.log("Document couldn't be fetched from {}".format(url))
                return None
        else:
            conLoop = False
    return html.read()

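# Usage sketch for getHTMLDocument. With autoRetry=True (the default) the
# helper keeps retrying on ConnectionResetError, sleeping according to the
# retryDelay schedule and cycling the User-Agent list; with autoRetry=False
# it returns None on the first failure, so callers should check for that:
#
#     html = getHTMLDocument('http://www.melon.com/chart/index.htm', autoRetry=False)
#     if html is None:
#         debug.log('Melon refused the request; giving up.')
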
def print(*msgs, end='\n', indent='default', flush=True):
    # Deliberately shadows the builtin print inside this module so that all
    # console output is routed through the shared log() helper.
    log(*msgs, sep=' ', end=end, indent=indent, out=sys.stdout, debug=False)
    if flush:
        sys.stdout.flush()

# NOTE: this Django management command is Python 2 code (it relies on the
# `unicode` builtin and `urllib2`), unlike the Python 3 helpers above.
def handle(self, institution_name=YOUR_INSTITUTION, **options):
    verbosity = int(options.get('verbosity', 0))
    if verbosity > 1:
        debug.DEBUG = True

    # Create an error log.
    debug.errorlog_start('scan_itunes')

    # Some basic error checking.
    if institution_name is None:
        debug.errorlog("Please specify the institution to scan.", display=True)
        return False
    try:
        mode = int(options.get("mode", 1))
    except ValueError:
        mode = 0  # Forces the range check below to fail.
    if mode < 1 or mode > 4:
        debug.errorlog("""Please specify a valid mode for this scan.
        1) Scan an institution's collection
        2) Scan the Top Collections chart
        3) Scan the Top Downloads chart
        4) Scan the list of institutions
        """, display=True)
        return False

    scantime = datetime.datetime.now(pytz.utc)
    print("Scan iTunes started at " + str(scantime) + "\n")
    scanlog = ItuScanLog(mode=mode, time=scantime, comments="")
    scanlog.save()

    if mode == 1:
        try:
            institution = ItuInstitution.objects.filter(name__iexact=institution_name)[0]
        except IndexError:
            debug.errorlog(institution_name + u" is not a recognised institution.", display=True)
            scanlog.delete()
            return False
        scanlog.institution = institution
        scanlog.save()
        comment = u"Scan (and update) of " + institution_name + u"'s collection from %s" % institution.url
        debug.log(u"Log started for: %s" % unicode(comment), display=True)
        print(comment)
        print("Getting information about collections...")
        collections = itunes.get_institution_collections(institution, hurry=True)
        print("Processing collection information and scanning individual items...")
        collections_spotted = []
        items_spotted = []
        for collection_itunes in collections:
            if collection_itunes:
                # Check if this collection's genre exists - if not, create it.
                genre = ItuGenre(name=collection_itunes['genre'],
                                 itu_id=int(collection_itunes['genre_id']),
                                 url=collection_itunes['genre_url'])
                genre_exists = False
                for saved_genre in ItuGenre.objects.all():
                    if (int(genre.itu_id) == int(saved_genre.itu_id) and
                            genre.name == saved_genre.name and
                            genre.url == saved_genre.url):
                        genre_exists = True
                        genre = saved_genre
                if not genre_exists:
                    debug.log(u'Created new genre ' + unicode(genre.name), display=True)
                    genre.save()

                collection_record_absolute = ItuCollection(institution=institution)
                if collection_itunes['last modified']:
                    last_modified = parse(collection_itunes['last modified']).date()
                else:
                    last_modified = None
                collection_record_historical = ItuCollectionHistorical(
                    name=collection_itunes['series'],
                    itu_id=int(collection_itunes['series_id']),
                    img170=collection_itunes['series_img_170'],
                    url=collection_itunes['series_url'],
                    language=collection_itunes['language'],
                    last_modified=last_modified,
                    contains_movies=collection_itunes['contains_movies'],
                    missing=None,
                    version=1,
                    institution=institution,
                    scanlog=scanlog,
                    genre=genre,
                    previous=None,
                    itucollection=collection_record_absolute)
                # Encode the rating histogram as a single comparable number:
                # one power-of-ten digit per star level, with the counts
                # squashed into the fractional part.
                rating_checksum = 0
                for rating in collection_itunes['ratings']:
                    rating_checksum += pow(10, rating['stars']) + (rating['count'] / 1000000000)

                # Put together a list of saved collection_record_historicals
                # that look like they're the same as ours, really.
                similar_collection_records_historical = []
                collection_record_historical_exists = False
                for collection_record_historical_saved in ItuCollectionHistorical.objects.filter(
                        (Q(name=collection_record_historical.name) &
                         Q(contains_movies=collection_record_historical.contains_movies)) |  # name AND Video/Audio
                        Q(itu_id=collection_record_historical.itu_id) |
                        Q(url=collection_record_historical.url)):
                    if collection_record_historical.url != collection_record_historical_saved.url:
                        # Don't count the saved record as similar if the URLs
                        # are different but both are still accessible.
                        try:
                            urllib2.urlopen(collection_record_historical.url)
                            urllib2.urlopen(collection_record_historical_saved.url)
                        except urllib2.URLError:
                            similar_collection_records_historical.append(collection_record_historical_saved)
                    elif (collection_record_historical.name == collection_record_historical_saved.name and
                          collection_record_historical.contains_movies == collection_record_historical_saved.contains_movies and
                          int(collection_record_historical.itu_id) == int(collection_record_historical_saved.itu_id) and
                          collection_record_historical.url == collection_record_historical_saved.url and
                          collection_record_historical.img170 == collection_record_historical_saved.img170 and
                          collection_record_historical.language == collection_record_historical_saved.language and
                          rating_checksum == collection_record_historical_saved.rating_checksum()):
                        collection_record_historical_exists = True
                        collection_record_historical = collection_record_historical_saved
                    else:
                        similar_collection_records_historical.append(collection_record_historical_saved)

                if not collection_record_historical_exists:
                    if similar_collection_records_historical:
                        # Chain the new record onto the latest version of the
                        # most similar existing record.
                        similar_collection_records_historical.sort(
                            key=lambda this_record: this_record.version)
                        latest_similar = similar_collection_records_historical[-1]
                        collection_record_historical.previous = latest_similar
                        collection_record_historical.version = latest_similar.version + 1
                        collection_record_historical.itucollection = latest_similar.itucollection
                    else:
                        collection_record_absolute.save()
                        collection_record_historical.itucollection = collection_record_absolute
                    debug.log(u'Created new historical collection record for ' +
                              unicode(collection_record_historical.name) + u', version ' +
                              unicode(collection_record_historical.version), display=True)
                    collection_record_historical.save()
                    for r in collection_itunes['ratings']:
                        try:
                            rating = ItuRating(stars=r['stars'], count=r['count'],
                                               itucollectionhistorical=collection_record_historical)
                            rating.save()
                        except:
                            debug.log(u'WARNING: Failed to save rating.', display=True)
                    for comment in collection_itunes['comments']:
                        if comment and len(ItuComment.objects.filter(detail=comment['detail'])) == 0:
                            try:
                                new_comment = ItuComment(
                                    itucollectionhistorical=collection_record_historical,
                                    stars=comment['rating'],
                                    date=comment['date'],
                                    detail=comment['detail'],
                                    source=comment['source'],
                                    ituinstitution=institution)
                                new_comment.save()
                                debug.log(u'Saved new comment by ' + unicode(new_comment.source) +
                                          u': "' + unicode(new_comment.detail) + u'".', display=True)
                            except:
                                debug.log(u'WARNING: Failed to save comment.', display=True)
                collections_spotted.append(collection_record_historical)

                # Acquire the list of items for this collection.
                try:
                    items = itunes.get_collection_items(collection_record_historical.url, hurry=True)
                except:
                    debug.errorlog('Could not get items for collection ' +
                                   collection_record_historical.name + '.', display=True)
                    items = []
                for item in items:
                    # The dictionary is blank if we failed to retrieve data on
                    # an item. If so, don't do anything with it.
                    if item:
                        item_record_absolute = ItuItem(institution=institution)
                        try:
                            # Deal with things with no duration (like PDFs...).
                            if 'duration' in item.keys():
                                item['duration'] = int(item['duration'])
                            else:
                                item['duration'] = None
                            if 'songName' not in item.keys():
                                item['songName'] = item['playlistName'] + ' ' + str(item['rank']) + ' {UNKNOWN NAME}'
                            item_record_historical = ItuItemHistorical(
                                name=item['songName'],
                                itu_id=item['itemId'],
                                url=item['url'],
                                artist_name=item['artistName'],
                                description=item['description'],
                                duration=item['duration'],
                                explicit=bool(item['explicit']),
                                feed_url=item['feedURL'],
                                file_extension=item['fileExtension'],
                                kind=item['kind'],
                                long_description=item['longDescription'],
                                playlist_id=int(item['playlistId']),
                                playlist_name=item['playlistName'],
                                popularity=float(item['popularity']),
                                preview_length=int(item['previewLength']),
                                preview_url=item['previewURL'],
                                rank=int(item['rank']),
                                release_date=pytz.utc.localize(parse(item['releaseDate'], ignoretz=True)),
                                missing=None,
                                version=1,
                                previous=None,
                                ituitem=item_record_absolute,
                                institution=institution,
                                genre=genre,
                                scanlog=scanlog,
                                series=collection_record_historical)
                        except KeyError:
                            # See if we've got data from a last-ditch attempt
                            # at downloading it instead.
                            try:
                                duration = 0
                                feedurl = ""
                                # offerkey is something like 'standard-audio'. This code works
                                # on the assumption that, whatever the key, we want all the
                                # items in its list.
                                for offerkey in item['store-offers'].keys():
                                    try:
                                        duration = item['store-offers'][offerkey]['duration']
                                    except KeyError:
                                        duration = None
                                    feedurl = item['store-offers'][offerkey]['asset-url']
                                item_record_historical = ItuItemHistorical(
                                    name=item['title'],
                                    itu_id=item['item-id'],
                                    url=item['url'],
                                    artist_name=item['artist-name'],
                                    description=item['description'],
                                    duration=duration,
                                    explicit=False,
                                    feed_url=feedurl,
                                    file_extension=feedurl.split('.')[-1],
                                    kind='unknown',
                                    long_description=item['long-description'],
                                    playlist_id=collection_record_historical.id,
                                    playlist_name=collection_record_historical.name,
                                    popularity=0.0,
                                    preview_length=0,
                                    preview_url='unknown',
                                    rank=int(item['track-number']),
                                    release_date=item['release-date'],
                                    missing=None,
                                    version=1,
                                    previous=None,
                                    ituitem=item_record_absolute,
                                    institution=institution,
                                    genre=genre,
                                    scanlog=scanlog,
                                    series=collection_record_historical)
                            except KeyError:
                                debug.errorlog(u'Missing key when trying to create an ItuItemHistorical. item=' +
                                               unicode(item), display=True)
                        except:
                            debug.errorlog(u'Failed to process ItuItemHistorical.', display=True)

                        try:
                            # We can't afford this bit to die in the middle of the night.
                            # Put together a list of saved item_record_historicals that
                            # look like they're the same as ours, really.
                            similar_item_record_historicals = []
                            item_record_historical_exists = False
                            for saved_item_record_historical in ItuItemHistorical.objects.filter(
                                    Q(series__itucollection=collection_record_historical.itucollection) &
                                    (Q(name=item_record_historical.name) |
                                     Q(itu_id=item_record_historical.itu_id) |
                                     Q(url=item_record_historical.url)) &
                                    Q(file_extension=item_record_historical.file_extension)):  # name AND Video/Audio
                                if item_record_historical.url != saved_item_record_historical.url:
                                    # Don't count the saved record as similar if the URLs
                                    # are different but both are still accessible.
                                    try:
                                        urllib2.urlopen(item_record_historical.url)
                                        urllib2.urlopen(saved_item_record_historical.url)
                                    except urllib2.URLError:
                                        similar_item_record_historicals.append(saved_item_record_historical)
                                else:
                                    if (item_record_historical.name == saved_item_record_historical.name and
                                            item_record_historical.itu_id == saved_item_record_historical.itu_id and
                                            item_record_historical.url == saved_item_record_historical.url and
                                            item_record_historical.artist_name == saved_item_record_historical.artist_name and
                                            item_record_historical.description == saved_item_record_historical.description and
                                            item_record_historical.duration == saved_item_record_historical.duration and
                                            item_record_historical.explicit == saved_item_record_historical.explicit and
                                            item_record_historical.feed_url == saved_item_record_historical.feed_url and
                                            item_record_historical.file_extension == saved_item_record_historical.file_extension and
                                            item_record_historical.kind == saved_item_record_historical.kind and
                                            item_record_historical.long_description == saved_item_record_historical.long_description and
                                            item_record_historical.playlist_id == saved_item_record_historical.playlist_id and
                                            item_record_historical.playlist_name == saved_item_record_historical.playlist_name and
                                            item_record_historical.popularity == saved_item_record_historical.popularity and
                                            item_record_historical.preview_length == saved_item_record_historical.preview_length and
                                            item_record_historical.preview_url == saved_item_record_historical.preview_url and
                                            item_record_historical.rank == saved_item_record_historical.rank and
                                            item_record_historical.release_date == saved_item_record_historical.release_date):
                                        item_record_historical_exists = True
                                        item_record_historical = saved_item_record_historical
                                    else:
                                        similar_item_record_historicals.append(saved_item_record_historical)
                            if not item_record_historical_exists:
                                if similar_item_record_historicals:
                                    similar_item_record_historicals.sort(
                                        key=lambda this_record: this_record.version)
                                    latest_similar = similar_item_record_historicals[-1]
                                    item_record_historical.previous = latest_similar
                                    item_record_historical.version = latest_similar.version + 1
                                    item_record_historical.ituitem = latest_similar.ituitem
                                else:
                                    item_record_absolute.save()
                                    item_record_historical.ituitem = item_record_absolute
                                debug.log(u'Created new historical item record for ' +
                                          unicode(item_record_historical.name) + u', version ' +
                                          unicode(item_record_historical.version), display=True)
                                item_record_historical.save()
                            items_spotted.append(item_record_historical)
                        except:
                            debug.errorlog(u'Failed to process potential historical item record.', display=True)
                    else:
                        debug.log(u'WARNING: Blank item - perhaps we couldn\'t download the appropriate page?',
                                  display=True)
            else:
                debug.log(u'WARNING: Blank category - perhaps we couldn\'t download the appropriate page?',
                          display=True)

        print(u"Checking whether anything has gone missing or reappeared...")
        if collections:
            counter = 0
            for historical_collection_record in ItuCollectionHistorical.objects.filter(
                    Q(institution=institution) & Q(itucollection__latest=F('id'))):
                if (historical_collection_record not in collections_spotted and
                        historical_collection_record.missing is None):
                    debug.log(unicode(historical_collection_record.name) +
                              u" appears to have gone missing! We last saw it at " +
                              unicode(historical_collection_record.scanlog.time), display=True)
                    historical_collection_record.missing = scanlog
                    historical_collection_record.save()
                elif historical_collection_record in collections_spotted and historical_collection_record.missing:
                    debug.log(unicode(historical_collection_record.name) +
                              u" has reappeared! It went missing at " +
                              unicode(historical_collection_record.missing.time), display=True)
                    historical_collection_record.missing = None
                    historical_collection_record.save()
                counter += 1
                if counter % 100 == 0:
                    print(u'Still checking... (at object ' + unicode(counter) + u')')
            for historical_item_record in ItuItemHistorical.objects.filter(
                    Q(institution=institution) & Q(ituitem__latest=F('id'))):
                if historical_item_record not in items_spotted and historical_item_record.missing is None:
                    debug.log(unicode(historical_item_record.name) +
                              u" appears to have gone missing! We last saw it at " +
                              unicode(historical_item_record.scanlog.time), display=True)
                    historical_item_record.missing = scanlog
                    historical_item_record.save()
                elif historical_item_record in items_spotted and historical_item_record.missing:
                    debug.log(unicode(historical_item_record.name) +
                              u" has reappeared! It went missing at " +
                              unicode(historical_item_record.missing.time), display=True)
                    historical_item_record.missing = None
                    historical_item_record.save()
                counter += 1
                if counter % 100 == 0:
                    print(u'Still checking... (at object ' + unicode(counter) + u')')
        else:
            debug.log(u"WARNING: No collections found. Perhaps you scanned an institution that only publishes courses?",
                      display=True)

    elif mode == 2:
        comment = u"Scan of the Top Collections Chart..."
        debug.log(u"Log started for: %s" % unicode(comment), display=True)
        updated_institutions = False
        collections = itunes.get_topcollections()
        for collection in collections:
            if collection:
                try:
                    historical_collections = ItuCollectionHistorical.objects.filter(url=collection['series_url'])
                    if not historical_collections:
                        debug.log(u'WARNING: Couldn\'t find an historical record of collection at ' +
                                  unicode(collection['series_url']) +
                                  u'. Attempting an historical scan of ' +
                                  unicode(collection['institution']) + u' first...', display=True)
                        if not updated_institutions:
                            management.call_command('scan_itunes', mode=4)
                            updated_institutions = True
                        try:
                            management.call_command('scan_itunes', collection['institution'], mode=1)
                        except:
                            # Deal with institutions which aren't listed by Apple.
                            try:
                                institution = ItuInstitution(name=collection['institution'],
                                                             itu_id=int(collection['institution_id']),
                                                             url=collection['institution_url'])
                                institution.save()
                                management.call_command('scan_itunes', collection['institution'], mode=1)
                            except:
                                debug.errorlog('Failed to scan institution ' + collection['institution'] +
                                               '. Perhaps this institution isn\'t listed by Apple?',
                                               display=True)
                        historical_collections = ItuCollectionHistorical.objects.filter(url=collection['series_url'])
                    if historical_collections.exists():
                        historical_collection = historical_collections[0].latest()
                        debug.log(u'Creating new chart row: ' + unicode(historical_collection.name) +
                                  u' Position: ' + unicode(collection['chart_position']), display=True)
                        chartrow = ItuCollectionChartScan(position=int(collection['chart_position']),
                                                          itucollection=historical_collection.itucollection,
                                                          itucollectionhistorical=historical_collection,
                                                          scanlog=scanlog,
                                                          date=scanlog.time)
                        chartrow.save()
                    else:
                        debug.errorlog(u'Couldn\'t find an historical record of collection at ' +
                                       unicode(collection['series_url']) +
                                       u' despite updating the database.', display=True)
                except KeyError:
                    debug.errorlog('WARNING: Couldn\'t access collection (KeyError):' + str(collection),
                                   display=True)

    elif mode == 3:
        comment = u"Scan of the Top Downloads Chart..."
        debug.log(u"Log started for: %s" % unicode(comment), display=True)
        updated_institutions = False
        items = itunes.get_topdownloads()
        for item in items:
            if item:
                try:
                    historical_items = ItuItemHistorical.objects.filter(name=item['item'])
                    if not historical_items:
                        debug.log(u'WARNING: Couldn\'t find an historical record of item at ' +
                                  unicode(item['item_url']) +
                                  u'. Attempting an historical scan of ' +
                                  unicode(item['institution']) + u' first...', display=True)
                        if not updated_institutions:
                            management.call_command('scan_itunes', mode=4)
                            updated_institutions = True
                        try:
                            management.call_command('scan_itunes', item['institution'], mode=1)
                        except:
                            # Deal with institutions which aren't listed by Apple.
                            try:
                                institution = ItuInstitution(name=item['institution'],
                                                             itu_id=int(item['institution_id']),
                                                             url=item['institution_url'])
                                institution.save()
                                management.call_command('scan_itunes', item['institution'], mode=1)
                            except:
                                debug.errorlog('Failed to scan institution ' + item['institution'] +
                                               '. This is a bug.', display=True)
                        historical_items = ItuItemHistorical.objects.filter(name=item['item'])
                    if historical_items.exists():
                        historical_item = historical_items[0].latest()
                        debug.log(u'Created new download chart row: ' + unicode(historical_item.name) +
                                  u' Position: ' + unicode(item['chart_position']), display=True)
                        chartrow = ItuItemChartScan(position=int(item['chart_position']),
                                                    ituitem=historical_item.ituitem,
                                                    ituitemhistorical=historical_item,
                                                    scanlog=scanlog,
                                                    date=scanlog.time)
                        chartrow.save()
                    else:
                        debug.errorlog(u'Couldn\'t find an historical record of item at ' +
                                       unicode(item['item_url']) +
                                       u' despite updating the database.', display=True)
                except KeyError:
                    debug.errorlog('WARNING: Couldn\'t access item (KeyError):' + str(item), display=True)

    elif mode == 4:
        comment = "Scan of list of institutions..."
        debug.log(u"Log started for: %s" % unicode(comment))
        print(comment)
        institutions = itunes.get_institutions()
        for institution_itunes in institutions:
            if institution_itunes:
                institution = ItuInstitution(name=institution_itunes['text'],
                                             itu_id=int(institution_itunes['itu_id']),
                                             url=institution_itunes['url'])
                need_update = False
                need_create = True
                for saved_institution in ItuInstitution.objects.filter(
                        Q(itu_id=institution.itu_id) | Q(name=institution.name) | Q(url=institution.url)):
                    if (saved_institution.itu_id == institution.itu_id and
                            saved_institution.name == institution.name and
                            saved_institution.url == institution.url):
                        need_update = False
                        need_create = False
                    else:
                        need_update = True
                        need_create = False
                        saved_institution.itu_id = institution.itu_id
                        saved_institution.name = institution.name
                        saved_institution.url = institution.url
                        institution = saved_institution
                if need_update:
                    debug.log(u'Updated institution ' + unicode(institution.name), display=True)
                    institution.save()
                elif need_create:
                    debug.log(u'Created new institution ' + unicode(institution.name), display=True)
                    institution.save()

    else:
        debug.errorlog(u"We shouldn't ever get this scan...", display=True)

    print("\nScan iTunes finished at " + str(datetime.datetime.now(pytz.utc)))
    # Write the error cache to disk.
    debug.errorlog_save()
    debug.errorlog_stop()
    scanlog.complete = True
    scanlog.save()
    return None

def getMelonChart(maxRank=50, period_type='weekly', str_target_date=None):
    period_url = {'weekly': 'week', 'monthly': 'month', 'yearly': '', 'decennial': ''}
    # Melon serves at most 50 entries per chart page.
    if maxRank < 1:
        maxRank = 1
    elif maxRank > 50:
        maxRank = 50
    if str_target_date is None or str_target_date == '':
        if period_type == 'weekly':
            # Default to the last completed week.
            str_target_date = (date.today() - timedelta(days=date.today().isoweekday())).strftime('%Y%m%d')
        else:
            str_target_date = date.today().strftime('%Y%m%d')
    debug.log('target date={}'.format(str_target_date))
    target_date = date(int(str_target_date[0:4]), int(str_target_date[4:6]), int(str_target_date[6:8]))
    # Melon's chart history starts on 1990-01-07; clamp the requested date.
    if target_date < date(1990, 1, 7):
        target_date = date(1990, 1, 7)
    elif target_date > date.today():
        target_date = date.today()

    if period_type == 'weekly':
        strTimeFormat = '%Y%m%d'
        # Chart weeks started on Sunday in some eras and on Monday in others;
        # align the start/end days with the convention of the target week.
        if isWeekStartedFromSunday(target_date):
            startDay = target_date - timedelta(days=target_date.isoweekday() % 7)
        else:
            startDay = target_date - timedelta(days=target_date.weekday())
        endDay = startDay + timedelta(days=6)
        if not isWeekStartedFromSunday(startDay) and isWeekStartedFromSunday(endDay):
            endDay = endDay - timedelta(days=1)
        if isWeekStartedFromSunday(startDay) and not isWeekStartedFromSunday(endDay):
            target_date = target_date - timedelta(days=1)
            startDay = target_date - timedelta(days=target_date.isoweekday() % 7)
            endDay = startDay + timedelta(days=6)
        # The class code changed as Melon reorganised its charts over the years.
        if target_date.year < 2017:
            if target_date < date(2009, 11, 1):
                if target_date < date(2004, 11, 22):
                    classCd = 'KPOP'
                else:
                    classCd = 'CL0000'
            else:
                classCd = 'DP0000'
        else:
            classCd = 'GN0000'
        url_param = 'chartType=WE&classCd={}&startDay={}&endDay={}'.format(
            classCd, startDay.strftime(strTimeFormat), endDay.strftime(strTimeFormat))
        period_str = '{}-{}'.format(startDay.strftime('%Y.%m.%d'), endDay.strftime('%Y.%m.%d'))
    elif period_type == 'monthly':
        today = date.today()
        # The chart for the current month isn't complete yet; use last month.
        if target_date.year == today.year and target_date.month == today.month:
            target_date = (target_date.replace(day=1) - timedelta(days=1)).replace(day=1)
        rankYear = target_date.strftime('%Y')
        rankMonth = target_date.strftime('%m')
        if target_date.year < 2017:
            if target_date < date(2004, 11, 1):
                classCd = 'KPOP'
            else:
                classCd = 'DP0000'
        else:
            classCd = 'GN0000'
        url_param = 'chartType=MO&year={}&mon={}&classCd={}'.format(rankYear, rankMonth, classCd)
        period_str = target_date.strftime('%Y.%m')
    elif period_type == 'yearly':
        today = date.today()
        # The chart for the current year isn't complete yet; use last year.
        if target_date.year >= today.year:
            target_date = target_date.replace(year=today.year - 1, month=1, day=1)
        rankYear = target_date.strftime('%Y')
        classCd = 'KPOP'
        url_param = 'chartType=YE&year={}&classCd={}'.format(rankYear, classCd)
        period_str = target_date.strftime('%Y')
    else:  # decennial
        today = date.today()
        # Round both dates down to the start of their decade.
        today = today.replace(year=(today.year // 10) * 10, month=1, day=1)
        target_date = target_date.replace(year=(target_date.year // 10) * 10, month=1, day=1)
        if target_date >= today:
            target_date = target_date.replace(year=today.year - 10, month=1, day=1)
        rankYear = target_date.strftime('%Y')
        classCd = 'KPOP'
        url_param = 'chartType=AG&age={}&classCd={}'.format(rankYear, classCd)
        period_str = '{}s'.format(target_date.strftime('%Y'))

    url = "http://www.melon.com/chart/search/list.htm?{}&moved=Y".format(url_param)
    debug.log("Request chart to melon by query < {} >".format(url))
    content = http.getHTMLDocument(url)
    soup = BeautifulSoup(content, "html.parser")
    chart_name = 'melon_{}_'.format(period_type) + period_str
    debug.log(chart_name)
    table = soup.find('tbody', {'id': 'chartListObj'})
    debug.log('')
    count = 1
    chart_list = []
    for music in table.find_all('tr', {'class': 'lst50'}):
        if count > maxRank:
            break
        links = music.find_all('a')
        if len(links) > 3:
            artist, title, songID, coverImgFile, lyric, albumID = getSongInfoOfMelon(music)
            debug.log('{:02}. {} - {} (id:{}, {})'.format(count, artist, title, songID, coverImgFile))
            chart_list.append({'rank': count, 'artist': artist, 'title': title,
                               'songID': songID, 'albumID': albumID, 'lyric': lyric})
            count += 1
    return chart_name, chart_list

if __name__ == '__main__':
    # lyric, artist, title, albumID, imgUrl = getSongInfobySongIDOfMelon('30989550')
    # chart_name, chart_list = getMelonChart()
    # chart_name, chart_list = getMelonChart(period_type='weekly', str_target_date='20041120')
    chart_name, chart_list = getMelonChart(period_type='monthly', str_target_date='19901001')
    for song in chart_list:
        debug.log(song)

def getSongFromYouTube(artist, title, songID, lyric, albumID, baseMusicDir,
                       baseImageDir, youtube_api_key, isOverwriteMode=False,
                       music_reporter=None):
    audio_name = '{}-{}'.format(artist, title)
    query = '{} audio'.format(audio_name)

    # Build <baseMusicDir>/<artist>/<albumID>/ and check whether the mp3
    # file already exists there.
    if not os.path.exists(baseMusicDir):
        os.mkdir(baseMusicDir)
    filename = convertQueryToFilename(audio_name)
    mp3_parent = os.path.join(baseMusicDir, convertQueryToFilename(artist))
    if not os.path.exists(mp3_parent):
        os.mkdir(mp3_parent)
    mp3_dir = os.path.join(convertQueryToFilename(artist), albumID)
    if not os.path.exists(os.path.join(baseMusicDir, mp3_dir)):
        os.mkdir(os.path.join(baseMusicDir, mp3_dir))
    mp3_filename = filename + '.mp3'
    mp3_path = os.path.join(mp3_dir, mp3_filename)

    isSkip = False
    if os.path.exists(os.path.join(baseMusicDir, mp3_path)):
        if isOverwriteMode:
            debug.log('{} already exists. It will be overwritten.'.format(mp3_path))
            os.remove(os.path.join(baseMusicDir, mp3_path))
        else:
            debug.log('{} already exists. Downloading will be skipped.'.format(mp3_path))
            isSkip = True

    if not isSkip:
        debug.log("Looking for youtube by the query '{}'...".format(query))
        watch_urls, _ = find_youtube_by_api(query, youtube_api_key)
        retry = 0
        # Retry the search a few times if YouTube returns no results.
        while len(watch_urls) < 1 and retry < 5:
            debug.log("Youtube list couldn't be gotten. retry...")
            watch_urls, _ = find_youtube_by_api(query, youtube_api_key)
            retry += 1

        debug.log("trying to download '" + query + "'...")
        file_name = download_audio_from_youtube(watch_urls[0], baseMusicDir, audio_name, music_reporter)
        debug.log("'" + file_name + "' was downloaded.")
        debug.log("'" + file_name + "' is converting...")
        convertMP3(baseMusicDir, file_name, mp3_path)
        debug.log("'" + mp3_path + "' was converted.")

        # Embed the cover image into the ID3 tag, then delete the image file.
        img_path = os.path.join(baseImageDir, songID + '.jpg')
        setID3(baseMusicDir, mp3_path, artist, title, lyric, albumID, img_path)
        os.remove(img_path)
        debug.log("Song Information was recorded on '" + mp3_path + "'")
    return mp3_path

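# A hedged end-to-end sketch for getSongFromYouTube; every argument value is
# an illustrative placeholder (the real IDs come from getSongInfoOfMelon):
#
#     mp3_path = getSongFromYouTube(
#         artist='IU', title='eight', songID='32577593', lyric='...',
#         albumID='10438912', baseMusicDir='music', baseImageDir='images',
#         youtube_api_key='YOUR_API_KEY')
#
# The mp3 lands in <baseMusicDir>/<artist>/<albumID>/, and the cover image
# previously saved under baseImageDir is embedded into the ID3 tag and deleted.
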
def repair_music():
    # Read the YouTube API key from the first token of the key file.
    with open("youtube_api_key.txt", "r") as f:
        youtube_api_key = f.readline().split()[0]

    filename = FLAGS.path.split(os.sep)[-1]
    target_dir = FLAGS.path.replace(os.sep + filename, '')

    # Check whether the target file exists and is an mp3.
    if not os.path.exists(FLAGS.path):
        debug.log('There is no target file.')
        return
    if not (os.path.isfile(FLAGS.path) and FLAGS.path.split('.')[-1] == 'mp3'):
        debug.log('It is not an MP3 file.')
        return

    # Get the ID3 tag and build the search query from it.
    id3_tag = getID3Tag(FLAGS.path)
    if id3_tag is None:
        debug.log('There is no ID3 Tag in the target mp3 file. Please check the file information')
        return
    audio_name = getAudioNameFromID3(id3_tag)

    # Search YouTube and let the user pick a replacement source.
    query = '{} audio'.format(audio_name)
    debug.log("Looking for youtube by the query '{}'".format(query))
    links = ye.find_youtube_detailed_by_api(query, youtube_api_key)
    count = 0
    for link in links:
        count += 1
        print("[{}] title:'{}', length:{}, link:< {} >".format(
            count, link['title'], link['length'], link['url']))

    selected_num = -1
    while selected_num < 0 or selected_num > count:
        try:
            selected_num = int(input(
                "Please choose the link number(1<=NUM<={}) of music to repair "
                "(input '0' if you want to exit): ".format(count)))
        except ValueError:
            selected_num = -1
            continue
        if selected_num == 0:
            return
        if selected_num < 0 or selected_num > count:
            print('Input number is out of range (0<=NUM<={}). Try to input again.'.format(count))

    print("\n[{}]({}<{}>) is selected.".format(
        selected_num, links[selected_num - 1]['title'], links[selected_num - 1]['url']))

    # Keep the broken file around as *_old.mp3 until the new download succeeds.
    old_filename = "{}_old.mp3".format(filename.split('.mp3')[0])
    old_file_path = os.path.join(target_dir, old_filename)
    if os.path.exists(old_file_path):
        os.remove(old_file_path)
        debug.log('Previous old mp3 file is removed.')
    os.rename(FLAGS.path, old_file_path)
    debug.log("The name of previous file is changed to '{}'".format(old_filename))

    mr = music_reporter.MusicReporter('logs', 'report.log')
    conv_filename = ye.convertQueryToFilename(audio_name)
    output_filename = ye.download_audio_from_youtube(links[selected_num - 1]['url'],
                                                     output_dir=target_dir,
                                                     strQuery=conv_filename,
                                                     music_reporter=mr)
    del mr
    debug.log("'" + output_filename + "' was downloaded.")
    debug.log("'" + output_filename + "' is converting...")
    ye.convertMP3(target_dir, output_filename, conv_filename + '.mp3')
    debug.log("'" + FLAGS.path + "' was converted.")
    setID3Tag(FLAGS.path, id3_tag)
    debug.log("Song Information was recorded on '" + FLAGS.path + "'")

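# Usage sketch for repair_music: the target path comes from the module-level
# FLAGS object (defined elsewhere in this script), so a typical invocation
# might look like this, assuming the script is named repair_music.py:
#
#     python repair_music.py --path "music/IU/10438912/IU-eight.mp3"
#
# The original file is kept as *_old.mp3 so a bad download can be rolled back.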