-
Notifications
You must be signed in to change notification settings - Fork 0
/
retrieve_metadata.py
228 lines (203 loc) · 7.9 KB
/
retrieve_metadata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
import os
import eyed3
import re
import sys
import urllib
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from mutagen.mp3 import MP3
from mutagen.id3 import ID3, APIC, error
from mutagen.easyid3 import EasyID3
from mutagen import File
import logger_setup as ls
logger = ls.get_logger(__name__)
# Scraping settings for Amazon's digital-music search page, consumed by
# get_song_info().
amazon_dict = {
    # Replacement for the underscores in the song name when building the query.
    'space_delim': '+',
    # Search URL prefix; the query string is appended directly to it.
    'search_url': 'http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Ddigital-music&field-keywords=',
    # Class name of the element holding the result rows.
    'table_class': 'mp3Tracks',
    # Locator strategy and value for the "no results" marker.
    'by_method': By.ID,
    'no_results_locator': 'noResultsTitle',
    # Class name of a single result row.
    'result_class': 'result',
    # Class names of the title / artist / album elements inside a row.
    'title_locator': 'title',
    'artist_locator': 'mp3tArtist',
    'album_locator': 'mp3tAlbum',
}
# Scraping settings for SoundCloud's search page, consumed by get_song_info().
# There is no 'album_locator': get_song_info() fills in a fixed placeholder
# album for SoundCloud results.
soundcloud_dict = {
    # Replacement for the underscores in the song name when building the query.
    'space_delim': '%20',
    # Search URL prefix; the query string is appended directly to it.
    'search_url': 'https://soundcloud.com/search?q=',
    # Class name of the element holding the result rows.
    'table_class': 'lazyLoadingList__list',
    # Locator strategy and value for the "no results" marker.
    'by_method': By.CSS_SELECTOR,
    'no_results_locator': '.sc-type-h2.sc-text',
    # Class name of a single result row.
    'result_class': 'searchList__item',
    # Class names of the title / artist elements inside a row.
    'title_locator': 'soundTitle__title',
    'artist_locator': 'soundTitle__username',
}
def get_metadata_for_song(song_path, song_name, driver):
    """Collect every metadata candidate for one song.

    Amazon is searched first, then SoundCloud; artwork is fetched for the
    combined results, and finally the tags already stored in the file at
    ``song_path`` are read.

    Returns a list of dicts: the file's current metadata (if readable)
    followed by the scraped candidates that came back from get_artwork().
    """
    scraped = (get_song_info(driver, song_name, 'amazon')
               + get_song_info(driver, song_name, 'soundcloud'))
    scraped_with_art = get_artwork(driver, scraped)
    return get_current_metadata(song_path, song_name) + scraped_with_art
def get_current_metadata(song_path, song_name):
    """Read the tags already embedded in a song and dump its cover art.

    Args:
        song_path: path of the audio file to inspect.
        song_name: name used to build the ``file_key`` (``<song_name>_default``)
            and the artwork file name.

    Returns:
        ``[]`` when the ID3 tags cannot be loaded; otherwise a one-element
        list holding a dict with the keys ``title``, ``artist``, ``album``,
        ``local_art``, ``art_url`` and ``file_key``.  Missing text frames
        yield ``''``.  ``local_art`` is ``'-'`` and ``art_url`` is ``''``
        unless the embedded image was successfully written to the
        ``art_dump`` directory under the current working directory.
    """
    try:
        tags = ID3(song_path)
    except Exception as e:
        logger.error('couldn`t get tags on the song for this reason:')
        logger.error(e)
        logger.info('skipping')
        return []

    def first_text(frame_id):
        # A missing frame or an empty text list both mean "no value".
        try:
            return tags[frame_id].text[0]
        except Exception:
            return ''

    title = first_text("TIT2")
    artist = first_text("TPE1")
    album = first_text("TALB")

    mfile = File(song_path)
    file_key = song_name + '_default'
    file_path = '-'
    file_url = ''
    try:
        apic = mfile.tags['APIC:']
        # The mime type looks like "image/jpeg"; its last part is the extension.
        ext = apic.mime.split('/')[-1]
        art_path = os.path.join(os.getcwd(), 'art_dump', file_key + '.' + ext)
        with open(art_path, 'wb') as img:
            img.write(apic.data)
        # Only advertise the artwork once it is actually on disk, so a failed
        # write cannot leave the result pointing at a file that does not exist.
        file_path = art_path
        file_url = 'file://' + art_path
    except Exception as e:
        logger.warn('failed to get artwork attached to mp3, probably doesn`t have any. here`s the exception:')
        logger.warn(e)

    song_dict = {
        'title': title,
        'artist': artist,
        'album': album,
        'local_art': file_path,
        'art_url': file_url,
        'file_key': file_key,
    }
    return [song_dict]
def get_song_info(driver, name, source):
    """Search one music site for a song and scrape its first few results.

    Args:
        driver: Selenium WebDriver used to load and inspect the search page.
        name: song name with underscores in place of spaces.
        source: ``'amazon'`` selects ``amazon_dict``; any other value
            selects ``soundcloud_dict``.

    Returns:
        A list of at most four dicts with the keys ``title``, ``artist``,
        ``album``, ``details`` (the result's link) and ``file_key``.  Rows
        whose details cannot be read are skipped.  An empty list is returned
        when the results container does not appear within 10 seconds.
    """
    max_results = 4
    is_amazon = source == 'amazon'
    props = amazon_dict if is_amazon else soundcloud_dict
    query = re.sub('_', props['space_delim'], name)
    url = props['search_url'] + query
    logger.info('getting url: ' + url)
    driver.get(url)
    try:
        logger.info('looking for search results list...')
        table = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, props['table_class'])))
        # Sets the driver's implicit wait used by the element lookups below.
        driver.implicitly_wait(2)
    except TimeoutException:
        logger.error("took too long to find results table, checking for failed search...")
        try:
            # The results container never appeared, so the "no results"
            # marker has to be looked up from the driver itself.
            driver.find_element(props['by_method'], props['no_results_locator'])
            logger.info("yep, no results")
        except Exception:
            logger.info("strange, couldn`t find failed search page either; slow internet maybe?")
        return []

    logger.info('results table found!')
    rows = table.find_elements_by_class_name(props['result_class'])
    results = []
    for i, row in enumerate(rows[:max_results]):
        try:
            title_elem = row.find_element_by_class_name(props['title_locator'])
            title = title_elem.text.encode('utf8')
            # The title ends up in a file name (via file_key), so drop slashes.
            title = re.sub('/', '', title)
            artist = row.find_element_by_class_name(props['artist_locator']).text.encode('utf8')
            if is_amazon:
                album = row.find_element_by_class_name(props['album_locator']).text.encode('utf8')
            else:
                album = 'soundcloud result, album unknown'
            details_url = str(title_elem.get_attribute('href'))
            key_title = re.sub(' ', '_', title)
            file_key = key_title + '_' + source + str(i)
            results.append({
                'title': title,
                'artist': artist,
                'album': album,
                'details': details_url,
                'file_key': file_key,
            })
        except Exception:
            logger.error('something went wrong getting details, checking for promoted link or user link...')
            # Promoted and user rows lack the usual title/artist elements;
            # identify them so the log explains why the row was skipped.
            for marker_class, label in (('promotedBadge', 'promoted'), ('userStats', 'user')):
                try:
                    row.find_element_by_class_name(marker_class)
                except Exception:
                    continue
                logger.info('yep, ' + label + ' link. skipping!')
                break
            else:
                logger.info('doesn`t seem to be promoted or user link, not sure what`s wrong')

    if not rows:
        logger.error('no ' + source + ' results found for ' + name)
    elif len(rows) < max_results:
        logger.warn(source + ' search exhausted after ' + str(len(rows)) + ' results')
    return results
def return_amazon_art_url(artwork_cont):
    """Return the ``src`` attribute of the ``img`` element located inside
    the given Amazon cover-art container element."""
    image = artwork_cont.find_element_by_css_selector('img')
    return image.get_attribute('src')
def return_soundcloud_art_url(artwork_cont):
    """Extract the artwork URL from a SoundCloud artwork element.

    The URL is read from the ``background-image: url(...)`` declaration in
    the element's inline ``style`` attribute.  Quotes around the URL are
    removed and an ``https://`` scheme is downgraded to ``http://``.

    Raises:
        IndexError: the style has no ``background-image`` declaration.
        ValueError: the declaration has no parenthesised value.
    """
    style = artwork_cont.get_attribute('style')
    declarations = [part for part in style.split(';') if 'background-image' in part]
    background = str(declarations[0])
    start = background.index('(') + 1
    end = background.index(')')
    # Browsers may serialise the value as url("...") or url('...').
    url = background[start:end].strip().strip('\'"')
    # Rewrite the scheme only; a blanket replace would also mangle any
    # "https" that appears later in the URL.
    secure_prefix = 'https://'
    if url.startswith(secure_prefix):
        url = 'http://' + url[len(secure_prefix):]
    return url
def get_artwork(driver, metadata):
    """Download cover art for each scraped search result.

    For every dict in ``metadata`` the result's ``details`` page is loaded,
    the artwork element is waited for (up to 10 seconds), and the image is
    downloaded into the ``art_dump`` directory under the current working
    directory, named ``<file_key>.<extension>``.

    Args:
        driver: Selenium WebDriver used to load the details pages.
        metadata: list of result dicts carrying ``details`` and ``file_key``.

    Returns:
        The dicts whose artwork was saved, each updated in place with
        ``art_url`` and ``local_art``.  Entries that fail at any step are
        logged and left out of the returned list.
    """
    with_arturls = []
    for song_dict in metadata:
        try:
            details_url = song_dict["details"]
            driver.get(details_url)
            if 'amazon' in details_url:
                by_method = By.ID
                locator = 'coverArt_feature_div'
                url_func = return_amazon_art_url
            else:
                by_method = By.CLASS_NAME
                locator = 'image__full'
                url_func = return_soundcloud_art_url
            try:
                artwork_cont = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((by_method, locator)))
                art_url = url_func(artwork_cont)
                # Take the extension from the URL path (ignoring any query or
                # fragment) so ".jpeg"/".webp" are not truncated; fall back to
                # the last three characters when the path has no extension.
                url_path = art_url.split('?', 1)[0].split('#', 1)[0]
                ext = os.path.splitext(url_path)[1].lstrip('.') or art_url[-3:]
                file_key = song_dict["file_key"].decode('utf8')
                # os.path.join keeps the path valid on every platform.
                file_path = os.path.join(os.getcwd(), 'art_dump', file_key + '.' + ext)
                urllib.urlretrieve(art_url, file_path)
                song_dict['art_url'] = str(art_url)
                song_dict['local_art'] = file_path.encode('utf8')
                with_arturls.append(song_dict)
            except Exception as e:
                logger.error('failed to save artwork for some reason:')
                logger.error(e)
        except Exception as e:
            logger.error('an unexpected error happened somewhere:')
            logger.error(e)
    return with_arturls
def already_marked(file_path):
    """Tell whether a song has already been tagged by this tool.

    A song counts as marked when the first value of its ``website`` tag
    equals ``'connerjevning.com'``.

    Args:
        file_path: path of the audio file to check.

    Returns:
        ``True`` if the marker is present; ``False`` if the tags cannot be
        loaded, the ``website`` tag is missing or empty, or it holds a
        different value.
    """
    try:
        tags = EasyID3(file_path)
    except Exception:
        logger.info('no tags on the song, proceed')
        return False

    # Keep the guarded region down to the lookup that can actually fail.
    try:
        marked = tags['website'][0]
    except Exception:
        logger.info('doesn`t appear to be marked, proceed')
        return False

    if marked == 'connerjevning.com':
        logger.info('already marked! skipping')
        return True
    return False
def write_html_for_song(file_path, data):
    """Write an HTML table listing the metadata candidates for one song.

    Each entry of ``data`` becomes one row showing its index (the option
    number), title, artist, album and an ``<img>`` pointing at ``art_url``.

    Args:
        file_path: destination of the HTML file; it is overwritten.
        data: sequence of dicts with the keys ``title``, ``artist``,
            ``album`` and ``art_url``.  Values may be text or UTF-8 encoded
            byte strings.

    Raises:
        KeyError: an entry lacks one of the required keys.  The page is
            assembled before the file is opened, so nothing is written in
            that case.
    """
    import io
    from xml.sax.saxutils import escape

    def as_markup(value):
        # Scraped values are UTF-8 byte strings while tag values are text;
        # normalise to text so both can be written to the same file.
        if isinstance(value, bytes):
            value = value.decode('utf-8', 'replace')
        # The values come from third-party web pages and file tags: escape
        # them so they cannot inject markup or break out of the src attribute.
        return escape(value, {'"': '&quot;'})

    pieces = [
        u'<html><body><table style="text-align:center;"><tr><td><h1>Option</h1></td><td><h1>Title</h1></td><td><h1>Artist</h1></td>',
        u'<td><h1>Album</h1></td><td><h1>Artwork</h1></td></tr>',
    ]
    for index, song in enumerate(data):
        pieces.extend([
            u'<tr><td><h1>',
            u'%d' % index,
            u'</h1></td><td><p>',
            as_markup(song['title']),
            u'</p></td><td><p>',
            as_markup(song['artist']),
            u'</p></td><td><p>',
            as_markup(song['album']),
            u'</p></td><td><img src="',
            as_markup(song['art_url']),
            u'"></td></tr>',
        ])
    pieces.extend([u'</table>', u'</body>', u'</html>'])

    with io.open(file_path, 'w', encoding='utf-8') as html_file:
        html_file.write(u''.join(pieces))