-
Notifications
You must be signed in to change notification settings - Fork 0
/
lyrscraper.py
128 lines (93 loc) · 3.12 KB
/
lyrscraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Lyrscraper (c) 2015 by drachenminister
This module provides functions to scrape song lyrics from
lyrics.wikia.com only, but it can easily be adapted to work with other
sites as well.
How it works:
The API of lyrics.wikia.com only provides a small section of the
lyrics along with the URL of a web page that displays the
complete lyrics.
Therefore, to get the lyrics we first call the API for that URL and
then scrape the appropriate section of the resulting page.
How it can be adapted to use other sites:
Assuming the same two-step approach is needed: For step 1 you would
have to change the API's URL, parameters and untangling.
For step 2 you would have to check the string replacements and the xpath
definition.
"""
import urllib
import untangle
import requests
from lxml import html
def _url(id3tag):
"""Get the lyric's URL by calling the API
Extract song and artist frames from the id3tag and then call
the lyrics.wikia.com api for the URL of the actual lyric.
Args:
id3tag (ID3): id3-tag of the song
Returns:
string: lyric URL if successful, None otherwise
"""
try:
# Get artist and song from tag
artist = id3tag['TPE1'].text[0].encode('utf8')
song = id3tag['TIT2'].text[0].encode('utf8')
# Build request
params = {'artist':artist, 'song':song, 'fmt':'xml'}
request = 'http://lyrics.wikia.com/api.php?' + urllib.urlencode(params)
# Do request
response = untangle.parse(request)
if response.LyricsResult.lyrics.cdata == 'Not found':
return None
return urllib.unquote(response.LyricsResult.url.cdata)
except Exception:
return None
def scrape(url, timeout=10):
    """Open the URL and scrape the lyric.

    Downloads the page, replaces every '<br />' with a newline so the
    line structure (blank lines included) survives the text extraction,
    and returns the first text node of the div with class "lyricbox".

    Args:
        url (string): URL to scrape
        timeout (number): seconds handed to requests.get as its timeout.
            Without a timeout a server that never answers would block
            the caller indefinitely.

    Returns:
        string: Lyric if successful, None otherwise (request failure,
        HTTP error status, timeout, or no lyricbox text on the page)
    """
    try:
        # Get page; raise_for_status turns HTTP error codes into exceptions
        page = requests.get(url, timeout=timeout)
        page.raise_for_status()
        # Scrape page
        markup = page.text.replace('<br />', '\n')
        text_nodes = html.fromstring(markup).xpath(
            '//div[@class="lyricbox"]/text()')
        # An empty result raises IndexError here, which maps to None below
        return text_nodes[0]
    except Exception:
        # Best-effort contract: any failure is reported as None
        return None
def lyric(id3tag):
    """Return the lyric of the song an id3 tag describes.

    Resolves the lyric page address with _url and hands it to scrape.

    Args:
        id3tag (ID3): id3-tag of the song

    Returns:
        string: Lyric if successful, None otherwise
    """
    address = _url(id3tag)
    # Without an address there is nothing to scrape
    return scrape(address) if address else None
if __name__ == '__main__':
    # Print lyric to standard out.
    # Takes a filename from the command line, extracts the id3-tag and
    # prints the lyric according to the song and artist frames.
    # Args:
    #     sys.argv[1] (string): Filename
    import sys
    from mutagen.id3 import ID3
    try:
        tag = ID3(sys.argv[1])
        # Call form prints the same on Python 2 and is valid on Python 3
        print(lyric(tag))
    except Exception as err:
        # Catch-all so a failure (e.g. missing argument) ends with a short
        # diagnostic on stderr instead of a traceback; %r avoids encoding
        # errors when the message is not plain ASCII.
        sys.stderr.write('lyrscraper: %r\n' % (err,))