def get_song_lyrics(parsed_url, url):
    """
    Fetch a song page and return the lyrics text found on it.

    Takes in two urls--sometimes, with artist collaborations, the second
    url is not a relative URL, but a complete one.  A url that starts with
    ".." is joined onto parsed_url.geturl() with urlparse.urljoin; any
    other url is fetched exactly as given.

    Arguments:
        parsed_url -- object whose geturl() gives the base URL (presumably
                      the urlparse.urlparse() result for the artist page;
                      confirm against the caller).
        url        -- song URL, either relative ("../...") or complete.

    Returns the page text between the first "Print" marker and the first
    "if ( /Android" marker, with leading/trailing newlines stripped.
    Prints "Success!" when that text is non-empty and "Lyrics are empty."
    when it is empty.

    Raises ValueError (after printing "Nothing returned.") when either
    marker is missing from the page text.
    """
    start_marker = "Print"
    end_marker = "if ( /Android"

    if url.startswith(".."):
        url = urlparse.urljoin(parsed_url.geturl(), url)
    print("Parsing %s..." % (url))

    page = urlz.open_and_read(url)
    soup = BeautifulSoup(page, "lxml")
    page_text = soup.get_text()

    try:
        # The lyrics start right after the start marker and run up to the
        # end marker.
        start = page_text.index(start_marker) + len(start_marker)
        end = page_text.index(end_marker)
    except ValueError:
        # A missing marker is the only way to get "nothing" back; report
        # it and let the error propagate so callers can still react to it.
        print("Nothing returned.")
        raise

    lyrics = page_text[start:end].strip("\n")

    # A slice is always a string, so emptiness is the only thing to check.
    if lyrics:
        print("Success!")
    else:
        print("Lyrics are empty.")
    return lyrics
def scrape_artist_for_song_urls(url):
    """
    Take in a url, opens it, and parses the page for song URLs and names.

    The song-list text returned by find_songlist_tag(soup) is trimmed down
    to the part between the first "[" and the last "]", wrapped in braces
    and handed to decode_json; the result is passed to write_songs together
    with the artist name and the parsed url.

    Arguments:
        url -- URL of the artist page to scrape.

    Returns None.  Any ordinary error raised along the way is swallowed and
    reported as "Skipping <url>", so one bad page does not stop a larger
    run.  KeyboardInterrupt and SystemExit are deliberately NOT swallowed,
    so the scrape can still be interrupted.
    """
    try:
        parsed_url = urlparse.urlparse(url)
        artist = get_name_from_url(url)
        page = urlz.open_and_read(url)
        soup = BeautifulSoup(page, "lxml")

        # Keep only what precedes the first "}];", drop its first line, and
        # flatten the remaining lines into one space-separated string.
        before_terminator = find_songlist_tag(soup).split("}];")[0]
        songlist_tag = " ".join(before_terminator.split("\n")[1:]).strip()

        # Take the text after the first "[" and before the last "]".
        after_open = songlist_tag.split("[", 1)[1]
        array_body = after_open.rsplit("]", 1)[0]

        json_arg = "{%s}" % (array_body.lstrip(" "))
        song_list = decode_json(json_arg)
        write_songs(artist, song_list, parsed_url)
    except Exception:
        # Best-effort scraping: report the page and move on.
        print("Skipping %s" % (url))