def parse_lyricsPage(self, response):
    """Yield a LyricsItem carrying the page's artist link text(s) and lyric lines."""
    result = LyricsItem()
    artist_nodes = response.xpath('//*[@class="text_artist"]/a/text()')
    lyric_nodes = response.xpath('//*[@class="lyrics_container"]/div/p/a/text()')
    result['artist'] = artist_nodes.extract()
    result['text'] = lyric_nodes.extract()
    yield result
def parse_getsong(self,response):
    # Walk an artist's song list on mojim.com and schedule a lyrics request
    # for every distinct song released in 2003 or later.
    # NOTE(review): relies on module-level globals `how_many_songs` (running
    # song counter) and `f` (an open output file) — confirm both are
    # initialised before this callback runs.
    global how_many_songs
    # Artist name: third anchor in the "Tb3" breadcrumb bar.
    singer = ''.join(response.xpath(u'//*[@id="Tb3"]/a[3]/text()').extract()).encode('utf-8')
    first = 1   # truthy only until the first song has been accepted
    last = ''   # title of the previously accepted song (consecutive-dup guard)
    # Rows of the "ha0" listing block, skipping the first three <dd> entries.
    for i in response.selector.xpath(u'//*[contains(concat( " ", @class, " " ), concat( " ", "ha0", " " ))]//dd')[3:]:
        this = "".join(i.xpath('span[2]/a[1]/text()').extract()).encode('utf-8')
        # A <font> under the second anchor appears to mark rows to skip —
        # TODO confirm what it flags on the live site.
        no = "".join(i.xpath('span[2]/a[2]/font').extract()).encode('utf-8')
        # First four characters of the fifth span look like a year — TODO confirm.
        year = ("".join(i.xpath('span[5]/text()').extract()))[0:4]
        if this != "" and (not no) and len(year)==4 :
            year = int(year)
            if year >= 2003:
                print this
                print year
                if first:
                    # Very first accepted song: no dedupe check needed yet.
                    how_many_songs+=1
                    last = this
                    first-=1
                    f.write(str(how_many_songs)+','+this+','+singer+','+'\n')
                    url = 'https://mojim.com'+str(i.xpath('span[2]/a[1]/@href').extract()[0])
                    print url
                    item = LyricsItem()
                    item['number'] = str(how_many_songs)
                    request = scrapy.Request(url=url,callback=self.parse_getlyrics)
                    request.meta['item'] = item
                    yield request
                else:
                    if(last==this):
                        # Same title as the previous row — skip the duplicate.
                        continue
                    else:
                        last = this
                        how_many_songs+=1
                        f.write(str(how_many_songs)+','+this+','+singer+','+'\n')
                        url = 'https://mojim.com'+str(i.xpath('span[2]/a[1]/@href').extract()[0])
                        print url
                        item = LyricsItem()
                        item['number'] = str(how_many_songs)
                        request = scrapy.Request(url=url,callback=self.parse_getlyrics)
                        request.meta['item'] = item
                        yield request
def parse_dir_contents(self, response):
    """Scrape one song page: URL, title, artist, credits, genre, view/share
    counters and the cleaned lyric text, yielded as a single LyricsItem."""
    item = LyricsItem()

    # url
    item['url'] = response.url

    # song title: part after the first dash in the page header
    header = response.xpath("//div[contains(@class, 'site-inner')]//header[contains(@class, 'entry-header')]/h1/text()").extract()[0]
    item['title'] = re.split('[\–|-]', header)[1].strip()

    # artist name
    item['artist'] = response.xpath("//div[contains(@class, 'entry-content')]//div[contains(@class, 'su-column su-column-size-3-6')]//span[contains(@class, 'entry-categories')]/a/text()").extract()
    # lyricist
    item['lyricist'] = response.xpath("//div[contains(@class, 'entry-content')]//div[contains(@class, 'su-column su-column-size-2-6')]//span[contains(@class, 'lyrics')]/a/text()").extract()
    # music composer
    item['musicComposer'] = response.xpath("//div[contains(@class, 'entry-content')]//div[contains(@class, 'su-column su-column-size-2-6')]//span[contains(@class, 'music')]/a/text()").extract()
    # genre
    item['genre'] = response.xpath("//div[contains(@class, 'entry-content')]//div[contains(@class, 'su-column su-column-size-3-6')]//span[contains(@class, 'entry-tags')]/a/text()").extract()

    # counters: keep only digits and commas, then drop the commas
    views_text = response.xpath("//div[contains(@class, 'entry-content')]/div[contains(@class, 'tptn_counter')]/text()").extract()[0]
    item['views'] = int(re.sub('[^0-9,]', "", views_text).replace(',', ''))
    shares_text = response.xpath("//div[contains(@class, 'entry-content')]//div[contains(@class, 'nc_tweetContainer swp_share_button total_shares total_sharesalt')]/span[contains(@class, 'swp_count')]/text()").extract()[0]
    item['shares'] = int(re.sub('[^0-9,]', "", shares_text).replace(',', ''))

    # lyrics: strip latin characters/digits/punctuation, then collapse each
    # run of blank lines to one literal "\n" marker between content runs
    raw_lines = response.xpath("//div[contains(@class, 'entry-content')]//pre/text()").extract()
    pieces = []
    have_content = False
    for raw in raw_lines:
        stripped = re.sub("[\da-zA-Z\-—\[\]\(\)\}\{\@\_\!\#\+\$\%\^\&\*\<\>\?\|\~\:\∆\/]", "", raw)
        for fragment in stripped.split('\n'):
            if fragment and not fragment.isspace():
                pieces.append(fragment.strip())
                have_content = True
            elif have_content:
                pieces.append('\\n')
                have_content = False
    item['lyrics'] = ''.join(pieces)
    yield item
def parse_dir_contents(self, response):
    """Parse a genre listing page: yield one placeholder item for the page
    itself, then one item per artist link found under #mw-pages.

    Bug fixes vs. the original:
    - the placeholder item called ``response.urljoin(url)`` before ``url``
      was ever assigned (NameError on every call) — it now uses the page URL;
    - ``genre`` was left unbound when the header lookup failed — it now
      defaults to the empty string;
    - bare ``except:`` clauses narrowed.
    """
    try:
        genre_text = response.xpath(
            "//body//div[@id='mw-pages']//h2//span//text()").extract()
        genre_text = genre_text[0]
        genre = genre_text.split("Genre/")[-1].strip('"')
    except IndexError:
        genre_text = ""
        genre = ""  # fix: previously unbound on this path

    try:
        artists = response.xpath(
            '//body//div[@id="mw-pages"]//div[@class="mw-content-ltr"]')
        # NOTE(review): '//tr//ul//li//a' searches from the document root,
        # not relative to the div above — confirm that is intended.
        artists = artists.xpath('//tr//ul//li//a')
    except Exception:
        artists = []

    # Placeholder item for the listing page itself.
    item = LyricsItem()
    item["url"] = response.url  # fix: `url` was referenced before assignment
    item["genre"] = ""
    item["lyrics"] = ""
    yield item

    if artists:
        for sel in artists:
            item = LyricsItem()
            url = sel.xpath('@href').extract()[0]
            url = response.urljoin(url)
            title = sel.xpath('@title').extract()[0]
            item["url"] = url
            item["title"] = title
            item["genre"] = genre
            yield item
def parse_item(self, response):
    """Persist the raw HTML of a song page, then return a corpus item with
    the page URL, <h1> title and all <div> text."""
    # First part: write the body under songpages/<last-url-segment>.html.
    slug = response.url.split("/")[-1]
    out_name = '%s.html' % slug
    with open(os.path.join('songpages', out_name), 'wb') as handle:
        handle.write(response.body)
    self.log('Saved file %s' % out_name)
    # Second part: extract text for the item for document corpus.
    item = LyricsItem()
    item['url'] = response.url
    item['title'] = response.css('h1::text').extract_first()
    item['text'] = response.xpath('//div/text()').extract()
    return item
def parse_lyrics(self,text,output):
    # Extract "key: value" credit lines (lyricist / composer / arranger style
    # headers) from raw lyric text into `output`, then yield it as an item.
    # NOTE(review): `output` is mutated in place AND embedded in the yielded
    # item — callers share the same dict object; confirm that is intended.
    item = LyricsItem()
    # Pattern matches "]<key><colon><value>\n"; the colon class accepts both
    # the full-width and the ASCII colon.
    for line in re.finditer('](.*)[::](.*)\\n',text):
        # Values and keys longer than 8 characters are assumed to be lyric
        # text rather than credit fields.
        if len(line.group(2))>8:
            continue
        k = line.group(1).strip()
        if len(k)>8:
            continue
        # A key occurring more than once is probably repeated lyric text —
        # unless it is one of the standard credit labels (lyricist /
        # composer / arranger).
        if text.count(k)>1 and k not in ['作词','作曲','编曲']:
            continue
        # Multiple credited names are separated by "、" or "/"; the last
        # non-empty one wins.
        for v in re.split('[、/]',line.group(2)):
            if v.strip()!='':
                output[k] = v.strip()
    # Only yield when at least two fields were collected.
    if len(output)>1:
        item['data'] = output
        yield item
def parse_page(self, response):
    """Parse an artist page: schedule a lyrics request for up to
    ``SONGS_PER_ARTIST`` of the artist's songs.

    Fix: the original contained the identical slicing loop twice — the first
    pass only printed the URLs. Merged into a single loop (the debug print is
    kept, now emitted per URL as it is scheduled).
    """
    artist = response.xpath(
        "//div[@class='artist-header content-header row']/div/h1/text()"
    ).extract()[0].strip()
    top_list_song = response.xpath("//table")
    songList = top_list_song.xpath(".//a/@href").extract()
    # NOTE(review): len(songList) - 1 drops the final link — presumably a
    # non-song footer/navigation anchor; confirm against the live markup.
    limit = min(self.settings["SONGS_PER_ARTIST"], len(songList) - 1)
    for url in songList[0:limit]:
        print("URL " + url)
        item = LyricsItem()
        item["lyricsURL"] = url
        item["artist"] = artist
        req = Request(url, callback=self.parse_lyrics)
        req.meta["item"] = item
        yield req
def parse_page(self, response):
    """Build a LyricsItem for one song page: URL, artist (from the og:title
    meta tag, which reads "Artist:Song"), song title, and cleaned lyrics.

    Fix: on Python 3 ``filter`` returns an iterator, not a str, so the
    filtered characters must be joined back into a string before being
    passed to ``clean_html_but_br`` (the join is a no-op on Python 2).
    """
    item = LyricsItem()
    item["lyricsURL"] = response.url
    print("PARSING URL: " + response.url)
    item["artist"] = response.xpath(
        "//meta[@property='og:title']/@content").extract()[0].split(":")[0]
    print(item["artist"])
    item["song"] = response.xpath(
        "//div[@id='song-header-title']/b/text()").extract()[0]
    print(item["song"])
    # Extract lyrics
    lyrics = response.xpath("//div[@class='lyricbox']").extract()[0]
    # Drop non-printable characters
    printable = set(string.printable)
    lyrics = ''.join(filter(lambda x: x in printable, lyrics))  # fix: re-join
    item["lyrics"] = clean_html_but_br(lyrics)
    return item
def parse_content(self, response):
    """Parse a lyrics page: Sinhala/English title and artist, the raw and
    flattened lyric body, and the author credit."""
    item = LyricsItem()

    # Headers read "Title - Artist" in Sinhala (h2) and English (h1).
    si_header = response.xpath('//div[@id="lyricsTitle"]/h2/text()').get()
    en_header = response.xpath('//div[@id="lyricsTitle"]/h1/text()').get()
    si_parts = si_header.split(' - ')
    en_parts = en_header.split(' - ')
    item['title_si'] = si_parts[0]
    item['artist_si'] = si_parts[1]
    item['title_en'] = en_parts[0]
    item['artist_en'] = en_parts[1]

    body = response.xpath('//div[@id="lyricsBody"]/text()').getall()
    # The second physical line of each text node carries the lyric text;
    # each contributes "<text> " to the flattened content.
    item['lyrics_raw'] = body
    item['lyrics_content'] = ''.join(node.split('\n')[1] + ' ' for node in body)

    info_values = response.xpath(
        '//div[@class="lyricsInfo"]/span[@class="infInfo"]/text()').getall()
    item['author'] = info_values[1].split(': ')[1]
    yield item
def parse(self, response):
    """Save the raw page HTML under songpages/, then return a corpus item
    with the page URL, <h1> title and all <div> text."""
    # First part: persist the fetched page; the 5th URL path segment is the slug.
    slug = response.url.split("/")[4]
    out_name = '%s.html' % slug
    with open(os.path.join('songpages', out_name), 'wb') as handle:
        handle.write(response.body)
    self.log('Saved file %s' % out_name)
    # Second part: extract text for the item for document corpus.
    item = LyricsItem()
    item['url'] = response.url
    item['title'] = response.css('h1::text').extract_first()
    item['text'] = response.xpath('//div/text()').extract()
    return item
def parse_dir_contents(self, response):
    """Scrape one song page into a LyricsItem: song name, translated
    artist/genre/credit lists, key & beat, view/share counters and lyrics.

    Fixes vs. the original:
    - repeated extract-then-translate pattern factored into a local helper;
    - bare ``except:`` clauses replaced with ``except Exception:``;
    - redundant ``item['beat'] = ''`` pre-assignment removed;
    - the lyric loop's ``new_line_found_1`` flag was dead logic (set False
      and immediately tested), so every non-blank line simply emits
      "text\\n" — the rewrite states that directly.
    """
    item = LyricsItem()
    item['url'] = response.url

    def _translated(xpath_expr):
        # Extract matched texts and run them through translate_array;
        # empty list when nothing matched.
        values = response.xpath(xpath_expr).extract()
        return translate_array(values) if values else []

    # song: header looks like "Site | Song – Artist"; piece [1] is the song
    header = response.xpath(
        "//div[contains(@class, 'site-inner')]//header[contains(@class, 'entry-header')]/h1/text()"
    ).extract()[0]
    item['songName'] = re.split('\||–|-', header)[1].strip()

    # artist
    item['artist'] = _translated(
        "//div[contains(@class, 'entry-content')]//div[contains(@class, 'su-column su-column-size-3-6')]//span[contains(@class, 'entry-categories')]/a/text()")
    # genre
    item['genre'] = _translated(
        "//div[contains(@class, 'entry-content')]//div[contains(@class, 'su-column su-column-size-3-6')]//span[contains(@class, 'entry-tags')]/a/text()")
    # lyrics writer
    item['lyricWriter'] = _translated(
        "//div[contains(@class, 'entry-content')]//div[contains(@class, 'su-column su-column-size-2-6')]//span[contains(@class, 'lyrics')]/a/text()")
    # music director
    item['musicDirector'] = _translated(
        "//div[contains(@class, 'entry-content')]//div[contains(@class, 'su-column su-column-size-2-6')]//span[contains(@class, 'music')]/a/text()")

    # key & beat: the <h3> reads roughly "Key: X | Beat: Y"
    kb = re.split('\|', response.xpath(
        "//div[contains(@class, 'entry-content')]/h3/text()").extract()[0])
    try:
        item['key'] = re.split(':', kb[0])[1].strip()
    except IndexError:
        item['key'] = kb[0].strip()
    try:
        item['beat'] = re.split(':', kb[1])[1].strip()
    except Exception:
        item['beat'] = ''

    # views / shares: best-effort, None when the counter widget is absent
    try:
        raw = response.xpath(
            "//div[contains(@class, 'entry-content')]/div[contains(@class, 'tptn_counter')]/text()"
        ).extract()[0]
        item['views'] = int(re.sub('[^0-9,]', "", raw).replace(',', ''))
    except Exception:
        item['views'] = None
    try:
        raw = response.xpath(
            "//div[contains(@class, 'entry-content')]//div[contains(@class, 'nc_tweetContainer swp_share_button total_shares total_sharesalt')]/span[contains(@class, 'swp_count')]/text()"
        ).extract()[0]
        item['shares'] = int(re.sub('[^0-9,]', "", raw).replace(',', ''))
    except Exception:
        item['shares'] = None

    # lyrics: strip latin letters/digits/punctuation; each non-blank line
    # becomes "text\n"; each run of blank lines collapses to a single "\n"
    pieces = []
    blank_emitted = False
    for raw in response.xpath(
            "//div[contains(@class, 'entry-content')]//pre/text()").extract():
        cleaned = re.sub(
            "[\da-zA-Z\-—\[\]\t\@\_\!\#\+\$\%\^\&\*\(\)\<\>\?\|\}\{\~\:\∆\/]",
            "", raw)
        for fragment in cleaned.split('\n'):
            if fragment == '' or fragment.isspace():
                if not blank_emitted:
                    blank_emitted = True
                    pieces.append('\n')
            else:
                blank_emitted = False
                pieces.append(fragment.strip())
                pieces.append('\n')
    item['lyric'] = ''.join(pieces)
    yield item
def parse_dir_contents(self, response):
    """Scrape one song page into a LyricsItem (title, translated
    artist/genre/credit lists, view/share counters, lyrics) and persist the
    shared translation cache afterwards.

    Fixes vs. the original:
    - repeated extract-then-translate pattern factored into a local helper;
    - bare ``except:`` clauses replaced with ``except Exception:``;
    - the lyric loop's ``line_1`` flag was dead logic (set False and
      immediately tested), so every non-blank line simply emits "text\\n" —
      the rewrite states that directly;
    - ``pickle.dump(..., open(...))`` leaked the file handle; now closed
      via ``with``.
    """
    global translated_dict
    item = LyricsItem()

    def _translated(xpath_expr):
        # Extract matched texts and run them through translate_array;
        # empty list when nothing matched.
        values = response.xpath(xpath_expr).extract()
        return translate_array(values) if values else []

    # song name: header looks like "Site | Song – Artist"; piece [1] is the song
    header = response.xpath(
        "//div[contains(@class, 'site-inner')]//header[contains(@class, 'entry-header')]/h1/text()"
    ).extract()[0]
    item['title'] = re.split('\||–|-', header)[1].strip()

    # artist name
    item['artist'] = _translated(
        "//div[contains(@class, 'entry-content')]//div[contains(@class, 'su-column su-column-size-3-6')]//span[contains(@class, 'entry-categories')]/a/text()")
    # genre
    item['genre'] = _translated(
        "//div[contains(@class, 'entry-content')]//div[contains(@class, 'su-column su-column-size-3-6')]//span[contains(@class, 'entry-tags')]/a/text()")
    # lyric writer
    item['lyricist'] = _translated(
        "//div[contains(@class, 'entry-content')]//div[contains(@class, 'su-column su-column-size-2-6')]//span[contains(@class, 'lyrics')]/a/text()")
    # music director
    item['music_by'] = _translated(
        "//div[contains(@class, 'entry-content')]//div[contains(@class, 'su-column su-column-size-2-6')]//span[contains(@class, 'music')]/a/text()")

    # no of views / shares: best-effort, None when the widget is absent
    try:
        raw = response.xpath(
            "//div[contains(@class, 'entry-content')]/div[contains(@class, 'tptn_counter')]/text()"
        ).extract()[0]
        item['views'] = int(re.sub('[^0-9,]', "", raw).replace(',', ''))
    except Exception:
        item['views'] = None
    try:
        raw = response.xpath(
            "//div[contains(@class, 'entry-content')]//div[contains(@class, 'nc_tweetContainer swp_share_button total_shares total_sharesalt')]/span[contains(@class, 'swp_count')]/text()"
        ).extract()[0]
        item['shares'] = int(re.sub('[^0-9,]', "", raw).replace(',', ''))
    except Exception:
        item['shares'] = None

    # lyric: strip latin letters/digits/punctuation; each non-blank line
    # becomes "text\n"; each run of blank lines collapses to a single "\n"
    pieces = []
    blank_emitted = False
    for raw in response.xpath(
            "//div[contains(@class, 'entry-content')]//pre/text()").extract():
        cleaned = re.sub(
            "[\da-zA-Z\-—\[\]\t\@\_\!\#\+\$\%\^\&\*\(\)\<\>\?\|\}\{\~\:\∆\/]",
            "", raw)
        for fragment in cleaned.split('\n'):
            if fragment == '' or fragment.isspace():
                if not blank_emitted:
                    blank_emitted = True
                    pieces.append('\n')
            else:
                blank_emitted = False
                pieces.append(fragment.strip())
                pieces.append('\n')
    item['lyric'] = ''.join(pieces)

    # Persist the translation cache so later runs reuse earlier translations.
    # NOTE(review): rewriting the whole pickle on every parsed page is
    # expensive — consider moving this to a spider_closed handler.
    with open('../translated_dict.pickle', 'wb') as cache_file:
        pickle.dump(translated_dict, cache_file)
    yield item