Example #1
 def parse_lyricsPage(self, response):
     item = LyricsItem()
     item['artist'] = response.xpath(
         '//*[@class="text_artist"]/a/text()').extract()
     item['text'] = response.xpath(
         '//*[@class="lyrics_container"]/div/p/a/text()').extract()
     yield item
Example #2
 def parse_getsong(self, response):
     global how_many_songs
     # how_many_songs and f are module-level globals: a running song counter
     # and the open output file the index lines are written to.
     singer = ''.join(response.xpath(u'//*[@id="Tb3"]/a[3]/text()').extract())
     last = ''
     for i in response.selector.xpath(u'//*[contains(concat( " ", @class, " " ), concat( " ", "ha0", " " ))]//dd')[3:]:
         this = "".join(i.xpath('span[2]/a[1]/text()').extract())
         no = "".join(i.xpath('span[2]/a[2]/font').extract())
         year = "".join(i.xpath('span[5]/text()').extract())[0:4]
         # keep rows with a song title, no <font> marker on the second link,
         # and a four-digit year of 2003 or later
         if this != "" and (not no) and len(year) == 4:
             year = int(year)
             if year >= 2003:
                 print(this)
                 print(year)
                 # skip consecutive duplicates of the same title
                 if this == last:
                     continue
                 last = this
                 how_many_songs += 1
                 f.write(str(how_many_songs) + ',' + this + ',' + singer + ',' + '\n')
                 url = 'https://mojim.com' + str(i.xpath('span[2]/a[1]/@href').extract()[0])
                 print(url)
                 item = LyricsItem()
                 item['number'] = str(how_many_songs)
                 request = scrapy.Request(url=url, callback=self.parse_getlyrics)
                 request.meta['item'] = item
                 yield request
Example #3
    def parse_dir_contents(self, response):

        item = LyricsItem()

        # url
        item['url'] = response.url

        # song title
        title = response.xpath("//div[contains(@class, 'site-inner')]//header[contains(@class, 'entry-header')]/h1/text()").extract()[0]
        item['title'] = re.split(r'[–|-]', title)[1].strip()

        # artist name
        item['artist'] = response.xpath("//div[contains(@class, 'entry-content')]//div[contains(@class, 'su-column su-column-size-3-6')]//span[contains(@class, 'entry-categories')]/a/text()").extract()
            
        # lyricist
        item['lyricist'] = response.xpath("//div[contains(@class, 'entry-content')]//div[contains(@class, 'su-column su-column-size-2-6')]//span[contains(@class, 'lyrics')]/a/text()").extract()
            
        # musicComposer
        item['musicComposer'] = response.xpath("//div[contains(@class, 'entry-content')]//div[contains(@class, 'su-column su-column-size-2-6')]//span[contains(@class, 'music')]/a/text()").extract()
            
        # genre
        item['genre'] = response.xpath("//div[contains(@class, 'entry-content')]//div[contains(@class, 'su-column su-column-size-3-6')]//span[contains(@class, 'entry-tags')]/a/text()").extract()
            
        # views
        views = response.xpath("//div[contains(@class, 'entry-content')]/div[contains(@class, 'tptn_counter')]/text()").extract()[0]
        item['views']  = int(re.sub('[^0-9,]', "", views).replace(',', ''))
        
        #shares
        shares = response.xpath("//div[contains(@class, 'entry-content')]//div[contains(@class, 'nc_tweetContainer swp_share_button total_shares total_sharesalt')]/span[contains(@class, 'swp_count')]/text()").extract()[0]
        item['shares'] = int(re.sub('[^0-9,]', "", shares).replace(',', ''))
   
        # lyrics
        lyrics = response.xpath("//div[contains(@class, 'entry-content')]//pre/text()").extract()
                
        song = ''
        check = False
        
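        # Remove digits, Latin letters and most punctuation from each <pre> block,
        # then join the remaining lines, marking stanza breaks with a literal '\n'.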
        for line in lyrics:
            lines = (re.sub(r"[\da-zA-Z\-—\[\]\(\)\}\{\@\_\!\#\+\$\%\^\&\*\<\>\?\|\~\:\∆\/]", "", line)).split('\n')
            for line_l in lines:
                if not(line_l.isspace() or line_l == ""):
                    song += line_l.strip()
                    check = True
                else:
                    if check:
                        song += '\\n'
                        check = False

        item['lyrics'] = song

        yield item
Example #4
    def parse_dir_contents(self, response):

        # genre: taken from the "... Genre/<name>" heading inside #mw-pages
        try:
            genre_text = response.xpath(
                "//body//div[@id='mw-pages']//h2//span//text()").extract()
            genre_text = genre_text[0]
            genre = genre_text.split("Genre/")[-1].strip('"')
        except IndexError:
            genre = ""

        try:
            artists = response.xpath(
                '//body//div[@id="mw-pages"]//div[@class="mw-content-ltr"]')
            artists = artists.xpath('//tr//ul//li//a')
        except Exception:
            # no artist list on this page: emit a single empty item for the URL
            artists = []
            item = LyricsItem()
            item["url"] = response.url
            item["genre"] = ""
            item["lyrics"] = ""
            yield item

        if artists:

            for sel in artists:

                item = LyricsItem()
                url = sel.xpath('@href').extract()[0]
                url = response.urljoin(url)

                title = sel.xpath('@title').extract()[0]
                item["url"] = url
                item["title"] = title
                item["genre"] = genre

                yield item
Example #5
    def parse_item(self, response):
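        # first part: save the song page html to the songpages directory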
        page = response.url.split("/")[-1]
        page_dirname = 'songpages'
        filename = '%s.html' % page
        with open(os.path.join(page_dirname, filename), 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename)

        # second part: extract text for the item for document corpus
        item = LyricsItem()
        item['url'] = response.url
        item['title'] = response.css('h1::text').extract_first()
        item['text'] = response.xpath('//div/text()').extract()
        return item
Example #6
 def parse_lyrics(self,text,output):
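     # Pull short "key: value" credit lines (e.g. 作词 / 作曲 / 编曲, i.e. lyricist /
     # composer / arranger) out of the raw lyric text into `output`; an item is
     # yielded once more than one entry has been collected.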
     item = LyricsItem()
     for line in re.finditer('](.*)[::](.*)\\n',text):
         if len(line.group(2))>8:
             continue
         k = line.group(1).strip()
         if len(k)>8:
             continue
         if text.count(k)>1 and k not in ['作词','作曲','编曲']:
             continue
         for v in re.split('[、/]',line.group(2)):
             if v.strip()!='':
                 output[k] = v.strip()
     if len(output)>1:
         item['data'] = output
         yield item
Example #7
    def parse_page(self, response):
        artist = response.xpath("//div[@class='artist-header content-header row']/div/h1/text()").extract()[0].strip()

        top_list_song = response.xpath("//table")

        songList = top_list_song.xpath(".//a/@href").extract()
        for url in songList[0:min(self.settings["SONGS_PER_ARTIST"], len(songList) - 1)]:
            print("URL " + url)

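        # One request per song link (capped at SONGS_PER_ARTIST); the partially
        # filled item rides along in request.meta for parse_lyrics to finish.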
        for url in songList[0:min(self.settings["SONGS_PER_ARTIST"], len(songList) - 1)]:
            item = LyricsItem()
            item["lyricsURL"] = url
            item["artist"] = artist

            req = Request(url, callback=self.parse_lyrics)
            req.meta["item"] = item

            yield req
Example #8
    def parse_page(self, response):
        item = LyricsItem()
        item["lyricsURL"] = response.url
        print("PARSING URL: " + response.url)
        item["artist"] = response.xpath("//meta[@property='og:title']/@content").extract()[0].split(":")[0]
        print(item["artist"])
        item["song"] = response.xpath("//div[@id='song-header-title']/b/text()").extract()[0]
        print(item["song"])

        # Extract lyrics
        lyrics = response.xpath("//div[@class='lyricbox']").extract()[0]

        # Drop non-printable characters (join the kept characters back into a
        # string, since filter() returns an iterator on Python 3)
        printable = set(string.printable)
        lyrics = ''.join(filter(lambda x: x in printable, lyrics))

        item["lyrics"] = clean_html_but_br(lyrics)

        return item
Example #9
    def parse_content(self, response):
        item = LyricsItem()

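        # The title block holds "<song title> - <artist>" twice: the h2 feeds
        # the *_si fields and the h1 the *_en fields.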
        title_info_si = response.xpath(
            '//div[@id="lyricsTitle"]/h2/text()').get()
        title_info_en = response.xpath(
            '//div[@id="lyricsTitle"]/h1/text()').get()
        item['title_si'] = title_info_si.split(' - ')[0]
        item['artist_si'] = title_info_si.split(' - ')[1]
        item['title_en'] = title_info_en.split(' - ')[0]
        item['artist_en'] = title_info_en.split(' - ')[1]

        lyrics_body = response.xpath('//div[@id="lyricsBody"]/text()').getall()
        content = ''
        for line in lyrics_body:
            content += line.split('\n')[1] + ' '

        item['lyrics_raw'] = lyrics_body
        item['lyrics_content'] = content
        item['author'] = response.xpath(
            '//div[@class="lyricsInfo"]/span[@class="infInfo"]/text()').getall(
            )[1].split(': ')[1]

        yield item
Example #10
    def parse(self, response):
        # first part: save the page html to the songpages directory
        page = response.url.split("/")[4]
        page_dirname = 'songpages'
        filename = '%s.html' % page
        with open(os.path.join(page_dirname,filename), 'wb') as f:
            f.write(response.body)
        self.log('Saved file %s' % filename) 

        # second part: extract text for the item for document corpus
        item = LyricsItem()
        item['url'] = response.url
        item['title'] = response.css('h1::text').extract_first()
        item['text'] = response.xpath('//div/text()').extract()
        #tags_list = [response.url.split("/")[2],
        #             response.url.split("/")[3]]
        #more_tags = [x.lower() for x in remove_stopwords(response.url\
        #               	    .split("/")[4].split("_"))]
        #for tag in more_tags:
        #    tag = re.sub('[^a-zA-Z]', '', tag)  # alphanumeric values only  
        #    tags_list.append(tag)
        #item['tags'] = tags_list                 
        return item 
Example #11
    def parse_dir_contents(self, response):
        item = LyricsItem()

        item['url'] = response.url

        #song
        temp = response.xpath(
            "//div[contains(@class, 'site-inner')]//header[contains(@class, 'entry-header')]/h1/text()"
        ).extract()[0]
        temp = re.split(r'\||–|-', temp)
        item['songName'] = temp[1].strip()

        #artist
        temp = response.xpath(
            "//div[contains(@class, 'entry-content')]//div[contains(@class, 'su-column su-column-size-3-6')]//span[contains(@class, 'entry-categories')]/a/text()"
        ).extract()
        if len(temp) == 0:
            item['artist'] = []
        else:
            temp = translate_array(temp)
            item['artist'] = temp

        #genre
        temp = response.xpath(
            "//div[contains(@class, 'entry-content')]//div[contains(@class, 'su-column su-column-size-3-6')]//span[contains(@class, 'entry-tags')]/a/text()"
        ).extract()
        if len(temp) == 0:
            item['genre'] = []
        else:
            temp = translate_array(temp)
            item['genre'] = temp

        #lyrics writer
        temp = response.xpath(
            "//div[contains(@class, 'entry-content')]//div[contains(@class, 'su-column su-column-size-2-6')]//span[contains(@class, 'lyrics')]/a/text()"
        ).extract()
        if len(temp) == 0:
            item['lyricWriter'] = []
        else:
            temp = translate_array(temp)
            item['lyricWriter'] = temp

        #music director
        temp = response.xpath(
            "//div[contains(@class, 'entry-content')]//div[contains(@class, 'su-column su-column-size-2-6')]//span[contains(@class, 'music')]/a/text()"
        ).extract()
        if len(temp) == 0:
            item['musicDirector'] = []
        else:
            temp = translate_array(temp)
            item['musicDirector'] = temp

        #key & beat
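        # the h3 text is split on '|' and then on ':'; a missing part falls back to ''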
        temp = response.xpath(
            "//div[contains(@class, 'entry-content')]/h3/text()").extract()[0]
        temp = re.split(r'\|', temp)
        try:
            item['key'] = re.split(':', temp[0])[1].strip()
        except IndexError:
            item['key'] = temp[0].strip()
            item['beat'] = ''
        try:
            item['beat'] = re.split(':', temp[1])[1].strip()
        except:
            item['beat'] = ''

        #views
        try:
            temp = response.xpath(
                "//div[contains(@class, 'entry-content')]/div[contains(@class, 'tptn_counter')]/text()"
            ).extract()[0]
            temp = int(re.sub('[^0-9,]', "", temp).replace(',', ''))
            item['views'] = temp
        except:
            item['views'] = None

        #shares
        try:
            temp = response.xpath(
                "//div[contains(@class, 'entry-content')]//div[contains(@class, 'nc_tweetContainer swp_share_button total_shares total_sharesalt')]/span[contains(@class, 'swp_count')]/text()"
            ).extract()[0]
            temp = int(re.sub('[^0-9,]', "", temp).replace(',', ''))
            item['shares'] = temp
        except:
            item['shares'] = None

        #lyrics
        temp = response.xpath(
            "//div[contains(@class, 'entry-content')]//pre/text()").extract()
        temp_lyric = ''
        new_line_found_1 = True
        new_line_found_2 = False

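        # Strip digits, Latin letters and punctuation from each <pre> block, then
        # rebuild the lyric line by line, collapsing runs of blank lines into a
        # single '\n'.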
        for line in temp:
            line_content = (re.sub(
                r"[\da-zA-Z\-—\[\]\t\@\_\!\#\+\$\%\^\&\*\(\)\<\>\?\|\}\{\~\:\∆\/]",
                "", line)).split('\n')

            for lline in line_content:
                if lline == '' or lline.isspace():
                    if not new_line_found_2:
                        new_line_found_2 = True
                        temp_lyric += '\n'
                else:
                    new_line_found_1 = False
                    new_line_found_2 = False
                    temp_lyric += lline.strip()

            if not new_line_found_1:
                new_line_found_1 = True
                temp_lyric += '\n'

        item['lyric'] = temp_lyric

        yield item
Example #12
    def parse_dir_contents(self, response):
        global translated_dict

        item = LyricsItem()

        # song name
        temp = response.xpath(
            "//div[contains(@class, 'site-inner')]//header[contains(@class, 'entry-header')]/h1/text()"
        ).extract()[0]
        temp = re.split(r'\||–|-', temp)
        item['title'] = temp[1].strip()

        # artist name
        temp = response.xpath(
            "//div[contains(@class, 'entry-content')]//div[contains(@class, 'su-column su-column-size-3-6')]//span[contains(@class, 'entry-categories')]/a/text()"
        ).extract()
        if len(temp) == 0:
            item['artist'] = []
        else:
            temp = translate_array(temp)
            item['artist'] = temp

        # genre
        temp = response.xpath(
            "//div[contains(@class, 'entry-content')]//div[contains(@class, 'su-column su-column-size-3-6')]//span[contains(@class, 'entry-tags')]/a/text()"
        ).extract()
        if len(temp) == 0:
            item['genre'] = []
        else:
            temp = translate_array(temp)
            item['genre'] = temp

        # lyric writer
        temp = response.xpath(
            "//div[contains(@class, 'entry-content')]//div[contains(@class, 'su-column su-column-size-2-6')]//span[contains(@class, 'lyrics')]/a/text()"
        ).extract()
        if len(temp) == 0:
            item['lyricist'] = []
        else:
            temp = translate_array(temp)
            item['lyricist'] = temp

        # music director
        temp = response.xpath(
            "//div[contains(@class, 'entry-content')]//div[contains(@class, 'su-column su-column-size-2-6')]//span[contains(@class, 'music')]/a/text()"
        ).extract()
        if len(temp) == 0:
            item['music_by'] = []
        else:
            temp = translate_array(temp)
            item['music_by'] = temp

        # no of views
        try:
            temp = response.xpath(
                "//div[contains(@class, 'entry-content')]/div[contains(@class, 'tptn_counter')]/text()"
            ).extract()[0]
            temp = int(re.sub('[^0-9,]', "", temp).replace(',', ''))
            item['views'] = temp
        except:
            item['views'] = None

        # no of shares
        try:
            temp = response.xpath(
                "//div[contains(@class, 'entry-content')]//div[contains(@class, 'nc_tweetContainer swp_share_button total_shares total_sharesalt')]/span[contains(@class, 'swp_count')]/text()"
            ).extract()[0]
            temp = int(re.sub('[^0-9,]', "", temp).replace(',', ''))
            item['shares'] = temp
        except:
            item['shares'] = None

        # lyric
        temp = response.xpath(
            "//div[contains(@class, 'entry-content')]//pre/text()").extract()
        lyrics = ''
        line_1 = True
        line_2 = False

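        # Strip digits, Latin letters and punctuation from each <pre> block and
        # collapse runs of blank lines into a single '\n'.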
        for line in temp:
            line_content = (re.sub(
                r"[\da-zA-Z\-—\[\]\t\@\_\!\#\+\$\%\^\&\*\(\)\<\>\?\|\}\{\~\:\∆\/]",
                "", line)).split('\n')

            for lline in line_content:
                if lline == '' or lline.isspace():
                    if not line_2:
                        line_2 = True
                        lyrics += '\n'
                else:
                    line_1 = False
                    line_2 = False
                    lyrics += lline.strip()

            if not line_1:
                line_1 = True
                lyrics += '\n'

        item['lyric'] = lyrics

        # persist the global translated_dict after each parsed item
        with open('../translated_dict.pickle', 'wb') as handle:
            pickle.dump(translated_dict, handle)

        yield item