def find_song_info(self, content):
    """Fill in title, artist, lyricist and composer from the page markup.

    Title and artist are taken from the ``og:title`` and
    ``og:description`` meta tags. Lyricist and composer are taken from the
    text between their ``</dt>`` label and the following ``</dd>``.

    :param content: page markup to search. NOTE(review): presumably
        already-decoded text, since the utf-8 decode that used to be the
        first statement here was commented out -- confirm with the caller.
    :returns: always ``True``. ``lyricist`` / ``composer`` are only
        assigned when a non-empty value was found.
    """
    # The regex needs a literal '|', so the backslash is doubled. The
    # runtime pattern is unchanged, but a lone backslash-pipe is not a
    # valid string escape and newer Python versions warn about it.
    title_pattern = u'<meta property="og:title" content="(.*?) 歌詞 \\|'
    title = common.get_first_group_by_pattern(content, title_pattern)
    # NOTE(review): a failed match is passed straight to the decoder, as
    # before -- confirm htmlspecialchars_decode tolerates an empty result.
    self.title = common.htmlspecialchars_decode(title)

    artist_pattern = u'<meta property="og:description" content="(.*?)が歌う'
    artist = common.get_first_group_by_pattern(content, artist_pattern)
    self.artist = common.htmlspecialchars_decode(artist)

    labels = {
        'lyricist': u'作詞</dt>',
        'composer': u'作曲</dt>',
    }
    closing_tag = '</dd>'
    for attr_name, label in labels.items():
        raw = common.find_string_by_prefix_suffix(content, label, closing_tag, False)
        if not raw:
            # Leave the attribute untouched when the label is absent.
            continue
        text = common.strip_tags(raw).strip()
        setattr(self, attr_name, common.htmlspecialchars_decode(text))

    return True
def parse_song_info(self, url, html):
    """Read title, artist, lyricist and composer out of *html*.

    The title is taken from the first ``<h1>`` element; the other fields
    are matched inside the ``lyric-data`` table after its newlines have
    been removed.

    :param url: page address, used only in the warning message.
    :param html: page markup to search.
    :returns: ``True`` when every field was found, ``False`` otherwise.
    """
    ok = True

    raw_title = common.get_first_group_by_pattern(html, '<h1.*?>(.*)</h1>')
    if not raw_title:
        logging.warning('Failed to parse title of url [%s]', url)
        ok = False
    else:
        without_tags = common.strip_tags(raw_title)
        self.title = common.htmlspecialchars_decode(without_tags).strip()

    field_patterns = {
        'artist': '">([^<]*)</a></span></td></tr>',
        'lyricist': 'td>([^<]*)<br></td></tr><tr>',
        'composer': 'td>([^<]*)<br></td></tr></table>',
    }

    table = common.find_string_by_prefix_suffix(
        html, '<table class="lyric-data">', '</table>')
    # The field patterns assume the table sits on a single line.
    table = table.replace('\n', '')

    for field, regex in field_patterns.items():
        found = common.get_first_group_by_pattern(table, regex)
        if not found:
            logging.warning('Failed to get %s', field)
            ok = False
            continue
        cleaned = common.htmlspecialchars_decode(common.strip_tags(found))
        setattr(self, field, cleaned.strip())

    return ok
def find_song_info(self, url):
    """Download *url* and set title, artist, lyricist and composer.

    The response is decoded as Shift-JIS (undecodable bytes are ignored),
    then the fields are cut out of the info table by start/end markers and
    two regular expressions.

    :param url: address of the song page.
    :returns: always ``True``.
    """
    page = common.get_url_content(url).decode('sjis', 'ignore')

    # Outer table that holds all of the song information.
    info = common.get_string_by_start_end_string(
        '<table border=0 cellpadding=0 cellspacing=5>', '</td></table>', page)

    title_cell = common.get_string_by_start_end_string('<td>', '</td>', info)
    self.title = common.strip_tags(title_cell)

    artist_cell = common.get_string_by_start_end_string(
        '<td><a href=', '</a></td>', info)
    self.artist = common.strip_tags(artist_cell)

    # Nested table with the lyricist / composer credits.
    credits = common.get_string_by_start_end_string(
        '<table border=0 cellpadding=0 cellspacing=0>', '</td></table>', info)
    self.lyricist = common.get_first_group_by_pattern(credits, u'作詞 : (.*)<br>')
    self.composer = common.get_first_group_by_pattern(credits, u'作曲 : (.*)</td>')

    return True
def get_song_id(self, url):
    """Return the numeric song id found in *url*.

    ``/lyrics/<digits>`` is tried first, then ``/kashi/<digits>``.

    :param url: song page address; a falsy value yields ``None``.
    :returns: whatever the pattern helper returns for the first pattern
        that yields a truthy result, otherwise its result for the last
        pattern tried.
    """
    if not url:
        return None

    song_id = None
    for candidate in ('/lyrics/([0-9]+)', '/kashi/([0-9]+)'):
        song_id = common.get_first_group_by_pattern(url, candidate)
        if song_id:
            break
    return song_id
def get_csrf_token(self, html):
    """Fetch the CSRF token referenced by the page's ``pl-lib.js`` script.

    The script path is located in *html*, resolved against ``site_url``,
    downloaded through ``self.s``, and searched for the ``X-CSRF-Token``
    value.

    :param html: page markup containing the script reference.
    :returns: ``None`` when no script path is found, otherwise the result
        of matching the token pattern against the script text.
    """
    script_path = common.get_first_group_by_pattern(html, '(/lib/pl-lib.js[^"]+)')
    if not script_path:
        return None

    script_url = urlparse.urljoin(site_url, script_path)
    response = self.s.get(script_url)
    return common.get_first_group_by_pattern(
        response.text, "'X-CSRF-Token', '(.*?)'")
def find_lyric(self, url):
    """Download the lyric for *url* and store it in ``self.lyric``.

    The item number is taken from ``item-<digits>.html`` in *url* and used
    to request the site's ``item_js.php`` script. The ``document.write``
    wrapper is removed from the response and ``<br>`` becomes a newline
    before the text goes through the ``common`` clean-up helpers.

    :param url: address of the song page.
    :returns: ``True`` on success, ``False`` when the id or the script
        content could not be obtained.
    """
    # Raw string: the dot is escaped for the regex, not for Python.
    song_id = common.get_first_group_by_pattern(url, r'item-([0-9]+)\.html')
    if not song_id:
        logging.info('Failed to get id of url [%s]', url)
        return False

    song_url = 'http://www.kasi-time.com/item_js.php?no=' + song_id
    data = common.get_url_content(song_url)
    if not data:
        logging.info('Failed to get content of url [%s]', song_url)
        return False

    lyric = data.decode('utf-8', 'ignore')
    lyric = lyric.replace("document.write('", "")
    lyric = lyric.replace("');", "")
    lyric = lyric.replace("<br>", "\n")
    # Normalise non-breaking spaces to plain spaces. Both spellings are
    # written out explicitly because a literal no-break space cannot be
    # told apart from an ordinary space when reading the source.
    lyric = lyric.replace('&nbsp;', ' ')
    lyric = lyric.replace(u'\u00a0', u' ')
    lyric = common.htmlspecialchars_decode(lyric)
    lyric = common.unicode2string(lyric)
    lyric = common.strip_slash(lyric)
    lyric = lyric.strip()

    # NOTE(review): judging by its name, half2full widens half-width
    # characters; the original comment called this step a test.
    lyric = common.half2full(lyric)

    self.lyric = lyric
    return True
def find_lyric(self, url):
    """Download the lyric for *url* and store it in ``self.lyric``.

    The song id is taken from the ``surl=`` query parameter and passed to
    the site's ``flashphp.php`` endpoint. Everything after the ``test2=``
    marker in the response is decoded as Shift-JIS and stripped.

    :param url: address of the song page.
    :returns: ``True`` on success, ``False`` when the id, the response or
        the marker is missing.
    """
    song_id = common.get_first_group_by_pattern(url, 'surl=([^&=]+)')
    if not song_id:
        logging.error('Failed to get id of url [%s]', url)
        return False

    song_url = 'http://www.animap.jp/kasi/phpflash/flashphp.php?unum=' + song_id
    payload = common.get_url_content(song_url)
    if not payload:
        logging.error('Failed to get content of url [%s]', song_url)
        return False

    # Keep only what follows the first occurrence of the marker.
    _, marker, tail = payload.partition('test2=')
    if not marker:
        logging.error('Failed to find lyric position of url [%s]', url)
        return False

    text = tail.decode('sjis').strip()

    # Final pass through the project's half2full helper.
    self.lyric = common.half2full(text)
    return True
def find_song_info(self, url):
    """Download *url* and set title, artist, lyricist and composer.

    The response is decoded as Shift-JIS (undecodable bytes are ignored)
    and each field is matched with its own regular expression.

    :param url: address of the song page.
    :returns: ``True`` when every field matched, ``False`` otherwise.
    """
    page = common.get_url_content(url).decode('sjis', 'ignore')

    field_patterns = {
        'title': u'<h2[^>]*>([^<]+)</h2>',
        'artist': u'歌手:<h3.*?><a href="/artist/[0-9]+/".*?>(.+?)</a></h3>',
        'lyricist': u'作詞:<h4.*?>([^<]+)</h4>',
        'composer': u'作曲:<h4.*?>([^<]+)</h4>'
    }

    all_found = True
    for field, regex in field_patterns.items():
        found = common.get_first_group_by_pattern(page, regex)
        if found:
            text = common.strip_tags(found)
            setattr(self, field, common.unicode2string(text))
        else:
            logging.info('Failed to get %s of url [%s]', field, url)
            all_found = False

    return all_found
def find_song_info(self, content):
    """Set title, artist, lyricist and composer from the lyric block.

    Only the section between the ``lyricBlock`` div and the following
    ``</table>`` is examined. The title is the content of its ``<h2>``;
    the remaining fields are matched by labelled table cells.

    :param content: page markup to search.
    :returns: always ``True``; a missing field is only logged at debug
        level.
    """
    block = common.find_string_by_prefix_suffix(
        content, "<div id='lyricBlock'>", '</table>', False)

    heading = common.find_string_by_prefix_suffix(block, '<h2>', '</h2>', False)
    self.title = common.htmlspecialchars_decode(common.unicode2string(heading))

    field_patterns = {
        'artist': u'>歌:(.*?)</td>',
        'lyricist': u'>作詞:(.*?)</td>',
        'composer': u'>作曲:(.*?)</td>'
    }
    for field, regex in field_patterns.items():
        found = common.get_first_group_by_pattern(block, regex)
        if not found:
            logging.debug('Failed to get %s, pattern: %s' % (field, regex, ))
            continue
        decoded = common.htmlspecialchars_decode(found)
        setattr(self, field, common.strip_tags(decoded).strip())

    return True
def find_song_info(self, url):
    """Download *url* and read the song fields from hidden form inputs.

    The response is decoded as EUC-JP (undecodable bytes are ignored).
    Each attribute is read from the ``value`` of the hidden ``<INPUT>``
    whose ``name`` is listed in the mapping below.

    :param url: address of the song page.
    :returns: ``True`` when every field had a non-empty value, ``False``
        otherwise.
    """
    page = common.get_url_content(url).decode('euc_jp', 'ignore')

    # attribute name on self -> name of the hidden form input
    input_names = {
        'title': 'title',
        'artist': 'artist',
        'lyricist': 'sakusi',
        'composer': 'sakyoku',
    }

    success = True
    for attr_name, input_name in input_names.items():
        regex = u'<INPUT type="hidden" name=%s value="([^"]*)">' % (input_name, )
        found = common.get_first_group_by_pattern(page, regex)
        if found:
            setattr(self, attr_name, common.htmlspecialchars_decode(found).strip())
        else:
            logging.info('Failed to get %s of url [%s]', attr_name, url)
            success = False

    return success
def get_xml_parameters(self, url):
    """Download *url* and return the quoted value of its ``query`` entry.

    :param url: address to download.
    :returns: the result of matching ``query : '<value>'`` against the
        downloaded content.
    """
    page = common.get_url_content(url)
    return common.get_first_group_by_pattern(page, "query +: +'([^']+)'")
def find_song_info(self, html):
    """Set title, artist, lyricist and composer from *html*.

    The title is the text of the bold white ``<font>`` element. The other
    fields are matched in the small white ``<font>`` element after spaces
    and line breaks have been removed from it.

    :param html: page markup to search.
    :returns: ``True`` when artist, lyricist and composer were all found;
        ``False`` as soon as one of them is missing.
    """
    title_html = common.find_string_by_prefix_suffix(
        html, '<font color="#FFFFFF"><b>', "</b></font>")
    title_text = common.strip_tags(title_html).strip()
    if title_text:
        self.title = title_text

    credits = common.find_string_by_prefix_suffix(
        html, '<font size="-1" color="#FFFFFF">', "</font>")
    # Collapse the block so the field patterns see one compact string.
    credits = re.sub(" +", "", credits)
    credits = credits.replace("\r", "").replace("\n", "")

    field_patterns = {"artist": u"歌:(.*?)/", "lyricist": u"詞:(.*?)/", "composer": u"曲:(.*?)<"}
    for field, regex in field_patterns.items():
        found = common.get_first_group_by_pattern(credits, regex)
        if not found:
            logging.info("Failed to get %s from info %s", field, credits)
            return False
        setattr(self, field, found)

    return True
def get_lyric_id(self, url):
    """Return the digits that follow ``lyrics/`` in *url*.

    :param url: address of the lyric page.
    :returns: the matched id, or ``False`` (after logging an error) when
        nothing was matched.
    """
    matched = common.get_first_group_by_pattern(url, 'lyrics/([0-9]+)')
    if matched:
        return matched

    logging.error('Failed to parse id of lyric from url')
    return False
def find_song_info(self, html):
    """Set ``artist`` and ``title`` from the page's ``og:title`` meta tag.

    The tag content is split at its first ``:`` into artist (left part)
    and title (right part).

    :param html: page markup to search.
    :returns: ``True`` when both values were extracted, ``False`` when the
        meta tag is missing or its content has no ``:`` separator.
    """
    pattern = '<meta property="og:title" content="([^"]+)" />'
    og_title = common.get_first_group_by_pattern(html, pattern)
    if not og_title:
        # Previously this fell through and failed on the split below.
        logging.error('Failed to find og:title')
        return False

    # Split on the first ':' only, so a title that itself contains ':'
    # stays whole instead of breaking the two-value unpacking.
    artist, separator, title = og_title.partition(':')
    if not separator:
        logging.error('Failed to split og:title [%s]', og_title)
        return False

    self.title = title
    self.artist = artist
    return True
def find_lyric(self, url):
    """Download the lyric for *url* and store it in ``self.lyric``.

    The song id is taken from the URL path, or from an ``ID=`` query
    parameter as a fallback. The ``showkasi.php`` resource for that id is
    downloaded and the lyric is cut out between a ``<`` followed by two
    NUL characters and the next NUL.

    :param url: address of the song page.
    :returns: ``True`` on success, ``False`` when the id, the download or
        the lyric extraction failed.
    """
    song_id = common.get_first_group_by_pattern(url, '/[a-z]+/([0-9]+)/')
    if not song_id:
        # Fall back to the old URL form, e.g.
        # http://www.uta-net.com/user/phplib/view_0.php?ID=17248
        song_id = common.get_first_group_by_pattern(url, 'ID=([0-9]+)')
    if not song_id:
        logging.info('Failed to get id of url [%s]', url)
        return False

    song_url = 'http://www.uta-net.com/user/phplib/swf/showkasi.php?ID=%s&WIDTH=530&HEIGHT=810' % (song_id, )
    payload = common.get_url_content(song_url)
    if not payload:
        logging.info('Failed to get content of url [%s]', song_url)
        return False

    extracted = common.find_string_by_prefix_suffix(payload, '<\0\0', '\0', False)
    if not extracted:
        logging.error('Failed to get lyric of url [%s]', url)
        return False

    text = unicode(extracted, 'utf8').strip()

    # Final pass through the project's half2full helper.
    self.lyric = common.half2full(text)
    return True
def parse_song_info(self, html):
    """Set title, artist, lyricist and composer from *html*.

    The title is matched in the ``<title>`` element. The three credits
    are the capture groups of one pattern applied to the ``artists`` div.

    :param html: page markup to search.
    :returns: ``True`` when the credits pattern matched, ``False``
        otherwise (the title has already been assigned by then).
    """
    self.title = common.get_first_group_by_pattern(
        html, u'<title>(.+?) 歌詞 / .*</title>')

    credits = common.get_matches_by_pattern(
        html, u"<div class='artists'>歌:(.*) 作詞:(.*) 作曲:(.*)</div>")
    if not credits:
        return False

    # Capture groups 1-3 hold the fields in this order.
    for group_no, field in enumerate(('artist', 'lyricist', 'composer'), 1):
        setattr(self, field, credits.group(group_no))
    return True
def find_song_info(self, content):
    """Set artist, title, lyricist and composer from the page markup.

    Artist and title are taken from the ``og:description`` meta tag, with
    the title being the text inside the corner brackets. Lyricist and
    composer are taken from the ``lyrics_info_text`` block.

    :param content: page markup to search.
    :returns: always ``True``; missing pieces are only logged.
    """
    og_desc = common.get_first_group_by_pattern(
        content, 'og:description" content="(.*)"')
    if og_desc:
        matches = common.get_matches_by_pattern(og_desc, u'(.*?)「(.*?)」')
        if matches:
            # Drop the site-name prefix from the artist part.
            self.artist = matches.group(1).replace(u'歌詞サーチ ', '')
            self.title = matches.group(2)
        else:
            logging.debug('og desc: %s', og_desc)

    info_text = common.find_string_by_prefix_suffix(
        content, '="lyrics_info_text"', '</div>', False)
    if not info_text:
        # Without the block there is nothing left to parse. Previously the
        # code logged this and then still tried to process the result.
        logging.info('Failed to find lyrics info text')
        return True

    # The patterns below expect label and value on a single line.
    one_line = info_text.replace('\n', '')

    patterns = {
        'lyricist': u'>作詞</p><p class="info_detail">(.*?)</p>',
        'composer': u'>作曲</p><p class="info_detail">(.*?)</p>',
    }
    for key, pattern in patterns.items():
        value = common.get_first_group_by_pattern(one_line, pattern)
        if value:
            value = common.strip_tags(common.htmlspecialchars_decode(value)).strip()
            setattr(self, key, value)
        else:
            logging.debug('Failed to get %s, pattern: %s', key, pattern)

    return True
def find_song_info(self, html):
    """Set ``title`` and ``artist`` from JSON-style fields in *html*.

    The values of ``"musicSongTitle"`` and ``"musicArtistName"`` are
    matched; an attribute is only assigned when its value is non-empty.

    :param html: page markup to search.
    :returns: always ``True``.
    """
    field_patterns = {
        'title': '"musicSongTitle":"(.*?)"',
        'artist': '"musicArtistName":"(.*?)"'
    }
    for field, regex in field_patterns.items():
        found = common.get_first_group_by_pattern(html, regex)
        if not found:
            continue
        setattr(self, field, found)

    return True
def parse_lyric(self, html):
    """Split the ``<PRE>`` block of *html* into title, credits and lyric.

    The stripped block is read as three sections separated by blank lines:
    title, credit line(s), then the lyric itself. Credits are matched with
    the labelled patterns below.

    :param html: page markup to search.
    :returns: ``True`` when the three sections were found (individual
        credits may still be missing and are only logged), ``False``
        otherwise.
    """
    raw_lyric = common.find_string_by_prefix_suffix(html, '<PRE>', '</PRE>', False)
    if not raw_lyric:
        logging.info('Failed to parse lyric from html [%s]', html)
        return False

    # First blank line: end of the title.
    title, separator, remainder = raw_lyric.strip().partition('\n\n')
    if not separator:
        logging.info('Failed to find two consecutive newlines')
        return False
    self.title = title

    # Second blank line: end of the credits, start of the lyric.
    info, separator, lyric = remainder.partition('\n\n')
    if not separator:
        logging.info('Failed to find two consecutive newlines')
        return False
    self.lyric = lyric

    patterns = {
        'artist': u'歌:(.+)',
        'lyricist': u'作詞:(.+?)/',
        'composer': u'作曲:(.+?)/',
        'arranger': u'編曲:(.+?)/',
    }
    for key, pattern in patterns.items():
        value = common.get_first_group_by_pattern(info, pattern)
        if not value:
            # The old message formatted a `url` variable that does not
            # exist in this method, so a missing field raised NameError.
            logging.info('Failed to get %s from info [%s]', key, info)
            continue
        setattr(self, key, common.htmlspecialchars_decode(value).strip())

    return True
def find_song_info(self, url):
    """Download *url* and set title, artist, lyricist, composer, arranger.

    The response is decoded as UTF-8 (undecodable bytes are ignored). The
    title is the content of ``<h1>``; the people are read from the
    ``person_list`` block, each from the cell following its ``<th>``
    label.

    :param url: address of the song page.
    :returns: ``False`` when the page could not be downloaded or the
        title is missing, ``True`` otherwise. Missing people do not
        affect the result.
    """
    content = common.get_url_content(url)
    if not content:
        logging.error('Failed to get content of url [%s]', url)
        return False
    html = content.decode('utf-8', 'ignore')

    ret = True
    title = common.get_first_group_by_pattern(html, '<h1>(.*)</h1>')
    # Strip only after confirming there was a match. Stripping first made
    # the failure branch unreachable when the helper returned nothing.
    title = title.strip() if title else title
    if title:
        self.title = title
    else:
        logging.error('Failed to find title of url [%s]', url)
        ret = False

    info_table = common.find_string_by_prefix_suffix(
        html, '<div class="person_list">', '</div>')
    if not info_table:
        # Nothing to search in; keep the result decided by the title.
        logging.info('Failed to find person list of url [%s]', url)
        return ret

    labels = {
        'artist': u'歌手',
        'lyricist': u'作詞者',
        'composer': u'作曲者',
        'arranger': u'編曲者',
    }
    for key, label in labels.items():
        prefix = u'<th>%s</th>' % (label)
        value = common.find_string_by_prefix_suffix(info_table, prefix, '</td>', False)
        if not value:
            continue
        value = common.strip_tags(value).strip()
        if value:
            setattr(self, key, value)

    return ret
def find_lyric(self, url):
    """Download the lyric for *url* and store it in ``self.lyric``.

    The song id is the run of digits after ``?`` in *url*. It is combined
    with the values from ``self.get_hidden_params()`` into a query that is
    sent to the site's ``kashi.cgi``. The lyric is whatever follows the
    ``kashi=`` marker in the UTF-8 decoded response.

    :param url: address of the song page.
    :returns: ``True`` when a lyric (or the "site rule changed" notice)
        was stored, ``False`` when the id, the hidden parameters, the
        response or the ``kashi=`` marker is missing.
    """
    # Raw string: the question mark is escaped for the regex, not Python.
    song_id = common.get_first_group_by_pattern(url, r'\?([0-9]+)')
    if not song_id:
        logging.error('Failed to get id of url [%s]', url)
        return False

    params = self.get_hidden_params()
    if not params:
        logging.error('Failed to get hidden params for url [%s]', url)
        return False

    # NOTE(review): time.localtime() is formatted as-is, so the query
    # carries the struct_time repr rather than a timestamp. Left unchanged
    # because the server's expectations are not visible here.
    query = '%s%s&time=%s' % (params['query_prefix'], song_id, time.localtime(), )
    logging.debug('query:%s', query)

    post_url = 'http://www.kashinavi.com/cgi-bin/kashi.cgi'
    resp = common.get_url_content(post_url, query)
    if not resp:
        logging.error('Failed to get content of url [%s], query [%s]', post_url, query)
        return False

    raw_lyric = resp.decode('utf-8', 'ignore')

    # If the parsing rule changed, store debug info instead of a lyric.
    # A membership test also catches a match at index 0, which the old
    # `find(...) > 0` check missed.
    if u'歌詞ナビTOPページより' in raw_lyric:
        self.lyric = ''' Site rule changed! Please contact franklai LoadVars::%s::myLoadVars ''' % (params['middle_value'])
        return True

    # Keep only what follows the marker. When the marker is absent, fail
    # explicitly: the old arithmetic on find()'s -1 silently dropped the
    # first five characters of the response.
    _, marker, lyric = raw_lyric.partition('kashi=')
    if not marker:
        logging.error('Failed to find lyric position of url [%s]', url)
        return False

    self.lyric = lyric.strip()
    return True
def get_song_id(self, url):
    """Return the digits that follow ``/song/`` in *url*.

    :param url: address of the song page.
    :returns: whatever the pattern helper returns for the match.
    """
    return common.get_first_group_by_pattern(url, '/song/([0-9]+)')