end_str = conf['block_end'] sub_content = '' if special_str in content: start_index = content.index(special_str) sub_content = content[start_index + len(special_str):] end_index = sub_content.index(end_str) sub_content = sub_content[0:end_index] #sub_content = re.sub(r'</?\w+[^>]*>',' ', sub_content) ### end_mark = conf['end_mark'] song_mark = conf['song_mark'] singer_mark = conf['singer_mark'] find_index = 0 sub_content = tools.str_replace(sub_content) i = 0 while song_mark in sub_content: #{ song = '' singer = '' if 'simple_parse' in conf and conf['simple_parse']: start_index = sub_content.index(song_mark) sub_content = sub_content[start_index + len(song_mark):] end_index = sub_content.index(end_mark) html = sub_content[0:end_index] song_singer = callback(html, conf['callback']) #tools.debug(song_singer, 1) song = song_singer['song']
def xiami_parse_detail(self, html, module, conf, info_obj, fp_obj, logger):
    """Extract album detail fields from an album detail page.

    The markup is scanned for consecutive ``<td valign="top"`` cells, which
    are assigned in order to ``artist``, ``language``, ``company``,
    ``publish_time`` and ``album_type``.  The ``album_intro`` block is then
    extracted, stripped of tags and HTML-unescaped.  Everything is collected
    in the local ``detail_data`` dict.

    Args:
        html: Page markup; passed through ``tools.str_replace`` before parsing.
        module: Name used only in the log message.
        conf: Not used by this function.
        info_obj: Mapping that provides ``"url"`` and ``"album"``.
        fp_obj: Mapping that provides ``"dp_fp"`` and ``"dd_fp"``.
        logger: Logger-like object exposing ``info``.

    Raises:
        KeyError: If ``info_obj`` or ``fp_obj`` lacks one of the keys above.
    """
    parser = HTMLParser.HTMLParser()
    # Both handles are fetched up front, so a malformed fp_obj fails before
    # any parsing starts; they are not otherwise used in this body.
    dp_fp = fp_obj["dp_fp"]
    dd_fp = fp_obj["dd_fp"]
    url = info_obj["url"]
    logger.info("Start parse url for %s, url: %s" % (module, url))

    html = tools.str_replace(html)
    detail_data = {
        "url": url,
        "album": info_obj["album"],
        "artist": "",
        "language": "",
        "company": "",
        "publish_time": "",
        "album_type": "",
        "album_intro": "",
    }

    # Each field lives in its own <td valign="top"> cell, in this order.
    cell_start = '<td valign="top"'
    cell_end = "</td>"
    fields = ("artist", "language", "company", "publish_time", "album_type")
    for tag in fields:
        s_index = html.find(cell_start)
        if s_index == -1:
            break
        e_index = html.find(cell_end, s_index)
        if e_index == -1:
            # Unterminated cell: stop instead of slicing with find()'s -1,
            # which would store a few garbage characters as the field value.
            break
        stop = e_index + len(cell_end)
        detail_data[tag] = tools.strip_html_tag(html[s_index:stop])
        # Consume the cell so the next field starts after it.
        html = html[stop:]

    # Album intro: everything from the intro div up to the toggle div.
    intro_start = '<div id="album_intro"'
    intro_end = '<div class="album_intro_toggle'
    s_index = html.find(intro_start)
    if s_index != -1:
        e_index = html.find(intro_end, s_index)
        if e_index == -1:
            # No toggle marker after the intro: take the rest of the markup
            # rather than letting -1 silently drop the final character.
            e_index = len(html)
        data = tools.strip_html_tag(html[s_index:e_index])
        try:
            data = parser.unescape(data)
        except (UnicodeDecodeError, UnicodeEncodeError) as e:
            # Best effort: keep the still-escaped text when decoding fails.
            logger.info("Parse data error. [Exception]: %s" % (e))
        detail_data["album_intro"] = data
def xiami_parse_detail(self, html, module, conf, info_obj, fp_obj, logger):
    """Extract album detail fields from an album detail page.

    The markup is scanned for consecutive ``<td valign="top"`` cells, which
    are assigned in order to ``artist``, ``language``, ``company``,
    ``publish_time`` and ``album_type``.  The ``album_intro`` block is then
    extracted, stripped of tags and HTML-unescaped.  Everything is collected
    in the local ``detail_data`` dict.

    Args:
        html: Page markup; passed through ``tools.str_replace`` before parsing.
        module: Name used only in the log message.
        conf: Not used by this function.
        info_obj: Mapping that provides ``'url'`` and ``'album'``.
        fp_obj: Mapping that provides ``'dp_fp'`` and ``'dd_fp'``.
        logger: Logger-like object exposing ``info``.

    Raises:
        KeyError: If ``info_obj`` or ``fp_obj`` lacks one of the keys above.
    """
    parser = HTMLParser.HTMLParser()
    # Both handles are fetched up front, so a malformed fp_obj fails before
    # any parsing starts; they are not otherwise used in this body.
    dp_fp = fp_obj['dp_fp']
    dd_fp = fp_obj['dd_fp']
    url = info_obj['url']
    logger.info('Start parse url for %s, url: %s' % (module, url))

    html = tools.str_replace(html)
    detail_data = {
        'url': url,
        'album': info_obj['album'],
        'artist': '',
        'language': '',
        'company': '',
        'publish_time': '',
        'album_type': '',
        'album_intro': '',
    }

    # Each field lives in its own <td valign="top"> cell, in this order.
    cell_start = '<td valign="top"'
    cell_end = '</td>'
    fields = ('artist', 'language', 'company', 'publish_time', 'album_type')
    for tag in fields:
        s_index = html.find(cell_start)
        if s_index == -1:
            break
        e_index = html.find(cell_end, s_index)
        if e_index == -1:
            # Unterminated cell: stop instead of slicing with find()'s -1,
            # which would store a few garbage characters as the field value.
            break
        stop = e_index + len(cell_end)
        detail_data[tag] = tools.strip_html_tag(html[s_index:stop])
        # Consume the cell so the next field starts after it.
        html = html[stop:]

    # Album intro: everything from the intro div up to the toggle div.
    intro_start = '<div id="album_intro"'
    intro_end = '<div class="album_intro_toggle'
    s_index = html.find(intro_start)
    if s_index != -1:
        e_index = html.find(intro_end, s_index)
        if e_index == -1:
            # No toggle marker after the intro: take the rest of the markup
            # rather than letting -1 silently drop the final character.
            e_index = len(html)
        data = tools.strip_html_tag(html[s_index:e_index])
        try:
            data = parser.unescape(data)
        except (UnicodeDecodeError, UnicodeEncodeError) as e:
            # Best effort: keep the still-escaped text when decoding fails.
            logger.info('Parse data error. [Exception]: %s' % (e))
        detail_data['album_intro'] = data
def xiami_parse_url(self, html, module, conf, fp, logger):
    """Parse an album listing and write one tab-separated row per ``<li>``.

    For every ``<li>...</li>`` item the first ``href`` is turned into an
    absolute ``http://www.xiami.com`` URL, then the album, singer and year
    elements are extracted in that order.  Each item is written to ``fp`` as
    ``album<TAB>singer<TAB>url<TAB>year`` followed by a newline.

    Args:
        html: Listing markup; passed through ``tools.str_replace`` first.
        module: Name used only in log messages.
        conf: Not used by this function.
        fp: Writable file-like object receiving the rows.
        logger: Logger-like object exposing ``info`` and ``warn``.

    Returns:
        Always 0.
    """
    logger.info("Start parse url for %s" % module)

    start_tag = "<li>"
    end_tag = "</li>"
    deep_url = "http://www.xiami.com%s"
    u_start = 'href="'
    u_end = '"'
    # (output key, opening marker, closing tag), in document order.
    parse = (
        ("album", '<a class="song"', "</a>"),
        ("singer", '<a class="singer"', "</a>"),
        ("year", '<p class="year"', "</p>"),
    )

    html = tools.str_replace(html)
    while True:
        start_index = html.find(start_tag)
        if start_index == -1:
            break
        # Look for the close tag *after* the open tag so a stray or missing
        # </li> cannot produce an empty or garbage item.
        end_index = html.find(end_tag, start_index)
        if end_index == -1:
            break
        sub_html = html[start_index + len(start_tag):end_index]
        # Consume this item so the next iteration starts after it.
        html = html[end_index + len(end_tag):]

        # "singer" (not "artist") is the key read when the row is formatted.
        parse_info = {"album": "", "singer": "", "url": "", "year": ""}

        # Deep page url; "#" is kept when the item has no usable link.
        url = "#"
        u_s_index = sub_html.find(u_start)
        if u_s_index != -1:
            after_href = sub_html[u_s_index + len(u_start):]
            href, quote, rest = after_href.partition(u_end)
            if quote:
                sub_html = rest
                # An empty href would previously raise IndexError on url[0].
                if href:
                    if href.startswith("/"):
                        url = deep_url % href
                    else:
                        url = deep_url % ("/%s" % href)
        parse_info["url"] = url

        # Album name, singer and year.
        for name, start, end in parse:
            data = ""
            s_index = sub_html.find(start)
            if s_index != -1:
                e_index = sub_html.find(end, s_index)
                # An unterminated element leaves the field empty and the
                # remaining markup untouched for the following fields.
                if e_index != -1:
                    stop = e_index + len(end)
                    data = tools.strip_html_tag(sub_html[s_index:stop])
                    sub_html = sub_html[stop:]
            parse_info[name] = data

        # album singer url year
        w_str = "%s\t%s\t%s\t%s\n" % (
            parse_info["album"],
            parse_info["singer"],
            parse_info["url"],
            parse_info["year"],
        )
        try:
            fp.write(w_str)
        except Exception:
            # Best effort: a failed row is logged and skipped, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            logger.warn("Write data fail for %s." % module)

    logger.info("parse url is completed for %s." % module)
    return 0
def xiami_parse_url(self, html, module, conf, fp, logger):
    """Parse an album listing and write one tab-separated row per ``<li>``.

    For every ``<li>...</li>`` item the first ``href`` is turned into an
    absolute ``http://www.xiami.com`` URL, then the album, singer and year
    elements are extracted in that order.  Each item is written to ``fp`` as
    ``album<TAB>singer<TAB>url<TAB>year`` followed by a newline.

    Args:
        html: Listing markup; passed through ``tools.str_replace`` first.
        module: Name used only in log messages.
        conf: Not used by this function.
        fp: Writable file-like object receiving the rows.
        logger: Logger-like object exposing ``info`` and ``warn``.

    Returns:
        Always 0.
    """
    logger.info('Start parse url for %s' % module)

    start_tag = '<li>'
    end_tag = '</li>'
    deep_url = 'http://www.xiami.com%s'
    u_start = 'href="'
    u_end = '"'
    # (output key, opening marker, closing tag), in document order.
    parse = (
        ('album', '<a class="song"', '</a>'),
        ('singer', '<a class="singer"', '</a>'),
        ('year', '<p class="year"', '</p>'),
    )

    html = tools.str_replace(html)
    while True:
        start_index = html.find(start_tag)
        if start_index == -1:
            break
        # Look for the close tag *after* the open tag so a stray or missing
        # </li> cannot produce an empty or garbage item.
        end_index = html.find(end_tag, start_index)
        if end_index == -1:
            break
        sub_html = html[start_index + len(start_tag):end_index]
        # Consume this item so the next iteration starts after it.
        html = html[end_index + len(end_tag):]

        # 'singer' (not 'artist') is the key read when the row is formatted.
        parse_info = {'album': '', 'singer': '', 'url': '', 'year': ''}

        # Deep page url; '#' is kept when the item has no usable link.
        url = '#'
        u_s_index = sub_html.find(u_start)
        if u_s_index != -1:
            after_href = sub_html[u_s_index + len(u_start):]
            href, quote, rest = after_href.partition(u_end)
            if quote:
                sub_html = rest
                # An empty href would previously raise IndexError on url[0].
                if href:
                    if href.startswith('/'):
                        url = deep_url % href
                    else:
                        url = deep_url % ('/%s' % href)
        parse_info['url'] = url

        # Album name, singer and year.
        for name, start, end in parse:
            data = ''
            s_index = sub_html.find(start)
            if s_index != -1:
                e_index = sub_html.find(end, s_index)
                # An unterminated element leaves the field empty and the
                # remaining markup untouched for the following fields.
                if e_index != -1:
                    stop = e_index + len(end)
                    data = tools.strip_html_tag(sub_html[s_index:stop])
                    sub_html = sub_html[stop:]
            parse_info[name] = data

        # album singer url year
        w_str = '%s\t%s\t%s\t%s\n' % (
            parse_info['album'],
            parse_info['singer'],
            parse_info['url'],
            parse_info['year'])
        try:
            fp.write(w_str)
        except Exception:
            # Best effort: a failed row is logged and skipped, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            logger.warn('Write data fail for %s.' % module)

    logger.info('parse url is completed for %s.' % module)
    return 0