def parsing(cls, url, configs=None, outers=None, document=None, check=None):
    """Parse *url* with every domain config that matches it.

    :param url: page URL
    :type url: str
    :param configs: general configuration
    :type configs: dict
    :param outers: external-link configuration
    :type outers: dict
    :param document: page content; fetched from *url* when not supplied
    :type document: str
    :param check: fields that must be extracted; when any is missing the
        empty formatted result is returned
    :type check: list
    :return: whether a usable domain config exists (bool) and the parsed
        result (dict)
    :rtype: bool, dict
    """
    required = ["content"] if check is None else check
    page = document if document else download_page(url)
    matched = False
    for cfg in Matcher.match(url, configs=configs, outers=outers):
        matched = True
        extracted = cls.extract(url, page, cfg)
        # Accept the first config whose extraction yields all required fields.
        if all(extracted[field] for field in required):
            return matched, extracted
    return matched, Formatter.format_result()
def retrieve_incomes(movie_id, full_week=False, use_cumes=False):
    '''Retrieve income data for movie_ids from BoxOfficeMojo.'''

    def parse_incomes(page, full_week, use_cumes):
        '''Parse BoxOfficeMojo page for movie incomes.'''
        # Each branch pairs a page-specific regex with the capture group
        # holding the dollar amount.
        if use_cumes and full_week:
            pattern = r'<font color="#800080" size="2">\$([0-9,]*?) / ([0-9]*?)</font>'
            raw = [m[0] for m in re.findall(pattern, page)]
        elif use_cumes:
            pattern = r'<td align="right"><font size="2">\$([0-9,]*?)</font></td><td align="center"><font size="2">([0-9]*?)</font></td></tr>'
            raw = [m[0] for m in re.findall(pattern, page)]
        elif full_week:
            pattern = r'<font color="#000080">\$([0-9,]*?)</font>'
            raw = re.findall(pattern, page)
        else:
            pattern = r'<td align="center"><font size="2">([0-9\-]*?)</font></td><td align="right"><font size="2">\$([0-9,]*?)</font></td>'
            raw = [m[1] for m in re.findall(pattern, page)]
        # Strip thousands separators and convert to plain integers.
        return [int(amount.replace(",", "")) for amount in raw]

    # Daily figures live on the "daily" page, weekend figures on "weekend".
    kind = "daily" if full_week else "weekend"
    url = "http://www.boxofficemojo.com/movies/?page=%s&id=%s.htm" % (kind, movie_id)

    resp = download_page(url)
    if resp is None:
        logging.warning("BoxOfficeMojo movie not found: %s" % url)
        return None
    return {'id': movie_id, 'values': parse_incomes(resp, full_week, use_cumes)}
def parse_friends_URL(url, only_id=True):
    '''Return list of friends and, if not only_id, page count and is artist.'''

    def parse_friends_seed_name(friends_page):
        # Unused option to load a profile name from the friends page.
        # Unfortunately, depends on the page language, and works only by adding
        # a cookie manager to open us.myspace.com, or else directly the cookie
        # MSCulture with IPCulture=en-US using ClientCookie in download.py.
        seed_name_pattern = '<span class="feature_headtext"> amigos de(.*?)</span>'
        try:
            return re.search(seed_name_pattern, friends_page).group(1)
        except AttributeError:  # re.search returned None: no match
            return False

    def parse_friends_list(friends_page):
        '''Returns the list of friend IDs in the page.'''
        friends = re.findall(friends_list_pattern, friends_page)
        # Remove deleted accounts looking for empty friend[2]
        matches = [re.search(' title="(.*?)"', f[2]) for f in friends]
        return [{"id": int(friends[i][0]), "name": m.group(1), "url": friends[i][1]}
                for i, m in enumerate(matches) if m is not None]

    def parse_friends_page_count(friends_page):
        '''Returns the number of friends pages in the page.'''
        match = re.search(page_count_pattern, friends_page)
        try:
            # "".join keeps this working on both Python 2 (filter -> str)
            # and Python 3 (filter -> iterator).
            return int("".join(filter(is_digit, match.group(1))))
        except (AttributeError, TypeError, ValueError):
            # AttributeError: no match; TypeError/ValueError: unparsable digits.
            return 1

    def parse_friends_is_artist(friends_page):
        '''Returns True if the page belongs to a musician.'''
        return friends_page.find(is_artist_pattern) > 0

    resp = download_page(url)
    friends = count_pages = is_artist = None
    if resp is None:
        logging.debug("URL error on: %s" % url)
    else:
        friends = parse_friends_list(resp)
        if not only_id:
            count_pages = parse_friends_page_count(resp)
            is_artist = parse_friends_is_artist(resp)
            # name = parse_friends_seed_name(resp)
    if only_id:
        return friends
    else:
        return friends, count_pages, is_artist
def parse_profile_URL(url, only_artists=True):
    '''Return detail of profile. As for now, implemented ONLY for artists.'''

    def parse_profile_id(profile_page):
        '''Returns the ID of a profile from the profile page.'''
        match = re.search(profile_id_pattern, profile_page)
        try:
            # "".join keeps this working on both Python 2 (filter -> str)
            # and Python 3 (filter -> iterator).
            return int("".join(filter(is_digit, match.group(1))))
        except (AttributeError, TypeError, ValueError):
            # AttributeError: no match; TypeError/ValueError: non-numeric capture.
            return None

    def parse_profile_name(profile_page):
        '''Returns the name of a profile from the profile page.'''
        try:
            return re.search(profile_name_pattern, profile_page).group(1)
        except AttributeError:  # re.search returned None: no match
            return None

    def parse_profile_suffix(profile_page):
        '''Returns the URL suffix from the profile page.'''
        try:
            return re.search(profile_url_pattern, profile_page).group(1)
        except AttributeError:  # re.search returned None: no match
            return None

    # If it's not a musician, then it's
    # <span class="urlLink"><a href="http://www.myspace.com/bellatopa" title="Perfil MySpace para Antonella" class="url">www.myspace.com/bellatopa</a></span>
    resp = download_page(url)
    if resp is None:
        logging.debug("URL error on: %s" % url)
        return None

    # Renamed from `id` to avoid shadowing the builtin.
    profile_id = parse_profile_id(resp)
    if profile_id is None:  # TO ADD: and only_artists:
        logging.debug("Profile is not an artist %s" % url)
        return None

    name = parse_profile_name(resp)
    suffix = parse_profile_suffix(resp)
    return {"id": profile_id, "name": name, "url": suffix}
# -*- coding: utf-8 -*-
# Fixed the encoding declaration: PEP 263 requires "coding:" (or "coding=")
# with no space before the separator; "coding = utf-8" was silently ignored.
import re

import download

# Download the NetEase front page and extract (href, text) pairs of anchors.
url = 'http://www.163.com/'
html = download.download_page(url)
# Bug fix: re.findall was called without the target string, which raises
# TypeError at runtime; search the downloaded page content.
urls = re.findall('<a href="(.*?)">(.*?)</a>', html)