def _parse_html(self, doc):
    """Collect user profile fields from a parsed profile page.

    ``doc`` is called with CSS selectors and is expected to return objects
    supporting ``children``/``attr``/``items`` (presumably a PyQuery
    document -- confirm against the caller).

    Returns a dict that always has ``'name'`` (the ``alt`` attribute of the
    image nested under ``#user_header > #user_sub``) and, depending on what
    the page contains, any of ``'height'``, ``'sex'``, ``'age'``,
    ``'hairstyle'``, ``'location'``, ``'brandLike'`` and ``'brandUse'``.
    """
    header = doc('#user_header')
    avatar = (header.children('#user_sub')
              .children('.image')
              .children('.img')
              .children('img'))
    profile = {'name': avatar.attr('alt')}

    # The first child of the info list is deliberately left out; every
    # remaining entry is classified purely by the shape of its text.
    for node in header('ul.info').children()[1:]:
        text = PyQuery(node).text()
        if text.endswith('cm'):
            key = 'height'
        elif text in ('MEN', 'WOMEN', 'KIDS'):
            key = 'sex'
        elif text.endswith(u'嵗'):
            key = 'age'
        elif u'髮' in text:
            key = 'hairstyle'
        else:
            # Anything not recognised above is stored as the location.
            key = 'location'
        profile[key] = text

    # Both brand lists are read the same way: the text of every <li> under
    # the section's <ul>, joined with '|'. An empty list adds no key.
    sections = (
        ('brandLike', header, '.favorite'),
        ('brandUse', doc, '#gbl_related_link'),
    )
    for key, root, selector in sections:
        names = [item.text() for item in root(selector)('ul').items('li')]
        if names:
            profile[key] = '|'.join(names)

    return profile
def get_diy_links(host, html, pattern, suffix):
    """Yield links found under ``pattern`` in ``html`` that end with ``suffix``.

    Args:
        host: Prefix prepended to any href that has no network location
            (i.e. a relative link). It is concatenated as-is, so it must
            already be in the form the caller wants.
        html: Markup handed to ``PyQuery`` for parsing.
        pattern: CSS selector of the container(s); every ``a`` element
            below it is inspected.
        suffix: Only hrefs ending with this string are yielded.

    Yields:
        UTF-8 encoded href strings, in document order.

    If the selector cannot be evaluated the generator simply ends without
    yielding anything. Anchors that have no ``href`` attribute are skipped.
    """
    d = PyQuery(html)
    selector = '{} a'.format(pattern)
    try:
        link_list = d(selector)
    except Exception:
        # Best effort: an unusable selector means "no links". End the
        # generator instead of yielding a stray None to the consumer.
        return

    for link in link_list:
        href = PyQuery(link).attr('href')
        if href is None:
            # <a> without an href (e.g. a named anchor): nothing to follow.
            continue
        href = href.encode('utf-8')
        # No netloc means the link is relative to the site; prefix the host.
        if not urlparse.urlparse(href).netloc:
            href = host + href
        if href.endswith(suffix):
            yield href