def parse(html):
    """Return a list of dictionaries describing the stories on the front page.

    Each dictionary has the keys ``pos`` (1-based position among the parsed
    stories), ``title``, ``url``, ``domain``, ``comments``, ``submitter``,
    ``points``, ``id`` and ``ago``.

    Rows without a ``.subtext`` block (the "More" link) and rows whose
    subtext has fewer than two children (ads) are skipped.

    Args:
        html: markup of the front page; handed to PyQuery as-is.

    Returns:
        A list of dicts, one per parsed story, in page order.
    """
    elements = []
    p = PyQuery(html)
    # 90s markup woohoo!
    anchors = p('.title:nth-child(3) a:nth-child(1)')
    for a in anchors:
        # have to re-wrap here, because PyQuery just exposes internal lxml
        # objects upon getting iterated
        a = PyQuery(a)
        subtext = a.closest('tr').next().find('.subtext')
        if not subtext:
            # More link
            continue
        # Build a real list: on Python 3 ``map`` returns a lazy iterator that
        # cannot be indexed, so ``children[0]`` would raise TypeError and slip
        # past the IndexError guard below.
        children = [PyQuery(child) for child in subtext.children()]
        try:
            span, submitted, comments = children[0], children[1], children[-1]
        except IndexError:
            # filter out ads
            continue
        # Keep what precedes the last space of the comments link text; when
        # the text has no space the count is treated as 0.
        comments = comments.text().rpartition(' ')[0]
        comments = int(comments) if comments else 0
        url = a.attr('href')
        elements.append({
            'pos': len(elements) + 1,
            'title': a.text(),
            'url': url,
            # keep only what follows the last 'www.' in the host name
            'domain': urlparse(url).netloc.rpartition('www.')[2],
            'comments': comments,
            'submitter': submitted.text(),
            # first whitespace-separated token of the score text
            'points': int(span.text().split()[0]),
            # numeric part after the first underscore of the span's id
            'id': int(span.attr('id').split('_', 1)[1]),
            # text trailing the submitter element, up to the word 'ago'
            'ago': submitted[0].tail.split('ago')[0].strip(),
        })
    logging.warning('parsed %s elements', len(elements))
    return elements
def get_subforums_infos(self, html):
    """
    Get information (description, number of topics and posts, ...)
    about the forums listed on a page.

    For every ``a.forumlink`` anchor whose cleaned href fully matches
    ``/<id>-...`` (``<id>`` being ``f`` or ``c`` followed by digits), the
    entry ``self.forums[<id>]`` is updated in place with ``status``,
    ``description``, ``num_topics`` and ``num_posts``, all read from the
    table row containing the anchor.

    Args:
        html: markup of the page; handed to PyQuery as-is.

    Raises:
        ValueError: if the third or fourth cell of a row does not hold
            text that ``int()`` accepts.

    Note:
        The ``self.forums[<id>]`` lookup is not guarded here; what happens
        for an unknown id depends on the container type, which is not
        visible in this block.
    """
    document = PyQuery(html)
    idpattern = re.compile(r"/([fc]\d+)-.*")

    for element in document("a.forumlink"):
        e = PyQuery(element)

        href = e.attr("href")
        if href is None:
            # Anchor without a target: there is no id to extract.
            continue

        match = idpattern.fullmatch(clean_url(href))
        if not match:
            continue

        oldid = match.group(1)
        row = e.closest("tr")
        forum = self.forums[oldid]

        # Get forum status ("verrouillé" is French for "locked").
        # attr() gives None when the row has no status image or the image
        # has no alt text; treat that as "not locked" instead of raising
        # TypeError on the substring test.
        alt = row("td:nth-of-type(1) img").eq(0).attr("alt") or ""
        forum.status = 1 if "verrouillé" in alt else 0

        # Get subforum description (second span of the second cell)
        forum.description = row("td:nth-of-type(2) span").eq(1).html() or ""

        # TODO : Get subforum icon

        # Get subforum numbers of topics and posts
        forum.num_topics = int(row("td").eq(2).text())
        forum.num_posts = int(row("td").eq(3).text())
def get_subforums_infos(self, html):
    """
    Get information (description, number of topics and posts, ...)
    about the forums listed on a page.

    For every ``a.forumlink`` anchor whose cleaned href fully matches
    ``/<id>-...`` (``<id>`` being ``f`` or ``c`` followed by digits), the
    entry ``self.forums[<id>]`` is updated in place with ``status``,
    ``description``, ``num_topics`` and ``num_posts``, all read from the
    table row containing the anchor.

    Args:
        html: markup of the page; handed to PyQuery as-is.

    Raises:
        ValueError: if the third or fourth cell of a row does not hold
            text that ``int()`` accepts.

    Note:
        The ``self.forums[<id>]`` lookup is not guarded here; what happens
        for an unknown id depends on the container type, which is not
        visible in this block.
    """
    document = PyQuery(html)
    idpattern = re.compile(r"/([fc]\d+)-.*")

    for element in document("a.forumlink"):
        e = PyQuery(element)

        href = e.attr("href")
        if href is None:
            # Anchor without a target: there is no id to extract.
            continue

        match = idpattern.fullmatch(clean_url(href))
        if not match:
            continue

        oldid = match.group(1)
        row = e.closest("tr")
        forum = self.forums[oldid]

        # Get forum status ("verrouillé" is French for "locked").
        # attr() gives None when the row has no status image or the image
        # has no alt text; treat that as "not locked" instead of raising
        # TypeError on the substring test.
        alt = row("td:nth-of-type(1) img").eq(0).attr("alt") or ""
        forum.status = 1 if "verrouillé" in alt else 0

        # Get subforum description (second span of the second cell)
        forum.description = row("td:nth-of-type(2) span").eq(1).html() or ""

        # TODO : Get subforum icon

        # Get subforum numbers of topics and posts
        forum.num_topics = int(row("td").eq(2).text())
        forum.num_posts = int(row("td").eq(3).text())