def parse_profile_html(document):
    """
    Parse an LXML document to retrieve the profile data

    Every field is optional: a key only appears in the result when the
    matching element (or the matching pattern inside the info text) was
    found in the document.

    :param document: the LXML document to parse
    :return: a dictionary representing the profile
    """
    username_elements = document.xpath("//*[@class='author']")
    registered_elements = document.xpath("//*[@class='registered']")
    avatar_elements = document.xpath("//*[@class='title']//img")
    info_elements = document.xpath("//*[@class='info']")
    userid_elements = document.xpath("//*[@name='userid']")

    profile = {}

    if userid_elements:
        profile["id"] = userid_elements[0].attrib["value"]

    if username_elements:
        profile["username"] = username_elements[0].text_content()

    if registered_elements:
        profile["registered"] = registered_elements[0].text_content()

    if avatar_elements:
        # An <img> without a src attribute (or with an empty one) must not
        # raise: fall back to an empty string instead of indexing blindly.
        profile["avatar"] = avatar_elements[0].attrib.get("src") or ""
        # Derived from the avatar URL, so only meaningful when one was found.
        profile["is_newbie"] = profile["avatar"].endswith("/images/newbie.gif")

    if info_elements:
        info_text = info_elements[0].text_content()

        # (result key, pattern) pairs; group 1 of each match is stored as-is.
        info_patterns = (
            ("post_count", r"Post Count(\d+)"),
            ("post_rate", r"Post Rate([\d\.]+)"),
            ("last_post", r"Last Post(.+)"),
        )
        for key, pattern in info_patterns:
            match = re.search(pattern, info_text)
            if match:
                profile[key] = match.group(1)

        gender = re.search(r"claims to be a ([-a-z0-9 ]+)", info_text)
        if gender:
            profile["gender"] = gender.group(1).lower()

    # Prefer the numeric id for the link; fall back to the username.
    if "id" in profile:
        profile["profile_link"] = http.prepare_url(
            PROFILE_URL, {"action": "getinfo", "userid": profile["id"]})
    elif "username" in profile:
        profile["profile_link"] = http.prepare_url(
            PROFILE_URL, {"action": "getinfo", "username": profile["username"]})

    return profile
def parse_thread_html(document):
    """
    Parse an LXML document to retrieve the thread data

    :param document: the LXML document to parse
    :return: a dictionary representing the thread (including a ``posts``
             mapping of post element id -> (author, date, body) tuples), or
             None when the document lacks the breadcrumbs, the author
             element or a parseable thread id
    """
    import re  # function-scope import keeps this block self-contained

    breadcrumbs_elements = document.xpath("//div[@class='breadcrumbs']//a")
    # 'author' has to be a quoted string literal: unquoted, XPath treats it
    # as a child node-set, which evaluates to '' and matches every <dt>.
    author_elements = document.xpath("//dt[contains(@class, 'author')]")
    last_page_elements = document.xpath("//a[@title='Last page']")

    # Both the forum and the thread breadcrumb are needed, plus an author.
    if len(breadcrumbs_elements) < 2 or not author_elements:
        return None

    # Prefer an explicit threadid=<digits> parameter; fall back to the legacy
    # "piece after an '='" rule so links that parsed before still parse.
    href = breadcrumbs_elements[-1].attrib.get('href', '')
    id_match = re.search(r"threadid=(\d+)", href)
    if id_match:
        thread_id = int(id_match.group(1))
    else:
        try:
            thread_id = int(href.rsplit('=', 2)[1])
        except (IndexError, ValueError):
            return None

    breadcrumbs = [e.text_content() for e in breadcrumbs_elements]
    thread_title = breadcrumbs[-1]
    forum_title = breadcrumbs[-2]
    author = author_elements[0].text_content().strip()

    # Handle GBS / FYAD / E/N / etc
    if ':' in forum_title:
        forum_title = forum_title.split(':')[0].strip()

    if forum_title in FORUM_ABBREVS:
        forum_title = FORUM_ABBREVS[forum_title]

    if last_page_elements:
        # Assumes the "Last page" link text starts with a number followed by
        # a space -- TODO confirm against real markup.
        post_count = int(last_page_elements[0].text_content().split(" ")[0])
    else:
        post_count = 1

    # The class test is a loose substring match, so tables that are not real
    # posts (no id, author, date or body) are skipped instead of raising.
    posts = {}
    for table in document.xpath('//table[contains(@class, "post")]'):
        post_id = table.attrib.get('id')
        authors = table.xpath('.//dt[contains(@class, "author")]')
        dates = table.xpath('.//*[@class="postdate"]')
        bodies = table.xpath('.//*[@class="postbody"]')
        if post_id is None or not (authors and dates and bodies):
            continue
        posts[post_id] = (
            authors[0].text_content(),
            dates[0].text_content().strip('\n #?'),
            bodies[0].text_content().strip(),
        )

    return {
        "id": thread_id,
        "breadcrumbs": breadcrumbs,
        "forum_title": forum_title,
        "thread_title": thread_title,
        "author": author,
        "post_count": post_count,
        "posts": posts,
        "thread_link": http.prepare_url(THREAD_URL, {'threadid': thread_id}),
    }
def parse_thread_html(document):
    """
    Parse an LXML document to retrieve the thread data

    :param document: the LXML document to parse
    :return: a dictionary representing the thread, or None when the document
             lacks the breadcrumbs, the author element or a parseable
             thread id
    """
    import re  # function-scope import keeps this block self-contained

    breadcrumbs_elements = document.xpath("//div[@class='breadcrumbs']//a")
    # 'author' has to be a quoted string literal: unquoted, XPath treats it
    # as a child node-set, which evaluates to '' and matches every <dt>.
    author_elements = document.xpath("//dt[contains(@class, 'author')]")
    last_page_elements = document.xpath("//a[@title='Last page']")

    # Both the forum and the thread breadcrumb are needed, plus an author.
    if len(breadcrumbs_elements) < 2 or not author_elements:
        return None

    # Prefer an explicit threadid=<digits> parameter; fall back to the legacy
    # "piece after an '='" rule so links that parsed before still parse.
    href = breadcrumbs_elements[-1].attrib.get('href', '')
    id_match = re.search(r"threadid=(\d+)", href)
    if id_match:
        thread_id = int(id_match.group(1))
    else:
        try:
            thread_id = int(href.rsplit('=', 2)[1])
        except (IndexError, ValueError):
            return None

    breadcrumbs = [e.text_content() for e in breadcrumbs_elements]
    thread_title = breadcrumbs[-1]
    forum_title = breadcrumbs[-2]
    author = author_elements[0].text_content().strip()

    # Handle GBS / FYAD / E/N / etc
    if ':' in forum_title:
        forum_title = forum_title.split(':')[0].strip()

    if forum_title in FORUM_ABBREVS:
        forum_title = FORUM_ABBREVS[forum_title]

    if last_page_elements:
        # Assumes the "Last page" link text starts with a number followed by
        # a space -- TODO confirm against real markup.
        post_count = int(last_page_elements[0].text_content().split(" ")[0])
    else:
        post_count = 1

    return {
        "id": thread_id,
        "breadcrumbs": breadcrumbs,
        "forum_title": forum_title,
        "thread_title": thread_title,
        "author": author,
        "post_count": post_count,
        "thread_link": http.prepare_url(THREAD_URL, {'threadid': thread_id})
    }
def qrcode(inp):
    """qrcode [link] returns a link for a QR code."""
    # NOTE(review): docstring left untouched -- it reads like command help
    # text and may be surfaced at runtime; confirm before rewording.

    # Query parameters for the chart request.
    chart_params = dict(
        cht="qr",       # chart type (QR)
        chs="200x200",  # dimensions
        chl=inp,        # data
    )

    chart_url = http.prepare_url("http://chart.googleapis.com/chart", chart_params)
    # Hand the full chart URL to the is.gd helper and return whatever it gives back.
    shortened = web.try_isgd(chart_url)
    return shortened