def _parse_post_info(self, post_info_td):
    """Extract post number, author and post date from a post-info cell.

    Expected DOM of ``post_info_td``::

        <td class="author">
          <div class="msg_stats_left">
            <span class="author_data"><a name="1"></a>#1</span>
            <span class="author_data">
              <a href="/users/jumi/boards" class="name"> <b>jumi</b> </a>
            </span>
            *** OPTIONAL ***
            <span class="author_data">(Topic Creator)</span>
            *** /OPTIONAL ***
            <span class="author_data">Posted 4/5/2014 1:56:08 PM</span>
            <span class="author_data">
              <a href="/boards/2000121-anime-and-manga-other-titles/68960382/779732076">message detail</a>
            </span>
          </div>
        </td>

    NOTE: archived topics are different (creator is not in span class)

    NOTE(review): the visible body computes ``post_num``, ``post_creator``
    and ``post_dt`` but never returns them, so the method returns ``None``.
    Confirm whether the method is unfinished.

    Raises whatever ``strptime`` raises (presumably ``ValueError``) when
    neither candidate element holds a date matching
    ``POST_DATE_FORMAT_STR``.
    """
    post_infos = list(post_info_td.div.children)
    post_num = post_infos[0].a["name"]
    post_creator = post_infos[1].text

    # The date normally sits at index 2.  When the optional
    # "(Topic Creator)" span is present it occupies that slot, the parse
    # fails, and the date is read from index 3 instead.
    try:
        # Collapse runs of whitespace so the text matches the format string.
        dt_raw = " ".join(post_infos[2].text.split())
        post_dt = strptime(dt_raw, POST_DATE_FORMAT_STR)
    except ValueError:
        dt_raw = " ".join(post_infos[3].text.split())
        post_dt = strptime(dt_raw, POST_DATE_FORMAT_STR)
def _parse_topic(self, topic_tr):
    """Parse a topic row and return a ``Topic`` object.

    Expected DOM layout of ``topic_tr``::

        <tr class="topics">
          <td class="board_status">
            <i class="board_icon board_icon_topic"></i>
          </td>
          <td class="topic">
            <a href="**TOPIC_LINK**">**TOPIC_TITLE**</a>
            <br><span class="pglist">..</span>
          </td>
          <td class="tauthor"> <span> <a>**USERNAME**</a></span> </td>
          <td class="count">23</td>
          <td class="lastpost"><a href="">4/5 3:20PM</a> </td>
        </tr>

    Args:
        topic_tr: the ``<tr>`` element of one topic in the board listing.

    Returns:
        A ``Topic`` built from the row, attached to ``self.board``.

    Raises:
        AssertionError: if the row does not contain exactly five ``<td>``
            cells.
        ValueError: presumably raised by ``strptime`` when the last-post
            date matches neither ``TOPIC_DATE_FORMAT_STR`` nor
            ``TOPIC_DATE_ALT_FORMAT_STR``.
    """
    tds = topic_tr.find_all("td")
    assert tds
    assert len(tds) == 5, "Topic html invalid format (%s)" % self.base_url()

    # Topic icon: the last CSS class of the first child of the status cell
    # identifies the topic status.
    status_img_el = list(tds[0].children)[0]
    status_img = status_img_el["class"][-1]
    if status_img in TOPIC_STATUS_MAP:
        topic_status = TOPIC_STATUS_MAP[status_img]
    else:
        # Unknown icons are logged and treated as a normal topic.
        logger.warning("Topic status %s unknown", status_img)
        topic_status = Topic.NORMAL

    # Topic id (last path segment of the link) and title.
    topic_gfaqs_id = tds[1].a["href"].split("/")[-1]
    topic_title = tds[1].a.text

    # Creator username.  The text is split on whitespace only so that a
    # trailing "(M)" marker can be dropped.
    username = tds[2].text.split()
    if "(M)" in username[-1]:
        username = username[:-1]
    # Re-join with single spaces so multi-word usernames survive the split.
    username = " ".join(username)
    creator = User(username=username)

    # Post count of the topic.
    post_count = int(tds[3].text)

    # Last post date.  Read the raw text before the try block so it is
    # always bound when the fallback format is tried.
    date_raw = tds[4].a.text
    try:
        dt = strptime(date_raw, TOPIC_DATE_FORMAT_STR)
        # The year is not specified on gfaqs, so the current year is used.
        # NOTE(review): a late-December post scraped in January will be
        # dated up to a year in the future.
        curr_year = datetime.now().year
        dt = datetime(curr_year, dt.month, dt.day,
                      dt.hour, dt.minute, dt.second)
    except ValueError:
        # Archived topic: use the alternative format string.
        dt = strptime(date_raw, TOPIC_DATE_ALT_FORMAT_STR)

    return Topic(board=self.board, creator=creator,
                 gfaqs_id=topic_gfaqs_id, title=topic_title,
                 number_of_posts=post_count, last_post_date=dt,
                 status=topic_status)