def get_posts(page_soup):
    """Parse a page of posts into post records plus pagination info.

    Args:
        page_soup: Page markup; it is handed to ``bs`` to build the tree.

    Returns:
        A ``(post_list, (page, page_count))`` tuple. ``post_list`` holds one
        dict per ``<table>`` whose id starts with ``post``, with the keys
        ``date``, ``message``, ``edit``, ``message images``, ``user`` and
        ``link``. ``page`` and ``page_count`` are both 1 when the
        ``td.vbmenu_control`` cell is missing or its text does not contain
        two numbers.
    """
    page_soup = bs(page_soup)

    # Bind the defaults first so the return value is always well defined,
    # whether the pager cell is absent or its text fails to match.
    page = 1
    page_count = 1
    pager = page_soup.find('td', attrs={'class': 'vbmenu_control'})
    if pager:
        # Pager text has the shape "page _ of _".
        page_match = re.search(r'(\d+) .+? (\d+)', pager.getText())
        if page_match:
            page = int(page_match.group(1))
            page_count = int(page_match.group(2))

    posts = page_soup.findAll(
        'table', attrs={'id': lambda x: x and re.match(r'post', x)})
    logging.info('get_post: got %d posts', len(posts))

    post_list = []
    for p in posts:
        # The anchor whose name starts with digits carries the post's link.
        post_link = p.find(
            'a', attrs={'name': lambda x: x and re.match(r'\d+', x)})['href']
        post_string = str(p)

        raw_message = extract(
            post_string, message_marker[0], message_marker[1])
        date = extract(post_string, date_marker[0], date_marker[1])
        date = strip_tags(date).strip()
        message = get_message(raw_message)
        sig = extract(post_string, sig_marker[0], sig_marker[1])
        edit = extract(post_string, edit_marker[0], edit_marker[1])

        # Only the first image source is kept; an empty result is stored
        # unchanged.
        msg_image_srcs = imaget.get_image_src(raw_message)
        if msg_image_srcs:
            msg_image_srcs = msg_image_srcs[0]
        logging.debug('message source: %s', msg_image_srcs)

        user = get_user(post_string, sig)
        post_list.append({
            'date': date,
            'message': message,
            'edit': edit,
            'message images': msg_image_srcs,
            'user': user,
            'link': post_link,
        })

    return post_list, (page, page_count)
def get_posts(page_soup):
    """Parse a page of posts into post records plus pagination info.

    Args:
        page_soup: Page markup; it is handed to ``bs`` to build the tree.

    Returns:
        A ``(post_list, (page, page_count))`` tuple. ``post_list`` holds one
        dict per ``<table>`` whose id starts with ``post``, with the keys
        ``date``, ``message``, ``edit``, ``message images``, ``user`` and
        ``link``. ``page`` and ``page_count`` are both 1 when the
        ``td.vbmenu_control`` cell is missing or its text does not contain
        two numbers.
    """
    page_soup = bs(page_soup)

    # Bind the defaults first so the return value is always well defined,
    # whether the pager cell is absent or its text fails to match.
    page = 1
    page_count = 1
    pager = page_soup.find("td", attrs={"class": "vbmenu_control"})
    if pager:
        # Pager text has the shape "page _ of _".
        page_match = re.search(r"(\d+) .+? (\d+)", pager.getText())
        if page_match:
            page = int(page_match.group(1))
            page_count = int(page_match.group(2))

    posts = page_soup.findAll(
        "table", attrs={"id": lambda x: x and re.match(r"post", x)}
    )
    logging.info("get_post: got %d posts", len(posts))

    post_list = []
    for p in posts:
        # The anchor whose name starts with digits carries the post's link.
        post_link = p.find(
            "a", attrs={"name": lambda x: x and re.match(r"\d+", x)}
        )["href"]
        post_string = str(p)

        raw_message = extract(
            post_string, message_marker[0], message_marker[1]
        )
        date = extract(post_string, date_marker[0], date_marker[1])
        date = strip_tags(date).strip()
        message = get_message(raw_message)
        sig = extract(post_string, sig_marker[0], sig_marker[1])
        edit = extract(post_string, edit_marker[0], edit_marker[1])

        # Only the first image source is kept; an empty result is stored
        # unchanged.
        msg_image_srcs = imaget.get_image_src(raw_message)
        if msg_image_srcs:
            msg_image_srcs = msg_image_srcs[0]
        logging.debug("message source: %s", msg_image_srcs)

        user = get_user(post_string, sig)
        post_list.append(
            {
                "date": date,
                "message": message,
                "edit": edit,
                "message images": msg_image_srcs,
                "user": user,
                "link": post_link,
            }
        )

    return post_list, (page, page_count)
def get_user(self, post_string, sig=""):
    """Extract the author details from the markup of a single post.

    Args:
        post_string: Markup of one post; parsed with ``bs``.
        sig: Signature already extracted for the post. It is stored
            unchanged under ``'sig'`` for posts that have a user link.

    Returns:
        A dict with the keys ``tag``, ``name``, ``ulink``, ``utitle``,
        ``join``, ``sig`` and ``image``. When the ``td.alt2`` cell has no
        ``a.bigusername`` link, a placeholder record named ``"guest"`` is
        returned with the remaining fields set to ``None``.

    Raises:
        IndexError: If no ``<div>`` from index 2 onwards contains at least
            three nested ``<div>`` elements.
    """
    user_tag = bs(post_string).find('td', attrs={'class': 'alt2'})

    name_link = user_tag.find('a', attrs={'class': 'bigusername'})
    if name_link is None:
        # No profile link in the cell: return a placeholder record instead
        # of failing on the missing tag.
        return {
            'tag': user_tag,
            'name': 'guest',
            'ulink': None,
            'utitle': 'guest',
            'join': None,
            'sig': None,
            'image': None,
        }

    user_name = name_link.getText()
    user_link = name_link['href']

    user_div = user_tag.findAll('div')
    user_title = user_div[1].getText()

    # Walk forward from the third div to the first one that wraps at least
    # three nested divs; its first nested div is read as the join date.
    inner_ind = 2
    while len(user_div[inner_ind].findAll('div')) < 3:
        inner_ind += 1
    inner_name_soup = user_div[inner_ind].findAll('div')
    # Drop as many leading characters as the "Join Date: " label occupies.
    join_date = inner_name_soup[0].getText()[len("Join Date: "):]

    user_image_src = imaget.get_image_src(user_tag, 1)
    return {
        'tag': user_tag,
        'name': user_name,
        'ulink': user_link,
        'utitle': user_title,
        'join': join_date,
        'sig': sig,
        'image': user_image_src,
    }
def parse(self, src):
    """Parse page markup into a list of flat per-post records.

    Pagination is not extracted here; only the posts are returned.

    Args:
        src: Page markup; it is handed to ``bs`` to build the tree.

    Returns:
        A list with one ``defaultdict(str)`` per ``<table>`` whose id starts
        with ``post``. Each record holds the keys ``date``, ``msg``,
        ``edit``, ``images`` and ``plink`` merged with every key returned by
        ``self.get_user``; on a key collision the user value wins.
    """
    page_soup = bs(src)

    posts = page_soup.findAll(
        'table', attrs={'id': lambda x: x and re.match(r'post', x)})
    logger.info('get_post: got %d posts', len(posts))

    post_list = []
    for p in posts:
        # The anchor whose name starts with digits carries the post's link.
        post_link = p.find(
            'a', attrs={'name': lambda x: x and re.match(r'\d+', x)})['href']
        post_string = str(p)

        raw_message = self.extract(
            post_string, message_marker[0], message_marker[1])
        date = self.extract(post_string, date_marker[0], date_marker[1])
        date = self.strip_tags(date).strip()
        message = self.get_message(raw_message)
        sig = self.extract(post_string, sig_marker[0], sig_marker[1])
        edit = self.extract(post_string, edit_marker[0], edit_marker[1])

        # Only the first image source is kept; an empty result is stored
        # unchanged.
        msg_image_srcs = imaget.get_image_src(raw_message)
        if msg_image_srcs:
            msg_image_srcs = msg_image_srcs[0]
        logger.debug('message source: %s', msg_image_srcs)

        user = self.get_user(post_string, sig)

        ddict = defaultdict(str)
        ddict.update({
            'date': date,
            'msg': message,
            'edit': edit,
            'images': msg_image_srcs,
            'plink': post_link,
        })
        # Applied second so user fields take precedence on duplicate keys,
        # matching the order of the previous concatenated item list.
        ddict.update(user)
        post_list.append(ddict)

    return post_list
def get_user(post_string, sig=""):
    """Build the author record for the markup of a single post.

    Args:
        post_string: Markup of one post; parsed with ``bs``.
        sig: Signature already extracted for the post. It is stored
            unchanged under ``"sig"`` for posts that have a user link.

    Returns:
        A dict with the keys ``tag``, ``name``, ``link``, ``title``,
        ``join``, ``sig`` and ``image``. When the ``td.alt2`` cell has no
        ``a.bigusername`` link, a ``"guest"`` placeholder is returned with
        ``link``, ``join``, ``sig`` and ``image`` set to ``None``.
    """
    cell = bs(post_string).find("td", attrs={"class": "alt2"})
    anchor = cell.find("a", attrs={"class": "bigusername"})

    # Guard clause: without a profile link the poster is treated as a guest.
    if not anchor:
        return dict(
            tag=cell,
            name="guest",
            link=None,
            join=None,
            sig=None,
            image=None,
            title="guest",
        )

    divs = cell.findAll("div")
    title_text = divs[1].getText()

    # Starting at the third div, advance to the first one wrapping at least
    # three nested divs; running off the end raises IndexError.
    position = 2
    nested = divs[position].findAll("div")
    while len(nested) < 3:
        position += 1
        nested = divs[position].findAll("div")

    # Drop as many leading characters as the "Join Date: " label occupies.
    label_width = len("Join Date: ")
    joined = nested[0].getText()[label_width:]

    record = {}
    record["tag"] = cell
    record["name"] = anchor.getText()
    record["link"] = anchor["href"]
    record["title"] = title_text
    record["join"] = joined
    record["sig"] = sig
    record["image"] = imaget.get_image_src(cell, 1)
    return record