예제 #1
0
def get_posts(page_soup):
    """Parse one thread page into a list of post dicts plus paging info.

    Args:
        page_soup: Page markup; it is handed straight to ``bs`` for parsing.

    Returns:
        A tuple ``(post_list, (page, page_count))``.

        ``post_list`` holds one dict per ``<table>`` whose ``id`` starts with
        ``post``, with the keys ``'date'``, ``'message'``, ``'edit'``,
        ``'message images'``, ``'user'`` and ``'link'``.

        ``page`` and ``page_count`` are the first and second numbers found
        in the text of the first ``td.vbmenu_control`` cell. Both fall back
        to 1 when that cell is missing or its text does not match.

    Raises:
        TypeError: if a post table has no ``<a>`` whose ``name`` starts with
            a digit (the ``['href']`` lookup is then applied to ``None``).
    """
    page_soup = bs(page_soup)

    # Default to a single page up front. Previously ``page`` stayed unbound
    # (and ``page_count`` stayed None) when the pager cell was absent, which
    # made the final ``return`` raise UnboundLocalError.
    page = 1
    page_count = 1
    pager_cell = page_soup.find('td', attrs={'class': 'vbmenu_control'})
    if pager_cell:
        # Pager text looks like "<current> ... <total>".
        page_match = re.search(r'(\d+) .+? (\d+)', pager_cell.getText())
        if page_match:
            page = int(page_match.group(1))
            page_count = int(page_match.group(2))

    posts = page_soup.findAll(
        'table', attrs={'id': lambda x: x and re.match(r'post', x)})
    logging.info('get_post: got %d posts', len(posts))

    post_list = []
    for p in posts:
        post_link = p.find(
            'a', attrs={'name': lambda x: x and re.match(r'\d+', x)})['href']
        post_string = str(p)

        # The remaining fields are cut out of the raw markup between
        # module-level (start, end) marker pairs.
        raw_message = extract(post_string, message_marker[0],
                              message_marker[1])
        date = extract(post_string, date_marker[0], date_marker[1])
        date = strip_tags(date).strip()
        message = get_message(raw_message)
        sig = extract(post_string, sig_marker[0], sig_marker[1])
        edit = extract(post_string, edit_marker[0], edit_marker[1])

        # Keep only the first entry of the image lookup when it is truthy.
        msg_image_srcs = imaget.get_image_src(raw_message)
        if msg_image_srcs:
            msg_image_srcs = msg_image_srcs[0]

        # Debug output; the single-argument print(...) form behaves the same
        # on Python 2 and Python 3.
        print("message source: ")
        print(msg_image_srcs)
        print("\n\n\n")

        user = get_user(post_string, sig)

        post_list.append({
            'date': date,
            'message': message,
            'edit': edit,
            'message images': msg_image_srcs,
            'user': user,
            'link': post_link,
        })

    return post_list, (page, page_count)
예제 #2
0
def get_posts(page_soup):
    """Collect every post on a thread page together with its paging position.

    Args:
        page_soup: Page markup, parsed here via ``bs``.

    Returns:
        ``(post_list, (page, page_count))`` where ``post_list`` contains one
        dict per ``<table>`` whose ``id`` begins with ``post`` (keys:
        ``"date"``, ``"message"``, ``"edit"``, ``"message images"``,
        ``"user"``, ``"link"``). ``page`` / ``page_count`` are read from the
        first ``td.vbmenu_control`` cell and default to ``1`` / ``1`` when
        the cell is absent or its text has no "<n> ... <m>" pair.

    Raises:
        TypeError: when a post table lacks an ``<a>`` whose ``name`` starts
            with a digit, because ``["href"]`` is then applied to ``None``.
    """
    page_soup = bs(page_soup)

    # Start from single-page defaults. Without them a page that has no pager
    # cell left ``page`` unassigned and the return statement raised
    # UnboundLocalError.
    page, page_count = 1, 1
    pager = page_soup.find("td", attrs={"class": "vbmenu_control"})
    if pager:
        # page _ of _
        page_match = re.search(r"(\d+) .+? (\d+)", pager.getText())
        if page_match:
            page = int(page_match.group(1))
            page_count = int(page_match.group(2))

    posts = page_soup.findAll("table", attrs={"id": lambda x: x and re.match(r"post", x)})
    logging.info("get_post: got %d posts", len(posts))

    post_list = []
    for p in posts:
        post_link = p.find("a", attrs={"name": lambda x: x and re.match(r"\d+", x)})["href"]
        post_string = str(p)

        # Each field is sliced out of the post markup between a pair of
        # module-level start/end markers.
        raw_message = extract(post_string, message_marker[0], message_marker[1])
        date = strip_tags(extract(post_string, date_marker[0], date_marker[1])).strip()
        message = get_message(raw_message)
        sig = extract(post_string, sig_marker[0], sig_marker[1])
        edit = extract(post_string, edit_marker[0], edit_marker[1])

        # Reduce a truthy image lookup result to its first entry.
        msg_image_srcs = imaget.get_image_src(raw_message)
        if msg_image_srcs:
            msg_image_srcs = msg_image_srcs[0]

        # Debug trace; single-argument print(...) is equivalent on
        # Python 2 and Python 3.
        print("message source: ")
        print(msg_image_srcs)
        print("\n\n\n")

        user = get_user(post_string, sig)

        post_list.append(
            {
                "date": date,
                "message": message,
                "edit": edit,
                "message images": msg_image_srcs,
                "user": user,
                "link": post_link,
            }
        )

    return post_list, (page, page_count)
예제 #3
0
    def get_user(self, post_string, sig=""):
        """Extract the author details from the markup of a single post.

        Args:
            post_string: Markup of one post; parsed here with ``bs``.
            sig: Value stored under ``'sig'`` in the result for regular users.

        Returns:
            A dict with the keys ``'tag'``, ``'name'``, ``'ulink'``,
            ``'utitle'``, ``'join'``, ``'sig'`` and ``'image'``. When the
            ``td.alt2`` cell has no ``a.bigusername`` link, the author is
            reported as ``'guest'`` and the link, join date, signature and
            image fields are ``None``.

        Raises:
            IndexError: if the user cell has fewer ``<div>`` elements than the
                layout scan below expects.
        """
        user_tag = bs(post_string).find('td', attrs={'class': 'alt2'})

        # Look the profile anchor up once. It used to be fetched twice and
        # dereferenced unconditionally, which raised AttributeError for posts
        # without a profile link (presumably guest authors - verify).
        name_anchor = user_tag.find('a', attrs={'class': 'bigusername'})
        if not name_anchor:
            return {
                'tag': user_tag,
                'name': 'guest',
                'ulink': None,
                'utitle': 'guest',
                'join': None,
                'sig': None,
                'image': None,
            }

        user_name = name_anchor.getText()
        user_link = name_anchor['href']

        # One pass over the <div>s serves both the title and the layout scan.
        user_div = user_tag.findAll('div')
        user_title = user_div[1].getText()

        # Starting at the third <div>, advance to the first one that contains
        # at least three nested <div>s; its first child carries the join date.
        inner_ind = 2
        inner_name_soup = user_div[inner_ind].findAll('div')
        while len(inner_name_soup) < 3:
            inner_ind += 1
            inner_name_soup = user_div[inner_ind].findAll('div')
        # Drop the leading label by length; the text itself is not checked.
        join_date = inner_name_soup[0].getText()[len("Join Date: "):]

        user_image_src = imaget.get_image_src(user_tag, 1)

        return {
            'tag': user_tag,
            'name': user_name,
            'ulink': user_link,
            'utitle': user_title,
            'join': join_date,
            'sig': sig,
            'image': user_image_src,
        }
예제 #4
0
    def parse(self, src):
        """Parse a thread page and return its posts as a list of dicts.

        Args:
            src: Page markup; parsed here with ``bs``.

        Returns:
            A list with one ``defaultdict(str)`` per ``<table>`` whose ``id``
            starts with ``post``. Each holds ``'date'``, ``'msg'``,
            ``'edit'``, ``'images'`` and ``'plink'`` plus every key returned
            by ``self.get_user``; user keys win on a name clash. Missing keys
            read as ``''`` because of the ``defaultdict(str)``.

        Raises:
            TypeError: if a post table has no ``<a>`` whose ``name`` starts
                with a digit (``['href']`` is then applied to ``None``).
        """
        page_soup = bs(src)

        # NOTE: the "page X of Y" indicator is not parsed by this method; the
        # disabled copy of that logic (kept as a bare string) was removed.
        posts = page_soup.findAll('table', attrs={'id': lambda x: x and re.match(r'post', x)})
        logger.info('get_post: got %d posts', len(posts))

        post_list = []
        for p in posts:
            post_link = p.find('a', attrs={'name': lambda x: x and re.match(r'\d+', x)})['href']
            post_string = str(p)

            # Fields are sliced out of the raw post markup between
            # module-level (start, end) marker pairs.
            raw_message = self.extract(post_string, message_marker[0], message_marker[1])
            date = self.extract(post_string, date_marker[0], date_marker[1])
            date = self.strip_tags(date).strip()
            message = self.get_message(raw_message)
            sig = self.extract(post_string, sig_marker[0], sig_marker[1])
            edit = self.extract(post_string, edit_marker[0], edit_marker[1])

            # Keep only the first entry of a truthy image lookup result.
            msg_image_srcs = imaget.get_image_src(raw_message)
            if msg_image_srcs:
                msg_image_srcs = msg_image_srcs[0]

            # Debug trace; single-argument print(...) is equivalent on
            # Python 2 and Python 3.
            print("message source: ")
            print(msg_image_srcs)
            print("\n\n\n")

            user = self.get_user(post_string, sig)

            ddict = defaultdict(str)
            ddict.update({
                'date': date,
                'msg': message,
                'edit': edit,
                'images': msg_image_srcs,
                'plink': post_link,
            })
            # Applied second so user fields override post fields, matching
            # the old ``dict(post.items() + user.items())`` merge without
            # relying on Python-2-only list concatenation of items().
            ddict.update(user.items())
            post_list.append(ddict)
        return post_list
예제 #5
0
def get_user(post_string, sig=""):
    """Build a dict describing the author of a single post.

    Args:
        post_string: Markup of one post; parsed here with ``bs``.
        sig: Value stored under ``"sig"`` for regular (non-guest) authors.

    Returns:
        A dict with the keys ``"tag"``, ``"name"``, ``"link"``, ``"title"``,
        ``"join"``, ``"sig"`` and ``"image"``. If the ``td.alt2`` cell has no
        ``a.bigusername`` anchor, name and title are ``"guest"`` and the
        link, join date, signature and image are ``None``.
    """
    cell = bs(post_string).find("td", attrs={"class": "alt2"})
    anchor = cell.find("a", attrs={"class": "bigusername"})

    # No profile anchor: report a guest author.
    if not anchor:
        return {
            "tag": cell,
            "name": "guest",
            "link": None,
            "join": None,
            "sig": None,
            "image": None,
            "title": "guest",
        }

    author_name = anchor.getText()
    profile_href = anchor["href"]

    divs = cell.findAll("div")
    author_title = divs[1].getText()

    # Walk forward from the third <div> until one is found that contains at
    # least three nested <div>s; the first nested one holds the join date.
    position = 2
    nested = divs[position].findAll("div")
    while len(nested) < 3:
        position += 1
        nested = divs[position].findAll("div")

    # Strip the label by length only; the text itself is not checked.
    label_width = len("Join Date: ")
    joined = nested[0].getText()[label_width:]

    avatar_src = imaget.get_image_src(cell, 1)

    return {
        "tag": cell,
        "name": author_name,
        "link": profile_href,
        "title": author_title,
        "join": joined,
        "sig": sig,
        "image": avatar_src,
    }