Example #1
    def parse_thread(self, response):
        threadid = self.get_url_param(response.url, 'id')
        posts = response.css("#brdmain div.blockpost")
        for post in posts:
            try:
                messageitem = items.Message()
                posttime = self.parse_timestr(self.get_text(post.css("h2 a")))

                userprofile_link = post.css(
                    ".postleft dt:first-child a::attr(href)").extract_first()
                messageitem['author_username'] = self.get_text(
                    post.css(".postleft dt:first-child a"))
                messageitem['postid'] = post.xpath("@id").extract_first()
                messageitem['threadid'] = threadid
                messageitem['posted_on'] = posttime

                msg = post.css("div.postmsg")
                messageitem['contenttext'] = self.get_text(msg)
                messageitem['contenthtml'] = self.get_text(msg.extract_first())

                yield messageitem
                yield self.make_request('userprofile',
                                        url=userprofile_link,
                                        relativeurl=userprofile_link)

            except Exception as e:
                self.logger.warning("Invalid thread page. %s" % e)

        for link in response.css("#brdmain .pagelink a::attr(href)").extract():
            yield self.make_request('thread', url=link)
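
Note: these snippets call several spider helpers (get_url_param, get_text, parse_timestr, ...) whose implementations are not shown. Below is a minimal sketch of what such helpers could look like, assuming standard-library URL parsing and dateutil for timestamps; the real signatures and behaviour may differ.

# Hypothetical stand-ins for helpers the examples call through self.*;
# the project's own code is not shown, so treat these as assumptions.
from urllib.parse import urlparse, parse_qs

import dateutil.parser
from scrapy.selector import Selector


class HelperSketchMixin(object):

    def get_url_param(self, url, key):
        # Return one query-string parameter, e.g. '42' for viewtopic.php?id=42.
        return parse_qs(urlparse(url).query)[key][0]

    def get_text(self, node):
        # Collapse a Selector/SelectorList (or a raw HTML string) into normalized text.
        if isinstance(node, str):
            node = Selector(text=node)
        return " ".join(t.strip() for t in node.xpath(".//text()").extract() if t.strip())

    def parse_timestr(self, timestr):
        # Parse a forum timestamp such as '2019-04-01 13:37' into a datetime.
        return dateutil.parser.parse(timestr)
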
Example #2
    def parse_threadpage(self, response):
        threadid = response.meta['threadid']

        for message in response.css(".messageList .message"):
            msgitem = items.Message()
            try:
                fullid = message.xpath("@id").extract_first()
                msgitem['postid'] = re.match("post-(\d+)", fullid).group(1)
                msgitem['author_username'] = self.get_text(
                    message.css(".messageDetails .username"))
                msgitem['posted_on'] = self.read_datetime_div(
                    message.css(".messageDetails .DateTime"))
                textnode = message.css(".messageContent")
                msgitem['contenthtml'] = textnode.extract_first()
                msgitem['contenttext'] = self.get_text(textnode)
                msgitem['threadid'] = threadid
            except Exception as e:
                self.logger.error(
                    "Failed parsing response for thread at %s. Error is %s.\n Skipping thread\n %s"
                    % (response.url, e, traceback.format_exc()))

            yield msgitem

        # Duplicates will be removed by the dupefilter.
        for link in response.css("a.username::attr(href)").extract():
            yield self.make_request('userprofile', url=self.make_url(link))

        #Start looking for previous page.
        for link in response.css("div.PageNav nav a::attr(href)").extract():
            yield self.make_request("threadpage", url=link, threadid=threadid)
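
The follow-up requests in Examples #1 and #2 go through make_request/make_url, and Example #2 reads threadid back from response.meta. The following is a sketch of a dispatcher consistent with those calls; the callback lookup by name and the endpoint setting are assumptions, not the project's actual code.

# Sketch only: route a named request type to the matching parse_* callback and
# carry extra keyword arguments (threadid, relativeurl, ...) in response.meta.
from urllib.parse import urljoin

import scrapy


class RequestDispatchSketch(object):

    spider_settings = {'endpoint': 'http://forum.example/'}  # hypothetical

    def make_url(self, link):
        # Resolve a possibly relative link against the forum endpoint.
        return urljoin(self.spider_settings['endpoint'], link)

    def make_request(self, reqtype, url, **kwargs):
        # 'thread' -> self.parse_thread, 'userprofile' -> self.parse_userprofile, ...
        return scrapy.Request(self.make_url(url),
                              callback=getattr(self, 'parse_' + reqtype),
                              meta=dict(kwargs))
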
Example #3
    def parse_message(self, response):
        #self.logger.info("Yielding messages from %s" % response.url)
        threadid = None
        m = re.search(r"\?topic=(\d+)", response.url)
        if m:
            threadid = m.group(1).strip()

        for postwrapper in response.css(".post_wrapper"):
            messageitem = items.Message()
            postmeta = self.get_text(postwrapper.css(".flow_hidden .keyinfo div"))
            postmeta_ascii = re.sub(r'[^\x00-\x7f]',r'', postmeta).strip()
            m = re.search('on:\s*(.+)', postmeta_ascii)
            if m:
                if "N/A" not in m.group(1):
                    messageitem['posted_on'] = self.parse_timestr(m.group(1))                
            postcontent = postwrapper.css(".postarea .post").xpath("./div[contains(@id, 'msg_')]")

            m = re.search('msg_(\d+)', postcontent.xpath('@id').extract_first())
            if m:
                messageitem['postid'] = m.group(1)

            messageitem['threadid']         = threadid
            messageitem['author_username']  = self.get_text(postwrapper.css(".poster h4"))  
            messageitem['contenthtml']      = postcontent.extract_first()
            messageitem['contenttext']      = self.get_text(postcontent)

            yield messageitem
Example #4
    def parse_message(self, response):
        #self.logger.info("Yielding messages from %s" % response.url)
        threadid = ""
        try:
            threadid = self.get_url_param(response.url, 'tid')
        except Exception as e:
            self.logger.warning("Couldn't get threadid at %s with error %s" %
                                (response.url, e))
            return

        posts = response.css("#posts div.post")
        for post in posts:
            try:
                messageitem = items.Message()
                posttime = self.get_text(
                    post.css("div.post_head span.post_date")).split("(")[0]
                messageitem['author_username'] = self.get_text(
                    post.xpath(
                        ".//div[@class='author_information']//span[@class='largetext']/a"
                    ))
                messageitem['postid'] = post.xpath("@id").extract_first(
                    " ").replace("post_", "").strip()
                messageitem['threadid'] = threadid
                messageitem['posted_on'] = self.parse_datetime(posttime)
                msg = post.css("div.post_body")
                messageitem['contenttext'] = self.get_text(msg)
                messageitem['contenthtml'] = self.get_text(msg.extract_first())

                yield messageitem
            except Exception as e:
                self.logger.warning("Invalid thread page. %s" % e)
Example #5
    def parse_message(self, response):
        # try:
        #     threadid = self.get_url_param(response.url, 't')
        # except KeyError:
        #     # It shows one post in thread only, so ignore this page
        #     return
        try:
            threadid = self.get_url_param(response.url, 't')
            posts = response.xpath(
                ".//div[@id='page-body']/div[contains(@id, 'p')]")
            for post in posts:
                messageitem = items.Message()
                messageitem['threadid'] = threadid
                author = post.xpath(
                    './/a[starts-with(@class, "username")]/text()'
                ).extract_first()
                messageitem['author_username'] = author
                post_time = post.css('p.author *::text').extract()
                messageitem['posted_on'] = dateutil.parser.parse(
                    post_time[-1].strip())
                post_link = post.css(
                    'p.author > a::attr(href)').extract_first()
                messageitem['postid'] = self.get_url_param(post_link, 'p')
                msg = post.css("div.content")
                messageitem['contenttext'] = self.get_text(msg)
                messageitem['contenthtml'] = self.get_text(msg.extract_first())

                yield messageitem

        except Exception as e:
            self.logger.warning("Invalid thread page. %s" % e)
            inspect_response(response, self)
Example #6
    def parse_thread(self, response):
        posts = response.xpath('.//div[@class="post "]')
        for post in posts:
            messageitem                     = items.Message()
            guest_user   = len(post.xpath('.//span[contains(text(), "Unregistered")]')) > 0
            special_user = guest_user is True and post.xpath('.//div[@class="author_information"]/strong/span/a[contains(@href, "member")]//text()').extract_first() is not None


            # Only the username selector differs between registered/special posters and guests.
            if guest_user is False or special_user is True:
                messageitem['author_username']  = post.xpath('.//div[@class="author_information"]//a[contains(@href, "member")]//text()').extract_first()
                if messageitem['author_username'] is None:
                    messageitem['author_username'] = post.xpath('.//div[@class="author_information"]/strong/span/a[contains(@href, "member")]//text()').extract_first()
            else:
                messageitem['author_username']  = post.xpath('div/div/strong/span/text()').extract_first()
            messageitem['postid']           = post.xpath('@id').extract_first().lstrip('post_')
            messageitem['threadid']         = re.search('tid\=([0-9]+)', response.url).group(1)
            msg                             = post.xpath('.//div[contains(@class, "post_body")]')
            messageitem['contenttext']      = self.get_text(msg)
            messageitem['contenthtml']      = self.get_text(msg.extract_first())
            # Post date handling
            posted_on                       = post.xpath('.//span[@class="post_date"]/text()').extract_first()
            messageitem['posted_on']        = self.parse_datetime(posted_on)
            if messageitem['author_username'] is None:
                self.logger.warning("Author username is still None at URL: %s. Can't yield item." % response.url)

            yield messageitem

            # Yield user.
            useritem = items.User()
            if guest_user is False or special_user is True:
                useritem['username']        = messageitem['author_username']
                useritem['fullurl']         = post.xpath('.//div[@class="author_information"]//span[@class="largetext"]/a/@href').extract_first()
                useritem['relativeurl']     = useritem['fullurl'].split('.onion')[1]
                useritem['title']           = post.xpath('.//div[@class="author_information"]//span[@class="smalltext"]/text()[1]').extract_first().strip()
                message_count               = post.xpath('.//div[@class="author_statistics"]/text()[2]').extract_first()
                useritem['message_count']   = int(re.sub('[^0-9]', '', message_count))
                post_count                  = post.xpath('.//div[@class="author_statistics"]/text()[3]').extract_first()
                useritem['post_count']      = int(re.sub('[^0-9]', '', post_count))
                useritem['joined_on']       = self.parse_datetime(post.xpath('.//div[@class="author_statistics"]/text()[4]').extract_first().replace("Registrato: ", ''))
                useritem['reputation']      = post.xpath('.//strong[contains(@class, "reputation")]/text()').extract_first()
                useritem['username_id']     = re.search('([0-9]+)', useritem['relativeurl']).group(1)
                useritem['membergroup']     = post.xpath('.//img[not(@class="buddy_status")]/@title').extract_first()                
            else:
                # Unregistered users have no message count, join date, post count, reputation, id..
                useritem['username']        = messageitem['author_username']
                useritem['fullurl']         = self.spider_settings['endpoint'] + "/" + useritem['username']
                useritem['relativeurl']     = useritem['username']
                useritem['title']           = post.xpath('.//div[@class="author_information"]//span[@class="smalltext"]/text()[1]').extract_first().strip()

            yield useritem
Example #7
    def parse_message(self, response):
        posts = response.css('ul.row.list-posts > li')
        for post in posts:
            messageitem                     = items.Message()
            author_username_str             = self.get_text(post.css('.post-header a.poster'))
            flair_str                       = self.get_text(post.css('.post-header a.poster span.flair'))
            messageitem["author_username"]  = author_username_str.replace(flair_str, "")
            messageitem['postid']           = self.get_post_id(post.css('span:first-child::attr(id)').extract_first())
            messageitem['threadid']         = self.get_thread_id(response.url)
            messageitem['posted_on']        = self.parse_timestr(self.get_text(post.css('.footer .cols-10 .col-4:first-child strong')))
            msg = post.css("div.content")
            messageitem['contenttext']      = self.get_text(msg)
            messageitem['contenthtml']      = self.get_text(msg.extract_first())

            yield messageitem
Example #8
    def parse_message(self, response):
        notice = response.xpath('.//div[@class="inner"]/p/text()').extract_first()
        if notice and "The requested topic does not exist." in notice:
            self.logger.warning('Post not available. Likely deleted: "%s"' %
                                response.url)
            return
        else:
            m = re.search("t=(\d+)", response.url)
            if m:
                threadid = m.group(1).strip()
            else:
                # If the page has a p= and no t= in the URL, we need to fetch the threadid inside the post.
                threadid = response.xpath(
                    './/h2[@class="topic-title"]/a/@href').extract_first()
                if threadid:
                    threadid = re.search('t=(\d+)', threadid).group(1)
                else:
                    self.logger.warning(
                        "Couldn't identify the threadid at URL %s" %
                        response.url)
                #m = re.search("p=(\d+)", response.url)
                #if m:
                #    threadid = m.group(1).strip()
            posts = response.xpath(
                '//div[contains(@class, "post has-profile")]')
            for post in posts:
                try:
                    messageitem = items.Message()
                    posttime = post.xpath(
                        './/span[@class="responsive-hide"]/following-sibling::text()'
                    ).extract_first()
                    messageitem['author_username'] = post.xpath(
                        './/a[contains(@class, "username")]/text()'
                    ).extract_first()
                    messageitem['postid'] = post.xpath('@id').extract_first()
                    messageitem['threadid'] = threadid
                    if posttime:
                        messageitem['posted_on'] = self.parse_timestr(posttime)

                    msg = post.xpath('.//div[@class="content"]')
                    messageitem['contenttext'] = self.get_text(msg)
                    messageitem['contenthtml'] = self.get_text(
                        msg.extract_first())

                    yield messageitem
                except Exception as e:
                    self.logger.warning("Invalid thread page. %s" % e)
Example #9
    def parse_message(self, response):
        threadid = self.get_url_param(response.url, 'id')
        posts = response.css("#brdmain .blockpost")
        for post in posts:
            # Yield message.
            messageitem = items.Message()
            messageitem['contenthtml'] = post.xpath(
                ".//div[@class='postmsg']").extract_first()
            messageitem['contenttext'] = self.get_text(
                post.xpath(".//div[@class='postmsg']"))
            messageitem['postid'] = self.get_url_param(
                post.css("h2 span a::attr(href)").extract_first(), 'pid')
            messageitem['threadid'] = threadid
            messageitem['author_username'] = self.get_text(
                post.css(".postleft dl dt strong span"))
            messageitem['posted_on'] = self.parse_timestr(
                self.get_text(post.css("h2 span a")))
            yield messageitem
            # Yield user.
            useritem = items.User()
            useritem['username'] = self.get_text(
                post.css(".postleft dl dt strong span"))
            member_group = post.css(".postleft dd.usertitle")
            if len(member_group) > 0:
                useritem['membergroup'] = self.get_text(member_group)
            website = post.css(
                ".postleft dd.usercontacts span.website a::attr(href)")
            if len(website) > 0:
                useritem['website'] = self.get_text(website)
            attributes = post.css(".postleft dd")
            for attribute in attributes:
                if not attribute.css("span::attr(class)"):
                    content = self.get_text(attribute.css("span"))
                    match = re.search('(.+): (.+)', content)
                    if match:
                        key = match.group(1)
                        value = match.group(2)
                        if 'From' in key or 'Lieu' in key:
                            useritem['location'] = value
                        elif 'Posts' in key or 'Messages' in key:
                            useritem['post_count'] = value
                        elif 'Registered' in key or 'Inscription' in key:
                            useritem['joined_on'] = self.parse_timestr(value)
                        else:
                            self.logger.warning('New information found : %s' %
                                                key)

            yield useritem
Example #10
 def parse_message(self, response):
     threadid        = self.get_url_param(response.url, 'id')
     posts           = response.css("#punviewtopic div.blockpost")
     for post in posts:
         try:
             messageitem                     = items.Message()
             posttime                        = self.parse_timestr(self.get_text(post.css("h2 a")))
             messageitem['author_username']  = self.get_text(post.xpath(".//div[@class='postleft']/dl/dt/strong/a/text()").extract_first())
             messageitem['postid']           = post.xpath("@id").extract_first()
             messageitem['threadid']         = threadid
             messageitem['posted_on']        = posttime
             msg                             = post.css("div.postmsg")
             messageitem['contenttext']      = self.get_text(msg)
             messageitem['contenthtml']      = self.get_text(msg.extract_first())
             yield messageitem
         except Exception as e:
             self.logger.warning("Invalid thread page. Error: '%s'. URL:" % (e, response.url))
Example #11
    def parse_message(self, response):
        # self.logger.info("Yielding messages from %s" % response.url)

        threadid = self.get_url_param(response.url, 'tid')
        posts = response.css("#posts .post")

        for post in posts:
            if 'deleted_post_hidden' not in post.xpath('@class').extract_first():
                try:
                    post_date_string = self.get_text(
                        post.css('span.post_date::text'))
                    if post_date_string == '':
                        post_date_string = post.css(
                            'span.post_date::text').extract_first()
                    post_date = self.parse_timestr(post_date_string, response)
                    author_username = self.get_text(
                        post.xpath('.//span[@class="largetext"]'))
                    contenttext = post.css('.post_body')
                    match = re.match('post_(\d+)',
                                     post.xpath("@id").extract_first())
                    if match:
                        post_id = match.group(1)
                    messageitem = items.Message()
                    messageitem['author_username'] = author_username
                    messageitem['postid'] = post_id
                    messageitem['threadid'] = threadid
                    messageitem['posted_on'] = post_date
                    messageitem['contenttext'] = self.get_text(contenttext)
                    messageitem['contenthtml'] = contenttext.extract_first()

                    yield messageitem

                except Exception as e:
                    self.logger.warning(
                        "Cannot parse message item at URL %s because %s" %
                        (response.url, e))
                    pass
            else:
                self.logger.warning(
                    "Did not yield post because it was deleted or hidden at %s"
                    % response.url)
Example #12
    def get_message_item_from_postwrapper(self, postwrapper, response):
        msgitem = items.Message()
        postmeta = self.get_text(postwrapper.css(".flow_hidden .keyinfo div"))
        postmeta_ascii = re.sub(r'[^\x00-\x7f]',r'', postmeta).strip()
        m = re.search('on:\s*(.+)', postmeta_ascii)
        if m:
            msgitem['posted_on'] = self.parse_timestr(m.group(1))
            
        postcontent = postwrapper.css(".postarea .post").xpath("./div[contains(@id, 'msg_')]")

        m = re.search('msg_(\d+)', postcontent.xpath('@id').extract_first())
        if m:
            msgitem['postid'] = m.group(1)

        msgitem['threadid']         = response.meta['threadid']
        msgitem['author_username']  = self.get_text(postwrapper.css(".poster h4"))  
        msgitem['contenthtml']      = self.get_text(postcontent.extract_first())
        msgitem['contenttext']      = self.get_text(postcontent)

        return msgitem
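
A possible caller for the helper above, assuming the requesting callback put threadid into response.meta and that posts sit in .post_wrapper blocks (as in Example #3); the method name is illustrative and not part of the original spider.

    def parse_thread_sketch(self, response):
        # Sketch only: turn every post wrapper on the page into a Message item.
        for postwrapper in response.css(".post_wrapper"):
            yield self.get_message_item_from_postwrapper(postwrapper, response)
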
Example #13
    def parse_thread(self, response):
        threadid = response.meta['threadid']
        posts = response.css("#messageList li.message")

        for post in posts:
            try:
                messageitem = items.Message()

                fullid = post.xpath("@id").extract_first()
                content = post.css("blockquote.messageText")
                userprofile_link = post.css(
                    "div.messageDetails a.username.author::attr(href)"
                ).extract_first()

                messageitem['author_username'] = post.xpath(
                    './/div[@class="uix_userTextInner"]/a/text()'
                ).extract_first()
                messageitem['postid'] = re.match("post-(\d+)", fullid).group(1)
                messageitem['threadid'] = threadid
                messageitem['posted_on'] = self.parse_datetime(
                    self.get_text(post.xpath(".//a[@class='datePermalink']")))
                messageitem['contenttext'] = self.get_text(content)
                messageitem['contenthtml'] = self.get_text(
                    content.extract_first())

                yield messageitem

                yield self.make_request(
                    'userprofile',
                    url=userprofile_link,
                    relativeurl=userprofile_link,
                    username=messageitem['author_username'])
            except Exception as e:
                self.logger.warning("Invalid thread page %s. %s" %
                                    (response.url, e))

        for link in response.css(".PageNav nav a::attr(href)").extract():
            yield self.make_request('thread',
                                    url=link,
                                    threadid=response.meta['threadid'])
Example #14
    def parse_thread(self, response):
        # inspect_response(response, self)  # Debugging leftover: would drop into a Scrapy shell on every thread page.
        threadid = self.get_id_from_url(response.url)
        # We first parse the first post. 
        messageitem = items.Message()
        messageitem['threadid'] = threadid
        messageitem['postid'] = "thread" + threadid
        msg = response.xpath('.//div[@class="col-xs-10 alert alert-info whitebg"]')
        messageitem['contenttext'] = self.get_text(msg)
        messageitem['contenthtml'] = self.get_text(msg.extract_first())        

        # There are 3 user classes: buyer, vendor and support.
        vendor  = response.xpath(".//div[@class='col-xs-12']/small/a/text()").extract_first() is not None
        support = response.xpath(".//div[@class='col-xs-12']/small/b/text()").extract_first() == 'Support'
        buyer   = vendor is False and support is False
        # Buyer username.
        if buyer is True:
            author_username = response.xpath(".//div[@class='col-xs-12']/small/text()").extract_first().strip()
            author_username = re.search('by (.*)$', author_username).group(1)
            messageitem['author_username'] = author_username
            membergroup = "Buyer"
        # Support staff.
        elif support is True:
            author_username = response.xpath(".//div[@class='col-xs-12']/small/b/text()").extract_first().strip()
            messageitem['author_username'] = author_username
            membergroup = "Support"
        # vendor username.
        elif vendor is True: 
            author_username = response.xpath(".//div[@class='col-xs-12']/small/a/text()").extract_first()
            messageitem['author_username'] = author_username
            membergroup = "Vendor"
        else: 
            self.logger.warning('Unknown member group at %s' % response.url)
        # Get info about the post.
        postinfo = self.get_text(response.xpath(".//div[@class='col-xs-12']/small"))
        if postinfo:
            matches = re.search(r'(\d+) (.+) ago by ([^ ]+)', postinfo)
            messageitem['posted_on'] = self.parse_datetime(matches.group(0))
        else:
            self.logger.warning("No postinfo yielded at %s" % response.url)
        yield messageitem


        user = items.User()
        user['username'] = author_username
        user['membergroup'] = membergroup
        if membergroup in ["Buyer", "Support"]:
            user['relativeurl'] = user['username']
            user['fullurl'] = self.spider_settings['endpoint'] + user['username']
        elif membergroup == "Vendor":
            user['relativeurl'] = response.xpath(".//div[@class='col-xs-12']/small/a/@href").extract_first()
            user['fullurl'] = self.spider_settings['endpoint'] + user['relativeurl']
        else:
            self.logger.warning('Unknown member group at %s' % response.url)

        poster_block = response.xpath(".//div[@class='col-xs-12']")
        if membergroup in ['Buyer', 'Vendor']:
            stars = poster_block.xpath('.//span[@class="nowrap btn-xs alert brightBlueBG"]/text()').extract_first()
            if stars:
                stars = re.search(r'(?:Vendor|Buyer): ([0-9]+)', stars).group(1)
                user['stars'] = stars
            else:
                self.logger.warning('No stars at URL %s' % response.url)
        yield user

        # We now parse the comments and yield them to the DB.
        # To treat the DB nice and avoid race conditions, sleep for a second.
        #time.sleep(0.5)
        post = response.css('.row .col-lg-8 > div')
        # Parse the remaining comments.
        # Post IDs are not caught by the comment selector. We loop them using an index.
        reply_index = 0
        msg_ids = post.xpath(".//span[@class='forumMsgOffset']")
        for comment in post.css('div.comment p'):
            messageitem = items.Message()
            messageitem['threadid'] = threadid   
            messageitem['postid']   = msg_ids[reply_index].xpath("@id").extract_first()
            reply_index += 1

            post_info = comment.css('small::text').extract_first()
            if post_info:
                matches = re.search(r'(\d+) point([s]*) (.+)', post_info)
                if matches:
                    messageitem['posted_on'] = self.parse_timestr(matches.group(3))
            author_name = comment.css('a.vendorname::text').extract_first()
            if not author_name:
                author_name = comment.css('*::text').extract_first()
            messageitem['author_username'] = author_name.strip()
            messageitem['contenttext'] = ''.join(comment.css('p::text').extract()[1:])
            messageitem['contenthtml'] = self.get_text(comment.css('p').extract_first())
            yield messageitem
        # Sleep again to avoid race condition.
        #time.sleep(0.5)
        for comment in post.css('div.comment p'):
            useritem = items.User()
            vendor  = comment.xpath('.//a[@class="vendorname"]/text()').extract_first() is not None
            buyer   = comment.xpath('.//span[@class="left lightGrey"]').extract_first() is not None and self.get_text(comment).startswith('Support') is False
            support = comment.xpath('.//span/b') is not None and self.get_text(comment).startswith('Support') is True
            if vendor is True:
                useritem['username'] = comment.xpath('.//a[@class="vendorname"]/text()').extract_first()
                useritem['relativeurl'] = comment.xpath('.//a[@class="vendorname"]/@href').extract_first()
                useritem['fullurl'] = self.spider_settings['endpoint'] + useritem['relativeurl']
                membergroup = "Vendor"
                useritem['stars'] = comment.xpath('.//span[@class="nowrap btn-xs alert brightBlueBG"]/text()').extract_first().replace('Vendor: ', '')
            elif support is True:
                username = self.get_text(comment)
                username = re.search('^(Support)[0-9]{1,100} ', username).group(1)
                useritem['username'] = username
                useritem['relativeurl'] = useritem['username']
                useritem['fullurl'] = self.spider_settings['endpoint'] + useritem['username']
                membergroup = "Support"
            elif buyer is True:
                username = self.get_text(comment)
                username = re.search('^(.*?) Buyer', username).group(1)
                useritem['username'] = username
                useritem['relativeurl'] = useritem['username']
                useritem['fullurl'] = self.spider_settings['endpoint'] + useritem['username']
                membergroup = "Buyer"
                useritem['stars'] = comment.xpath('.//span[@class="nowrap btn-xs alert brightBlueBG"]/text()').extract_first().replace('Buyer: ', '')
            else:
                self.logger.warning("Unknown commenter group at %s" % response.url)
            useritem['membergroup'] = membergroup
            yield useritem
Example #15
    def parse_message(self, response):
        #self.logger.info("Yielding messages from %s" % response.url)
        threadid = self.get_url_param(response.url, 'topic').split(".")[0]
        posts = response.css("#forumposts div.windowbg") + response.css(
            "#forumposts div.windowbg2")

        for post in posts:
            messageitem = items.Message()
            posttime = self.parse_timestr(
                re.search("«.*on:(.*?)»",
                          self.get_text(post.css("div.keyinfo div.smalltext")),
                          re.S | re.M).group(1).strip())

            author_username = post.xpath(".//h4/a/text()").extract_first()
            if author_username is not None:  # Verified posters.
                messageitem['author_username'] = author_username.strip()
            elif post.xpath(".//h4/text()").extract_first() is not None:
                messageitem['author_username'] = post.xpath(
                    ".//h4/text()").extract_first().strip()
            else:
                self.logger.warning('Unknown problem yielding user at URL %s' %
                                    response.url)
            messageitem['postid'] = post.css(
                "div.post div.inner::attr(id)").extract_first().replace(
                    "msg_", "")
            messageitem['threadid'] = threadid
            messageitem['posted_on'] = posttime
            msg = post.css("div.post")
            messageitem['contenttext'] = self.get_text(msg)
            messageitem['contenthtml'] = self.get_text(msg.extract_first())
            yield messageitem

        for post in posts:
            useritem = items.User()
            username = post.xpath(".//h4/a/text()").extract_first()
            if username is not None:  # Verified posters.
                useritem['username'] = username.strip()
                useritem["relativeurl"] = self.get_relative_url(
                    post.css(".poster h4 a::attr(href)").extract_first())
                useritem["fullurl"] = self.make_url(
                    post.css(".poster h4 a::attr(href)").extract_first())
            elif post.xpath(".//h4/text()").extract_first() is not None:
                useritem['username'] = post.xpath(
                    ".//h4/text()").extract_first().strip()
                useritem["relativeurl"] = useritem['username']
                useritem["fullurl"] = self.spider_settings[
                    'endpoint'] + useritem['username']
            else:
                self.logger.warning('Unknown problem yielding user at URL %s' %
                                    response.url)

            for li in post.xpath(".//ul/li"):
                key = li.xpath(".//@class").extract_first()
                keytext = li.xpath(".//text()").extract_first()
                if key == "postgroup":
                    useritem['postgroup'] = keytext
                elif key == "membergroup":
                    useritem['membergroup'] = keytext
                elif key == 'karma':
                    useritem['karma'] = keytext.replace('Karma: ', '')
                elif key == 'title':
                    useritem['title'] = keytext
                elif key == 'stars':
                    useritem['stars'] = keytext
                elif key == 'postcount':
                    useritem['post_count'] = keytext.replace('Posts: ', '')
                elif key == 'custom':
                    awards = li.xpath(".//text()").extract()
                    useritem['awards'] = '|'.join(awards).replace(
                        'Awards: |', '')
                elif key is None or key in [
                        'blurb', 'avatar', 'profile', 'new_win', 'quote',
                        'quote_button'
                ]:
                    pass
                else:
                    self.logger.warning(
                        "Unknown key in user profile '%s' with value '%s'" %
                        (key, keytext))
            yield useritem
Example #16
 def parse_message(self, response):
     threadid = self.get_url_param(response.url, 'id')
     posts = response.css("#brdmain div.blockpost")
     index = 0
     last_posttime = None
     authors = posts.xpath(
         ".//div[@class='postleft']/dl/dt/strong/a/text()").extract()
     for post in posts:
         try:
             messageitem = items.Message()
             userprofile_link = post.css(
                 ".postleft dt:first-child a::attr(href)").extract_first()
             messageitem['author_username'] = self.get_text(
                 post.xpath(
                     ".//div[@class='postleft']/dl/dt/strong/a/text()").
                 extract_first())
             # The admin (SpeedStepper) obfuscates/spoofs their time of posting.
             # Their posts are therefore tagged as coming *just before* the following post.
             # SpeedStepper frequently makes 2+ posts in a row, so we need to hack around a bit.
             # A while-loop would be better.
             only_admin = len(list(set(authors))) == 1 and list(
                 set(authors))[0] == 'SpeedStepper'
             if only_admin is True:
                 posttime = None
                 self.logger.warning(
                     "Only SpeedStepper has posted in this thread. No posted_on could be determined from %s."
                     % response.url)
             elif messageitem[
                     'author_username'] == 'SpeedStepper' and index == 0:
                 if self.get_text(posts[index + 1].xpath(
                         "h2/span/a/text()").extract_first()) == '':
                     posttime = self.parse_datetime(
                         self.get_text(
                             posts[index + 2].xpath("h2/span/a/text()").
                             extract_first())) - timedelta(seconds=2)
                 else:
                     posttime = self.parse_datetime(
                         self.get_text(
                             posts[index + 1].xpath("h2/span/a/text()").
                             extract_first())) - timedelta(seconds=1)
                 last_posttime = posttime  # A failsafe ensuring we always have a time we can refer to and accommodate SpeedStepper.
             elif messageitem[
                     'author_username'] == 'SpeedStepper' and index > 0 and last_posttime is not None:
                 posttime = last_posttime + timedelta(seconds=1)
                 last_posttime = posttime  # A failsafe ensuring we always have a time we can refer to and accommodate SpeedStepper.
             else:
                 posttime = self.parse_datetime(
                     self.get_text(
                         post.xpath("h2/span/a/text()").extract_first()))
                 last_posttime = posttime  # A failsafe ensuring we always have a time we can refer to and accommodate SpeedStepper.
             messageitem['posted_on'] = posttime
             messageitem['postid'] = post.xpath("@id").extract_first()
             messageitem['threadid'] = threadid
             #messageitem['subforum'] = self.get_text(response.css('ul.crumbs:nth-child(2) > li:nth-child(2) > a:nth-child(2)'))
             #self.logger.info("subforum is %s" % messageitem['subforum'])
             msg = post.css("div.postmsg")
             messageitem['contenttext'] = self.get_text(msg)
             messageitem['contenthtml'] = self.get_text(msg.extract_first())
             index = index + 1
             yield messageitem
         except Exception as e:
             self.logger.warning("Invalid thread page at %s (Error: '%s'" %
                                 (response.url, e))
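
The comment in Example #16 suggests a loop-based back-fill would be cleaner than the index juggling above. Here is a standalone sketch of that idea; the function name and the list-of-tuples input are illustrative and not part of the original spider.

# Illustrative only: infer missing timestamps from neighbouring posts, the way
# Example #16 does inline for SpeedStepper's spoofed post times.
from datetime import datetime, timedelta


def backfill_spoofed_times(posts):
    # posts: list of (author, datetime-or-None) in thread order.
    # A post without a usable time is placed one second after the previous post,
    # or stepped back from the next known time if it opens the thread.
    times = [t for _, t in posts]
    for i, t in enumerate(times):
        if t is not None:
            continue
        if i > 0 and times[i - 1] is not None:
            times[i] = times[i - 1] + timedelta(seconds=1)
        else:
            ahead = next((j for j in range(i + 1, len(times)) if times[j] is not None), None)
            times[i] = times[ahead] - timedelta(seconds=ahead - i) if ahead is not None else None
    return times


# The opening (spoofed) post gets a time just before the first real one.
print(backfill_spoofed_times([('SpeedStepper', None),
                              ('user1', datetime(2019, 4, 1, 13, 37))]))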