def parse_threadlisting(self, response):
     """Yield an ``items.Thread`` for every linked row of ``#messageindex``.

     Rows whose third cell holds no ``span a`` link are not yielded; a
     warning carrying the page URL is logged for them instead.
     """
     for row in response.css("#messageindex table tbody tr"):
         thread = items.Thread()
         # Whatever precedes the first "by" in the last cell is the timestamp.
         lastpost_text = self.get_text(row.css("td:last-child"))
         updated_at = self.parse_timestr(
             lastpost_text.split("by")[0].strip())

         anchors = row.css("td:nth-child(3) span a")
         anchor = anchors[0] if anchors else None
         if not anchor:
             self.logger.warning(
                 'Couldn\'t yield thread. Please review: %s' % response.url)
             continue

         href = anchor.xpath("@href").extract_first()
         thread['title'] = self.get_text(anchor)
         thread['relativeurl'] = self.get_relative_url(href)
         thread['fullurl'] = self.make_url(href)
         # Keep only the part of the "topic" parameter before the first dot.
         topic_param = self.get_url_param(thread['fullurl'], 'topic')
         thread['threadid'] = topic_param.split(".")[0]

         starter = self.get_text(row.xpath(".//div/p/a"))
         if starter == '':
             # No linked author: fall back to the plain "Started by ..." text.
             started_by = row.xpath(
                 ".//div/p[contains(text(), 'Started by')]/text()"
             ).extract_first()
             starter = started_by.strip().replace("Started by ", "")
         thread['author_username'] = starter
         thread['last_update'] = updated_at

         stats_text = self.get_text(row.css("td:nth-child(4)"))
         flags = re.S | re.M
         thread['replies'] = re.search(
             r"(\d+) Replies", stats_text, flags).group(1)
         thread['views'] = re.search(
             r"(\d+) Views", stats_text, flags).group(1)
         yield thread
Пример #2
0
    def parse_thread_listing(self, response):
        """Yield thread items, thread requests and pagination requests.

        For every row under ``#brdmain tbody`` whose first cell holds a link,
        an ``items.Thread`` is yielded, followed by a 'thread' request for
        that link.  After all rows, one 'threadlisting' request is yielded
        per ``.pagelink`` href found under ``#brdmain``.
        """
        for row in response.css("#brdmain tbody tr"):
            anchors = row.css("td:first-child a")
            anchor = anchors[0] if anchors else None
            if not anchor:
                continue  # No thread name link in this row.

            href = anchor.xpath("@href").extract_first()

            thread = items.Thread()
            thread['title'] = self.get_text(anchor)
            thread['relativeurl'] = href
            thread['fullurl'] = self.make_url(href)
            thread['threadid'] = self.get_url_param(thread['fullurl'], 'id')

            # The author is only stored when the byline reads "by <name>".
            byline = self.get_text(row.css("td:first-child span.byuser"))
            found = re.match("by (.+)", byline)
            if found:
                thread['author_username'] = found.group(1)

            lastpost_text = self.get_text(row.css("td:last-child a"))
            thread['last_update'] = self.parse_timestr(lastpost_text)
            thread['replies'] = self.get_text(row.css("td:nth-child(2)"))
            thread['views'] = self.get_text(row.css("td:nth-child(3)"))

            yield thread
            yield self.make_request('thread', url=href)

        page_hrefs = response.css("#brdmain .pagelink a::attr(href)").extract()
        for page_href in page_hrefs:
            yield self.make_request('threadlisting', url=page_href)
Пример #3
0
 def parse_threadlisting(self, response):
     """Yield an ``items.Thread`` for each collectable row under ``#brdmain``.

     Rows containing a ``movedtext`` span and rows without a link in their
     first cell are logged and skipped, so no empty item is emitted for
     them.  A thread whose link text is empty is still collected, with the
     placeholder title "[Untitled thread]".
     """
     for line in response.css("#brdmain tbody tr"):
         if line.xpath(".//span[@class='movedtext']"):
             self.logger.warning("Thread was moved. Not collected.")
             continue

         # First link of the first cell, or None if the cell has no link.
         threadlinkobj = next(iter(line.css("td:first-child a") or []), None)
         if not threadlinkobj:
             self.logger.warning("no threadlinkobj")
             continue

         last_post_time = self.parse_datetime(
             self.get_text(line.css("td:last-child a")))
         threadlinkhref = threadlinkobj.xpath("@href").extract_first()

         threaditem = items.Thread()
         threaditem['title'] = self.get_text(threadlinkobj)
         if threaditem['title'] == '':
             threaditem['title'] = "[Untitled thread]"
             self.logger.warning(
                 "Encountered a thread with no title at %s. Inserted %s as title."
                 % (response.url, threaditem['title']))
         threaditem['relativeurl'] = threadlinkhref
         threaditem['fullurl'] = self.make_url(threadlinkhref)
         threaditem['threadid'] = self.get_url_param(
             threaditem['fullurl'], 'id')

         # The author is only stored when the byline reads "by <name>".
         byuser = self.get_text(line.css("td:first-child span.byuser"))
         m = re.match("by (.+)", byuser)
         if m:
             threaditem['author_username'] = m.group(1)

         threaditem['last_update'] = last_post_time
         threaditem['replies'] = self.get_text(line.css("td:nth-child(2)"))
         threaditem['views'] = self.get_text(line.css("td:nth-child(3)"))
         yield threaditem
Пример #4
0
    def parse_threadlisting(self, response):
        """Yield one ``items.Thread`` per row of the ``#vf`` table.

        Any exception raised while building a row's item is logged as an
        error together with the page URL, and that row is skipped.
        """
        for row in response.css('#vf table tbody tr'):
            try:
                first_cell_links = row.css('td:first-child a')
                href = first_cell_links.xpath('@href').extract_first()
                lastpost_text = self.get_text(row.css('td:last-child a'))

                item = items.Thread()
                item['threadid'] = self.get_url_param(href, 'id')
                # Title is the text of the first link pointing at "viewtopic".
                title_nodes = row.xpath(
                    ".//a[contains(@href, 'viewtopic')]/text()")
                item['title'] = title_nodes.extract_first()
                item['author_username'] = self.get_text(
                    row.css('td:first-child span.byuser span'))
                item['last_update'] = self.parse_timestr(lastpost_text)
                item['relativeurl'] = href
                item['fullurl'] = self.make_url(href)
                item['replies'] = self.get_text(row.css('td:nth-child(2)'))
                item['views'] = self.get_text(row.css('td:nth-child(3)'))

                yield item

            except Exception as e:
                self.logger.error(
                    "Cannot parse thread item at %s (Error: %s)" %
                    (response.url, e))
Пример #5
0
    def parse_threadlisting(self, response):
        """Yield a thread item, and optionally a user item, per topic entry.

        Each ``ul.row.big-list.zebra > li`` element produces an
        ``items.Thread``.  When the entry's author link carries a
        ``data-flair`` attribute, an ``items.User`` is yielded right after it.
        """
        for entry in response.css('ul.row.big-list.zebra > li'):
            thread = items.Thread()
            thread['title'] = self.get_text(entry.css("div.main > div > a"))

            link = entry.css("div.main > div > a::attr(href)").extract_first()
            thread['relativeurl'] = self.get_relative_url(link)
            if link != "":
                thread['fullurl'] = self.make_url(link)
            thread['threadid'] = self.get_thread_id(link)

            poster = entry.css(
                "div.main > div > span a::text").extract_first("").strip()
            thread['author_username'] = poster

            # Anything that is not a plain run of digits counts as 0 replies.
            reply_text = self.get_text(
                entry.css("div.main > div > span strong:last-child"))
            thread['replies'] = (
                reply_text if re.match(r'^\d+$', reply_text) else 0)
            yield thread

            flair = entry.css(
                "div.main > div > span a::attr(data-flair)").extract_first()
            if flair is None:
                continue

            profile_href = entry.css(
                "div.main > div > span a::attr(href)").extract_first("")
            user = items.User()
            user["username"] = poster
            user["flair"] = flair.strip()
            user['fullurl'] = profile_href.strip()
            user["relativeurl"] = self.get_relative_url(user['fullurl'])
            yield user
    def parse_board(self, response):
        """Yield thread items and follow-up requests for a board page.

        For each row of the ``#messageindex`` table this yields, in order:
        the ``items.Thread`` built from the row, a 'board' request per
        ``.pagelinks a.navPages`` link of the page, a 'userprofile' request
        per profile link in the row, and a 'thread' request per topic link in
        the row whose href does not contain "#new".

        Any exception raised while handling a row is logged and re-raised.
        """
        for threadline in response.css('#messageindex table tbody tr'):
            try:
                threaditem = items.Thread()

                threadcell = threadline.css(".subject")
                authorlink = threadcell.xpath(
                    ".//p[contains(., 'Started by')]").css('a')
                threadlink = threadcell.xpath(
                    './/span[contains(@id, "msg_")]/a')

                threaditem['author_username'] = self.get_text_first(authorlink)
                threadurl = threadlink.xpath("@href").extract_first()

                # Raw strings keep the regex escapes (\?, \d) out of Python's
                # own string-escape processing.
                m = re.search(r"\?topic=(\d+)", threadurl)
                if m:
                    threaditem['threadid'] = m.group(1).strip()
                threaditem['title'] = self.get_text(threadlink)
                threaditem['relativeurl'] = threadurl
                threaditem['fullurl'] = self.make_url(threadurl)

                # Last update: the text before " by " in the last-post cell.
                lastpost_str = self.get_text(threadline.css(".lastpost"))
                m = re.search("(.+) by (.+)", lastpost_str)
                if m:
                    threaditem['last_update'] = self.parse_timestr(m.group(1))

                # Stats cell
                statcellcontent = self.get_text(threadline.css("td.stats"))
                m = re.search(r"(\d+) Replies [^\d]+(\d+) Views",
                              statcellcontent)
                if m:
                    threaditem['replies'] = m.group(1)
                    threaditem['views'] = m.group(2)

                yield threaditem

                # NOTE(review): these links are taken from the whole response,
                # so the same 'board' requests are yielded again for every
                # row -- confirm whether that repetition is intended.
                for pagelink in response.css(".pagelinks a.navPages"):
                    yield self.make_request(
                        'board', url=pagelink.xpath("@href").extract_first())

                for userlink in threadline.xpath(
                        './/a[contains(@href, "action=profile")]'):
                    u = userlink.xpath("@href").extract_first()
                    yield self.make_request('userprofile',
                                            url=u,
                                            relativeurl=u)

                # Reading threaditem['threadid'] fails when the topic regex
                # above did not match; that error is logged and re-raised by
                # the handler below.
                for topiclink in threadline.xpath(
                        './/a[contains(@href, "?topic=") and not(contains(@href, "#new"))]'
                ):
                    yield self.make_request(
                        'thread',
                        url=topiclink.xpath("@href").extract_first(),
                        threadid=threaditem['threadid'])
            except Exception as e:
                self.logger.error("Cannot parse thread item : %s" % e)
                raise
Пример #7
0
 def parse_threadlisting(self, response):
     """Yield one ``items.Thread`` per ``tr.inline_row`` element of the page."""
     for row in response.xpath('.//tr[@class="inline_row"]'):
         thread = items.Thread()
         thread['title'] = row.xpath(
             './/span[contains(@id, "tid")]/a/text()').extract_first()
         href = row.xpath(
             './/span[contains(@id, "tid")]/a/@href').extract_first()
         thread['relativeurl'] = href
         thread['fullurl'] = self.make_url(href)
         # The first run of digits in the href is used as the thread id.
         thread['threadid'] = re.search('([0-9]+)', href).group(1)
         thread['author_username'] = row.xpath(
             './/div[contains(@class, "author")]/a/text()').extract_first()

         # Strip every non-digit character from the counters.
         reply_text = row.xpath('.//td[4]/a/text()').extract_first()
         thread['replies'] = re.sub('[^0-9]', '', reply_text)
         view_text = row.xpath('.//td[5]/text()').extract_first()
         thread['views'] = re.sub('[^0-9]', '', view_text)

         # Last update: first text node of the "lastpost" span.
         lastpost_text = row.xpath(
             './/span[contains(@class, "lastpost")]/text()[1]'
         ).extract_first()
         thread['last_update'] = self.parse_datetime(lastpost_text)

         yield thread
    def parse_threadlisting(self, response):
        """Yield one ``items.Thread`` per ``#content tr.inline_row`` row.

        A row without a thread link whose ``td/em`` text is 'Deleted Thread'
        is reported with a warning and not yielded.  Any exception raised
        while handling a row is logged as a warning and the row is skipped.
        """
        for row in response.css("#content tr.inline_row"):
            try:
                item = items.Thread()
                link = row.css("td:nth-child(3)").xpath(
                    ".//span[contains(@id, 'tid_')]/a")
                item['title'] = self.get_text(link)

                # Deleted threads have no link, only an <em> marker.
                marker = row.xpath('.//td/em/text()').extract_first()
                if not link and marker == 'Deleted Thread':
                    self.logger.warning(
                        "A deleted thread was not collected from %s." %
                        response.url)
                    continue

                href = link.xpath('@href').extract_first()
                lastpost_text = self.get_text(
                    row.css("td:last-child span.lastpost"))
                # The timestamp is whatever precedes "Ultimo" in that cell.
                found = re.search("(.+)Ultimo", lastpost_text)
                if found:
                    updated_at = self.parse_timestr(found.group(1), response)
                else:
                    updated_at = None

                item['threadid'] = self.get_url_param(href, 'tid')
                item['relativeurl'] = href
                item['fullurl'] = self.make_url(href)
                item['author_username'] = self.get_text(
                    row.css("td:nth-child(3) div.author a"))
                item['last_update'] = updated_at
                item['replies'] = self.get_text(row.css("td:nth-child(4) a"))
                item['views'] = self.get_text(row.css("td:nth-child(5)"))

                yield item

            except Exception as e:
                self.logger.warning(
                    "Cannot parse thread item at URL %s because %s" %
                    (response.url, e))
    def parse_threadlisting(self, response):
        """Yield thread items, thread requests and pagination requests.

        Each ``li.discussionListItem`` produces an ``items.Thread`` followed
        by a 'thread' request for its link.  After all entries, one
        'threadlisting' request is yielded per ``.PageNav nav a`` href.

        ``last_update`` is taken from the ``abbr.DateTime`` text when present,
        otherwise from the ``span.DateTime`` text.  When neither exists the
        field is left unset and a warning is logged.
        """
        for line in response.css("div.discussionList li.discussionListItem"):
            threaditem = items.Thread()

            threadlink = line.css(
                "div.main h3.title a::attr(href)").extract_first()
            threadid = self.read_threadid_from_url(threadlink)

            threaditem['title'] = self.get_text(
                line.css("div.main h3.title a"))
            threaditem['author_username'] = line.xpath(
                '@data-author').extract_first()
            threaditem['replies'] = self.get_text(
                line.css("div.stats .major dd"))
            threaditem['views'] = self.get_text(
                line.css("div.stats .minor dd"))

            # last_update comes in two formats with different layout.
            short_timestring = line.xpath(
                ".//span[@class='DateTime']/text()").extract_first()
            long_timestring = line.xpath(
                ".//abbr[@class='DateTime']/text()").extract_first()
            if long_timestring is not None:
                threaditem['last_update'] = self.parse_datetime(
                    long_timestring)
            elif short_timestring is not None:
                threaditem['last_update'] = self.parse_datetime(
                    short_timestring)
            else:
                # Neither format is present: do not hand None to the parser.
                self.logger.warning(
                    "Couldn't get the correct time for the last update of post at %s."
                    % response.url)

            threaditem['relativeurl'] = threadlink
            threaditem['fullurl'] = self.make_url(threadlink)
            threaditem['threadid'] = threadid

            yield threaditem
            yield self.make_request('thread',
                                    url=threadlink,
                                    threadid=threadid)

        for link in response.css(".PageNav nav a::attr(href)").extract():
            yield self.make_request('threadlisting', url=link)
Пример #10
0
    def parse_threadlisting(self, response):
        """Yield requests and thread items for a discussion list page.

        For each ``li.discussionListItem`` this yields a 'userprofile'
        request for the author link, a 'threadpage' request for the thread
        link, and then the ``items.Thread`` itself.  An entry that raises is
        logged with its traceback and skipped.  After all entries, one
        'threadlisting' request is yielded per ``div.PageNav nav a`` href.
        """
        for threaddiv in response.css("li.discussionListItem"):
            try:
                threaditem = items.Thread()
                last_message_datestr = threaddiv.css(
                    ".lastPostInfo .DateTime::text").extract_first()
                threaditem['last_update'] = self.to_utc(
                    AlphabayDatetimeParser.tryparse(last_message_datestr))

                link = threaddiv.css(".title a.PreviewTooltip")
                threadurl = link.xpath("@href").extract_first()
                threaditem['relativeurl'] = threadurl
                threaditem['fullurl'] = self.make_url(threadurl)
                threaditem['title'] = self.get_text_first(link)
                threaditem['author_username'] = self.get_text_first(
                    threaddiv.css(".username"))
                threaditem['threadid'] = self.read_threadid_from_url(threadurl)

                author_url = threaddiv.css(
                    ".username::attr(href)").extract_first()

                yield self.make_request('userprofile', url=author_url)
                yield self.make_request(
                    'threadpage',
                    url=threadurl,
                    threadid=threaditem['threadid'])  # First page of thread

                yield threaditem  # Sends data to the pipeline

            except Exception as e:
                # Format the exception itself: Python 3 exceptions have no
                # ``message`` attribute, so reading it would raise here.
                self.logger.error(
                    "Failed parsing response for threadlisting at %s. Error is %s.\n Skipping thread\n %s"
                    % (response.url, e, traceback.format_exc()))

        # Parse next page.
        for link in response.css("div.PageNav nav a::attr(href)").extract():
            yield self.make_request(reqtype='threadlisting', url=link)
Пример #11
0
 def parse_threadlisting(self, response):
     """Yield an ``items.Thread`` for each linked row of ``#punviewforum``.

     Rows whose class attribute contains "inone" are excluded by the
     selector.  Rows without a link in their first cell are logged and
     skipped, so no empty item is emitted and their last cell is never
     handed to the time parser.
     """
     for line in response.css('#punviewforum tbody tr:not([class*="inone"])'):
         # First link of the first cell, or None if the cell has no link.
         threadlinkobj = next(iter(line.css("td:first-child a") or []), None)
         if not threadlinkobj:
             self.logger.warning(
                 "Couldn't yield thread. Please review: %s" % response.url)
             continue

         threadlinkhref = threadlinkobj.xpath("@href").extract_first()

         threaditem = items.Thread()
         threaditem['title'] = self.get_text(threadlinkobj)
         threaditem['relativeurl'] = threadlinkhref
         threaditem['fullurl'] = self.make_url(threadlinkhref)
         threaditem['threadid'] = self.get_url_param(
             threaditem['fullurl'], 'id')

         # The author is only stored when the byline reads "by <name>".
         byuser = self.get_text(line.css("td:first-child span.byuser"))
         m = re.match("by (.+)", byuser)
         if m:
             threaditem['author_username'] = m.group(1)

         threaditem['last_update'] = self.parse_timestr(
             self.get_text(line.css("td:last-child a")))
         threaditem['replies'] = self.get_text(line.css("td:nth-child(2)"))
         threaditem['views'] = self.get_text(line.css("td:nth-child(3)"))
         yield threaditem
Пример #12
0
    def parse_threadlisting(self, response):
        """Yield one ``items.Thread`` per row of the ``#messageindex`` table.

        ``threadid``, ``last_update``, ``replies`` and ``views`` are only set
        when their source text matches the expected pattern.  Any exception
        raised while handling a row is logged and re-raised.
        """
        for threadline in response.css('#messageindex table tbody tr'):

            try:
                threaditem = items.Thread()

                threadcell = threadline.css(".subject")
                authorlink = threadcell.xpath(
                    ".//p[contains(., 'Started by')]").css('a')
                threadlink = threadcell.xpath(
                    './/span[contains(@id, "msg_")]/a')

                threaditem['author_username'] = self.get_text_first(authorlink)
                threadurl = threadlink.xpath("@href").extract_first()

                # Raw strings keep the regex escapes (\?, \d) out of Python's
                # own string-escape processing.
                m = re.search(r"\?topic=(\d+)", threadurl)
                if m:
                    threaditem['threadid'] = m.group(1).strip()
                threaditem['title'] = self.get_text(threadlink)
                threaditem['relativeurl'] = threadurl
                threaditem['fullurl'] = self.make_url(threadurl)

                # Last update: the text node following the link in the
                # last-post cell, unless it contains "N/A".
                lastpost_str = threadline.xpath(
                    'td[contains(@class, "lastpost")]/a/following-sibling::text()'
                ).extract_first()
                if lastpost_str and "N/A" not in lastpost_str:
                    threaditem['last_update'] = self.parse_timestr(
                        lastpost_str.strip())

                # Stats cell: replies are read from its first text node and
                # views from its second.
                statcellcontent = threadline.xpath(
                    'td[contains(@class, "stats")]//text()').extract()
                m1 = re.search(r"(\d+) Replies", statcellcontent[0])
                if m1:
                    threaditem['replies'] = m1.group(1)

                m2 = re.search(r"[^\d]+(\d+) Views", statcellcontent[1])
                if m2:
                    threaditem['views'] = m2.group(1)

                yield threaditem

            except Exception as e:
                self.logger.error("Cannot parse thread item : %s" % e)
                raise
Пример #13
0
    def parse_thread_listing(self, response):
        """Yield thread items, thread requests and next-page requests.

        Only rows of ``.table.forum`` with exactly four cells and a thread
        link in the second cell are processed; each yields an
        ``items.Thread`` followed by a 'thread' request.  An exception in a
        row is logged as a warning and the row is skipped.  After all rows,
        a 'threadlisting' request is yielded per ``a.paginate[rel='next']``.
        """
        for row in response.css('.table.forum > tbody > tr'):
            try:
                tds = row.css('td')
                if len(tds) != 4:
                    continue

                subject_cell = tds[1]
                href = subject_cell.css('h4 div a::attr(href)').extract_first()
                if not href:
                    continue

                item = items.Thread()
                item['title'] = subject_cell.css(
                    'h4 div a::text').extract_first()
                item['relativeurl'] = href
                item['fullurl'] = self.make_url(href)
                item['threadid'] = self.get_id_from_url(href)

                author_link = subject_cell.css('h4 div small a')
                if author_link:
                    item['author_username'] = author_link.css(
                        '::text').extract_first().strip()
                else:
                    # No author link: look for " ago by <name>" in the text.
                    byline = ''.join(
                        subject_cell.xpath('.//h4/div/small//text()').extract())
                    found = (re.search(" ago by (.+)", byline)
                             if byline else None)
                    if found:
                        item['author_username'] = found.group(1).strip()

                # The update time is relative ("XX days ago"), not an exact
                # timestamp, so only the date of the parsed value is stored.
                small_texts = tds[3].css('small::text').extract()
                item['last_update'] = self.parse_datetime(
                    small_texts[-1]).date()
                item['replies'] = tds[2].css('::text').extract_first()
                yield item

                yield self.make_request('thread', url=href, shared=True)

            except Exception as ex:
                self.logger.warning("Error in retrieving theads. %s at URL %s" % (ex, response.url))

        next_hrefs = response.css(
            "a.paginate[rel='next']::attr(href)").extract()
        for next_href in next_hrefs:
            yield self.make_request('threadlisting', url=next_href, shared=True)
Пример #14
0
    def parse_thread_listing(self, response):
        """Yield one ``items.Thread`` per ``ul.row.big-list.zebra > li``."""
        for entry in response.css('ul.row.big-list.zebra > li'):
            thread = items.Thread()
            thread['title'] = self.get_text(entry.css("div.main > div > a"))

            link = entry.css("div.main > div > a::attr(href)").extract_first()
            thread['relativeurl'] = link
            thread['fullurl'] = self.make_url(link)
            thread['threadid'] = self.get_thread_id(link)
            thread['author_username'] = entry.css(
                "div.main > div > span a::text").extract_first()

            # Anything that is not a plain run of digits counts as 0 replies.
            reply_text = self.get_text(
                entry.css("div.main > div > span strong:last-child"))
            thread['replies'] = (
                reply_text if re.match(r'^\d+$', reply_text) else 0)

            yield thread
    def parse_threadlisting(self, response):
        """Yield one ``items.Thread`` per titled ``tr.inline_row`` row.

        Rows with an empty title are skipped.  ``last_update`` and
        ``author_username`` are both derived from the last-post cell; when
        either extraction fails, a warning is logged and that field is left
        unset.
        """
        flags = re.M | re.I | re.S
        for row in response.css("div.wrapper table tr.inline_row"):
            title = self.get_text(row.xpath("td[3]/div/span/span/a"))
            if title == "":
                continue

            item = items.Thread()
            item['title'] = title
            item['replies'] = self.get_text(row.css("td:nth-child(4)"))
            item['views'] = self.get_text(row.css("td:nth-child(5)"))
            href = row.xpath("td[3]/div/span/span/a/@href").extract_first()
            item['relativeurl'] = href
            item['fullurl'] = self.make_url(href)

            lastpost_text = self.get_text(
                row.css("td:nth-child(6) span.lastpost"))

            # Timestamp: everything before "last " in the last-post text.
            try:
                stamp = re.search("(.*)last ", lastpost_text, flags).group(1)
                item['last_update'] = self.parse_datetime(stamp.strip())
            except Exception as e:
                self.logger.warning("last_update %s error %s" %
                                    (response.url, e))

            # Username: everything after "post:" in the same text.
            try:
                poster = re.search("post:(.*)", lastpost_text, flags).group(1)
                item['author_username'] = poster.strip()
            except Exception as e:
                self.logger.warning("author_username %s error value %s" %
                                    (response.url, e))

            item['threadid'] = self.get_url_param(item['fullurl'], 'tid')

            yield item
Пример #16
0
    def parse_threadlisting(self, response):
        """Yield one ``items.Thread`` per ``ul.topiclist.topics`` list item."""
        for entry in response.xpath('//ul[@class="topiclist topics"]/li'):
            thread = items.Thread()
            title = entry.xpath(
                './/a[@class="topictitle"]/text()').extract_first()
            lastpost_text = entry.xpath(
                './/a[@title="Go to last post"]/text()').extract_first()
            updated_at = self.parse_timestr(lastpost_text)

            href = entry.xpath(
                './/a[@class="topictitle"]/@href').extract_first()
            thread['title'] = title
            thread['relativeurl'] = href
            thread['fullurl'] = self.make_url(href)
            # The thread id is whatever follows the last "&t=" in the href.
            thread['threadid'] = href.split('&t=')[-1]
            thread['author_username'] = entry.xpath(
                './/a[contains(@class, "username")]/text()').extract_first()
            thread['last_update'] = updated_at

            post_count = entry.xpath(
                './/dd[@class="posts"]/text()').extract_first()
            thread['replies'] = post_count.strip()
            view_count = entry.xpath(
                './/dd[@class="views"]/text()').extract_first()
            thread['views'] = view_count.strip()

            yield thread
 def parse_threadlisting(self, response):
     """Yield one ``items.Thread`` per ``ul.topiclist.topics li.row`` entry.

     Any exception raised while building an entry's item is logged as a
     warning and that entry is skipped.
     """
     for entry in response.css("ul.topiclist.topics li.row"):
         try:
             title_link = entry.css("dt div.list-inner > a")
             href = title_link.xpath('@href').extract_first()

             thread = items.Thread()
             thread['title'] = self.get_text(title_link)
             thread['relativeurl'] = href
             thread['fullurl'] = self.make_url(href)
             thread['threadid'] = self.get_url_param(thread['fullurl'], 't')
             thread['author_username'] = entry.css(
                 'div.topic-poster a::text').extract_first()

             post_count = entry.css('dd.posts *::text').extract_first()
             thread['replies'] = post_count.strip()
             view_count = entry.css('dd.views *::text').extract_first()
             thread['views'] = view_count.strip()
             yield thread
         except Exception as e:
             self.logger.warning("Invalid thread listing page. %s" % e)