def parse_item(self, response):
        """Build one ``PostItemsList`` per post found on a thread page.

        Responses whose URL does not contain ``"threads"`` are ignored and the
        method returns ``None`` for them.  An ``IndexError`` propagates when
        the thread title, a post's author name or its profile link is missing.
        """
        # Little trick that allows the spider to use only two rules:
        # anything that is not a thread page is skipped here.
        if "threads" not in response.url:
            return None

        post_nodes = response.xpath('//ol[@id="posts"]/li')
        page_url = response.url
        thread_title = response.xpath(
            '//span[@class="threadtitle"]//text()').extract()[0]

        collected = []
        for node in post_nodes:
            entry = PostItemsList()
            username = node.xpath(
                './/a[contains(@class, "username")]//text()').extract()[0]
            profile_href = node.xpath(
                './/a[contains(@class, "username")]/@href').extract()[0]
            # Date fragments are stripped and joined with single spaces.
            date_bits = node.xpath('.//span[@class="date"]//text()').extract()
            posted_on = u" ".join([bit.strip() for bit in date_bits])
            # Message fragments are stripped and concatenated directly.
            text_bits = node.xpath(
                './/div[@class="content"]//text()').extract()
            body = u"".join([bit.strip() for bit in text_bits])

            for field, value in (
                    ("author", username),
                    ("author_link", profile_href),
                    ("create_date", posted_on),
                    ("post", body),
                    ("tag", "epilepsy"),
                    ("topic", thread_title),
                    ("url", page_url)):
                entry[field] = value

            logging.info(str(entry))
            collected.append(entry)

        return collected
    def parsePostsList(self, response):
        """Convert every ``post bg1`` / ``post bg2`` block into an item.

        Returns a list of ``PostItemsList`` objects, one per matched post.
        ``IndexError`` propagates if the topic heading, the author link or
        the second text node of the author line is absent.
        """
        post_nodes = response.xpath(
            '//div[@class="post bg2"] | //div[@class="post bg1"]')
        page_url = response.url
        topic_title = response.xpath(
            '//div[@id="page-body"]/h2/a/text()').extract()[0]

        collected = []
        for node in post_nodes:
            username = node.xpath(
                './/p[@class="author"]/strong/a/text()').extract()[0]
            profile_href = node.xpath(
                './/p[@class="author"]/strong/a/@href').extract()[0]

            # The second text node of the author line is used as the date;
            # a " \xbb" sequence inside it is removed.
            posted_on = node.xpath(
                './/p[@class="author"]/text()').extract()[1].replace(
                    u" \xbb", u"")

            text_bits = node.xpath(
                './/div[@class="content"]//text()').extract()
            body = cleanText(" ".join(text_bits))

            entry = PostItemsList()
            for field, value in (
                    ('author', username),
                    ('author_link', profile_href),
                    ('create_date', posted_on),
                    ('post', body),
                    ('tag', 'epilepsy'),
                    ('topic', topic_title),
                    ('url', page_url)):
                entry[field] = value

            collected.append(entry)
        return collected
예제 #3
0
 def parsePostsList(self, response):
     """Return one ``PostItemsList`` per ``a.username`` link on the page.

     The n-th username link is paired with the n-th
     ``div.post-content-inner`` block.  ``create_date`` is the first eleven
     characters of that block's first ``<span>`` text and ``post`` is the
     text of its ``div.field-item.even`` child.  ``topic`` is stored as the
     list returned by ``extract()``.

     Raises:
         IndexError: if there are fewer content blocks than username links.
     """
     soup = BeautifulSoup(response.body)
     # Scan the document once instead of re-running findAll() several times
     # per post, which made the loop quadratic in the number of posts.
     users = soup.findAll('a', {'class': 'username'})
     contents = soup.findAll('div', {'class': 'post-content-inner'})
     topic = response.xpath('//h1/text()').extract()
     url = response.url

     items = []
     for x, user in enumerate(users):
         content = contents[x]
         item = PostItemsList()
         item['author'] = user.text
         item['author_link'] = user['href']
         item['create_date'] = content.span.text[0:11]
         item['post'] = content.find(
             'div', {'class': 'field-item even'}).text
         item['tag'] = 'cancer'
         item['topic'] = topic
         item['url'] = url
         # Log the item itself; ``item.__str__`` without the call only
         # logged the bound-method object.
         logging.info("%s", item)
         items.append(item)
     return items
예제 #4
0
    def parsePostsList(self, response):
        """Collect an item for each ``postbit postbitim postcontainer old`` li.

        ``topic`` is stored as the raw list returned by ``extract()`` on the
        last breadcrumb entry.  The author link is made absolute with
        ``response.urljoin``.  ``IndexError`` propagates when a post has no
        author name or link.
        """
        page_url = response.url
        topic_bits = response.xpath(
            '//li[@class="navbit lastnavbit"]/span/text()').extract()
        post_nodes = response.xpath(
            '//li[@class="postbit postbitim postcontainer old"]')

        collected = []
        for node in post_nodes:
            entry = PostItemsList()
            username = node.xpath(
                './/a[contains(@class, "username")]/strong/text()'
            ).extract()[0]
            profile_href = response.urljoin(node.xpath(
                './/a[contains(@class, "username")]/@href').extract()[0])
            date_bits = node.xpath('.//span[@class="date"]//text()').extract()
            posted_on = " ".join(date_bits)
            text_bits = node.xpath(
                './/div[contains(@id, "post_message_")]//text()').extract()
            body = cleanText(" ".join(text_bits))

            for field, value in (
                    ('author', username),
                    ('author_link', profile_href),
                    ('create_date', posted_on),
                    ('post', body),
                    ('tag', 'epilepsy'),
                    ('topic', topic_bits),
                    ('url', page_url)):
                entry[field] = value

            collected.append(entry)
        return collected
예제 #5
0
    def topic_parse(self, response):
        """Print the page URL and return one item per ``table[id*=post]``.

        The author is the stripped first text node of the ``postmenu`` div,
        the date is the stripped second text node under ``td.thead`` and
        ``author_link`` is always the placeholder ``"*"``.  ``IndexError``
        propagates if any of those nodes is missing.
        """
        print(response.url)

        topic_title = response.xpath(
            '//div[@class="navbar"]/strong/text()').extract()[0].strip()
        page_url = response.url

        collected = []
        for node in response.xpath('//table[contains(@id, "post")]'):
            entry = PostItemsList()
            username = node.xpath(
                './/div[contains(@id, "postmenu")]/text()'
            ).extract()[0].strip()
            posted_on = node.xpath(
                './/td[@class="thead"]//text()').extract()[1].strip()
            text_bits = node.xpath(
                './/div[contains(@id, "post_message_")]//text()').extract()
            body = ''.join(text_bits).strip()

            for field, value in (
                    ('author', username),
                    ('author_link', "*"),
                    ('create_date', posted_on),
                    ('post', body),
                    ('tag', 'epilepsy'),
                    ('topic', topic_title),
                    ('url', page_url)):
                entry[field] = value

            collected.append(entry)
        return collected
    def topic_parse(self, response):
        """Return an item for every second table row of the page.

        The topic is the part of ``<title>`` before the first ``|``.  Rows are
        taken from ``//table//tr`` with the slice ``[1:-1:2]``.
        ``IndexError`` propagates if a selected row lacks the author cell or
        the "Posted" span.
        """
        page_title = response.xpath('//title/text()').extract()[0]
        topic_title = page_title.split('|')[0]
        page_url = response.url
        # Skip the first and last row and keep every other row in between.
        row_nodes = response.xpath('//table//tr')[1:-1:2]

        collected = []
        for row in row_nodes:
            entry = PostItemsList()
            username = row.xpath(
                './/td[@class="xar-norm author"]/*/a/text()').extract()[0]
            profile_href = row.xpath(
                './/td[@class="xar-norm author"]/*/a/@href').extract()[0]
            posted_on = row.xpath(
                './/span[@class="xar-sub"][contains(text(), "Posted")]/text()'
            ).extract()[0]
            text_bits = row.xpath('.//div[2]/p//text()').extract()
            body = ''.join(text_bits)

            for field, value in (
                    ('author', username),
                    ('author_link', profile_href),
                    ('create_date', posted_on),
                    ('post', body),
                    ('tag', 'epilepsy'),
                    ('topic', topic_title),
                    ('url', page_url)):
                entry[field] = value

            collected.append(entry)
        return collected
예제 #7
0
    def parse_item(self, response):
        """Return the forum node followed by its comments as items.

        The first list entry is built from ``table.node.node-forum``; each
        ``table.comment.comment-forum`` adds one more.  ``author_link`` is
        always the placeholder ``'*'``.  ``IndexError`` propagates when the
        heading, an author or a comment date is missing.
        """
        opening = PostItemsList()
        topic_title = response.xpath(
            '//div[@class="left-corner"]/h2/text()').extract()[0]
        page_url = response.url

        def fill(entry, who, when, text):
            # Populate the seven item fields in a fixed order.
            entry['author'] = who
            entry['author_link'] = '*'
            entry['create_date'] = when
            entry['post'] = text
            entry['tag'] = 'epilepsy'
            entry['topic'] = topic_title
            entry['url'] = page_url
            return entry

        node = response.xpath('//table[@class="node node-forum"]')
        node_author = node.xpath(
            './/div[@class="author"]/text()').extract()[0]
        node_time = u''.join(
            node.xpath('.//div[@class="date"]//text()').extract()).strip()
        node_text = u''.join(
            node.xpath('.//div[@class="content"]//text()').extract())
        comment_nodes = response.xpath(
            '//table[@class="comment comment-forum"]')

        collected = [fill(opening, node_author, node_time, node_text)]

        for comment in comment_nodes:
            entry = PostItemsList()
            who = comment.xpath(
                './/div[@class="author"]/text()').extract()[0]
            # Comments use only the first date text node, unstripped.
            when = comment.xpath(
                './/div[@class="date"]//text()').extract()[0]
            text = u''.join(comment.xpath(
                './/div[@class="content"]//text()').extract()).strip()
            collected.append(fill(entry, who, when, text))
        return collected
	def get_sub_data(self,response):
		"""Yield one item describing the ``discussion_topic`` table of a page.

		The author name has tabs removed and commas replaced by spaces; the
		profile link is prefixed with ``http://www.dailystrength.org``; the
		posting date has commas and the words ``Posted on`` removed; the post
		text has commas, tabs, double spaces and newlines removed.  Only the
		first match of every XPath is used.

		The scraped values are kept as the text objects returned by
		``extract()``.  Wrapping them in ``str()`` is a no-op on Python 3 and
		raises ``UnicodeEncodeError`` on Python 2 as soon as a name or post
		contains a non-ASCII character.

		Raises:
			IndexError: if any of the expected elements is missing.
		"""
		logging.info("get_sub_data")
		author_name_xpath = "//table[@class='discussion_topic']//p[@class='username']/a/text()"
		author_link_xpath = "//table[@class='discussion_topic']//p[@class='username']/a/@href"
		author_posted_xpath = "//table[@class='discussion_topic']//div/span[@class='graytext']/text()"
		author_all_text_xpath = "//table[@class='discussion_topic']//div[@class='discussion_text longtextfix485']/text()"

		author_name = response.xpath(author_name_xpath).extract()[0]
		author_name = author_name.replace("\t","").replace(',',' ')

		author_link = response.xpath(author_link_xpath).extract()[0]
		author_link = "http://www.dailystrength.org%s"%author_link

		author_posted = response.xpath(author_posted_xpath).extract()[0]
		author_posted = author_posted.replace(',','').replace('Posted on','')

		author_all_text = response.xpath(author_all_text_xpath).extract()[0]
		# Order matters: double spaces are collapsed before newlines go.
		for unwanted in (',', '\t', '  ', '\n'):
			author_all_text = author_all_text.replace(unwanted,'')

		topic = response.xpath("//div[contains(@class,'discussion_topic_header_subject')]/text()").extract()[0]

		item = PostItemsList()

		item['author'] = author_name
		item['author_link'] = author_link
		item['condition']="chronic lymphocytic leukemia"
		item['create_date'] = author_posted
		item['post'] = author_all_text
		item['topic'] = topic
		item['url'] = response.url
		print(author_all_text)
		logging.info(item.__str__())
		yield item
    def parse_item(self, response):
        """Build one item per post of a thread page, tagged "breast cancer".

        Pages whose URL lacks ``"threads"`` yield ``None``.  The message goes
        through ``self.cleanText`` and the date through ``self.getDate``;
        ``domain`` is the concatenation of ``self.allowed_domains``.
        ``IndexError`` propagates when the title, an author name or an author
        link is missing.
        """
        # Little trick that allows the spider to use only two rules.
        if "threads" not in response.url:
            return None

        condition = "breast cancer"
        post_nodes = response.xpath('//ol[@id="posts"]/li')
        page_url = response.url
        thread_title = response.xpath(
            '//span[@class="threadtitle"]//text()').extract()[0]

        collected = []
        for node in post_nodes:
            entry = PostItemsList()
            username = node.xpath(
                './/a[contains(@class, "username")]//text()').extract()[0]
            profile_href = node.xpath(
                './/a[contains(@class, "username")]/@href').extract()[0]

            # Date fragments: strip each, join with single spaces.
            date_bits = node.xpath('.//span[@class="date"]//text()').extract()
            posted_on = u" ".join([bit.strip() for bit in date_bits])
            # Message fragments: strip each, concatenate, then clean.
            text_bits = node.xpath(
                './/div[@class="content"]//text()').extract()
            body = self.cleanText(u"".join([bit.strip() for bit in text_bits]))

            entry['author'] = username
            entry['author_link'] = profile_href
            entry['condition'] = condition
            entry['create_date'] = self.getDate(posted_on)
            entry['domain'] = "".join(self.allowed_domains)
            entry['post'] = body
            entry['topic'] = thread_title
            entry['url'] = page_url

            logging.info(str(entry))
            collected.append(entry)

        return collected
    def parsePostsList(self, response):
        """Return the opening post and every comment of a page as items.

        The first item is built from the node title, byline, date and the
        text of all ``div.field-item.even`` elements on the page; one further
        item is produced for each ``<article>`` whose class contains
        ``comment``.  Author links are not scraped, so ``author_link`` is
        always the literal ``"no"``.

        Every post gets its own ``PostItemsList``.  Mutating and re-appending
        a single instance makes all list entries alias the same object, so
        they would all show the data of the last comment.

        Raises:
            IndexError: if the title, a byline/author or a date is missing.
        """
        url = response.url
        subject = response.xpath(
            '//h1[@class="node-title"]/text()').extract()[0]

        def build_item(author, create_date, raw_message):
            # Create a fresh item from one post's scraped fields.
            item = PostItemsList()
            item['author'] = author
            item['author_link'] = "no"
            item['create_date'] = create_date
            item['post'] = cleanText(raw_message)
            item['tag'] = 'epilepsy'
            item['topic'] = subject
            item['url'] = url
            return item

        original_author = response.xpath(
            '//div[@class="node-byline"]/text()').extract()[0]
        original_create_date = response.xpath(
            '//span[@class="node-date"]/span/text()').extract()[0]
        original_message = " ".join(
            response.xpath(
                '//div[@class="field-item even"]//text()').extract())

        items = [build_item(
            original_author, original_create_date, original_message)]

        for post in response.xpath('//article[contains(@class, "comment")]'):
            author = post.xpath(
                './/div[@class="node-author"]/span/text()').extract()[0]
            create_date = post.xpath(
                './/div[@class="node-date"]/time/text()').extract()[0]
            message = " ".join(
                post.xpath(
                    './/div[@class="field-item even"]//text()').extract())
            items.append(build_item(author, create_date, message))
        return items
    def parse_item(self, response):
        """Turn each ``ol#posts > li`` of a thread page into an item.

        Returns ``None`` for URLs that do not contain ``"threads"``, otherwise
        a list of ``PostItemsList`` objects tagged ``'epilepsy'``.
        ``IndexError`` propagates when the thread title or a post's author
        name/link cannot be found.
        """
        # Little trick that allows the spider to use only two rules.
        if "threads" not in response.url:
            return None

        def squash(fragments, glue):
            # Strip every text fragment and join the results with ``glue``.
            return glue.join([fragment.strip() for fragment in fragments])

        post_nodes = response.xpath('//ol[@id="posts"]/li')
        page_url = response.url
        thread_title = response.xpath(
            '//span[@class="threadtitle"]//text()').extract()[0]

        collected = []
        for node in post_nodes:
            entry = PostItemsList()
            username = node.xpath(
                './/a[contains(@class, "username")]//text()').extract()[0]
            profile_href = node.xpath(
                './/a[contains(@class, "username")]/@href').extract()[0]
            posted_on = squash(
                node.xpath('.//span[@class="date"]//text()').extract(), u" ")
            body = squash(
                node.xpath('.//div[@class="content"]//text()').extract(), u"")

            entry['author'] = username
            entry['author_link'] = profile_href
            entry['create_date'] = posted_on
            entry['post'] = body
            entry['tag'] = 'epilepsy'
            entry['topic'] = thread_title
            entry['url'] = page_url

            logging.info(str(entry))
            collected.append(entry)

        return collected
 def parsePostsList(self,response):
     """Return one ``PostItemsList`` per ``.vt_post_holder`` element.

     ``create_date`` is the list of timestamp texts found inside the post,
     ``.vt_first_timestamp`` entries first, then ``.vt_reply_timestamp``.
     The lists are concatenated with ``+`` because ``list.extend()`` returns
     ``None``, which left ``create_date`` empty for every item.  Reply
     timestamps are looked up inside the post instead of on the whole page
     so a post only carries its own timestamps.

     ``post`` is the post body text with tabs, newlines and carriage returns
     removed and remaining whitespace runs collapsed to a single space.

     Raises:
         IndexError: if the page heading, an author name or an author link
             is missing.
     """
     sel = Selector(response)
     posts = sel.css(".vt_post_holder")
     items = []
     topic = response.css('h1.caps').xpath('text()').extract()[0]
     url = response.url
     for post in posts:
         item = PostItemsList()
         author_anchor = post.css('.vt_asked_by_user').xpath("./a")
         item['author'] = author_anchor.xpath("text()").extract()[0]
         item['author_link'] = author_anchor.xpath("@href").extract()[0]

         timestamps = post.css('.vt_first_timestamp').xpath(
             'text()').extract()
         timestamps = timestamps + post.css('.vt_reply_timestamp').xpath(
             'text()').extract()
         item['create_date'] = timestamps

         body = " ".join(post.css('.vt_post_body').xpath('text()').extract())
         body = body.replace("\t","").replace("\n","").replace("\r","")
         item['post'] = re.sub(r'\s+', ' ', body)
         item['tag'] = 'epilepsy'
         item['topic'] = topic
         item['url'] = url
         # Log the item itself; ``item.__str__`` without the call only
         # logged the bound-method object.
         logging.info("%s", item)
         items.append(item)
     return items
    def parse_item(self, response):
        """Return an item for each ``lia-linear-display-message-view`` div.

        The topic is every stripped subject text fragment concatenated.  The
        creation date is the second ``local-date`` text node with U+200E
        removed, a space, and the first ``local-time`` text node.  The author
        link is made absolute with ``response.urljoin``.  ``IndexError``
        propagates when an author, link, date or time node is missing.
        """
        page_url = response.url
        subject_bits = response.xpath(
            '//div[@class="lia-message-subject"]//text()').extract()
        topic_title = ''.join([bit.strip() for bit in subject_bits])
        message_nodes = response.xpath(
            '//div[@class="lia-linear-display-message-view"]')

        collected = []
        for node in message_nodes:
            entry = PostItemsList()
            username = node.xpath(
                './/a[contains(@class, "lia-user-name-link")]//text()'
            ).extract()[0]
            profile_href = response.urljoin(node.xpath(
                './/a[contains(@class, "lia-user-name-link")]/@href'
            ).extract()[0])

            day = node.xpath(
                './/span[@class="local-date"]/text()').extract()[1]
            clock = node.xpath(
                './/span[@class="local-time"]//text()').extract()[0]
            # Remove the U+200E character from the date before joining.
            posted_on = u' '.join([day.replace(u'\u200e', ''), clock])

            text_bits = node.xpath(
                './/div[@class="lia-message-body-content"]//text()').extract()
            body = ''.join(text_bits).strip()

            for field, value in (
                    ('author', username),
                    ('author_link', profile_href),
                    ('create_date', posted_on),
                    ('post', body),
                    ('tag', 'epilepsy'),
                    ('topic', topic_title),
                    ('url', page_url)):
                entry[field] = value

            collected.append(entry)
        return collected
예제 #14
0
    def parse_item(self, response):
        """Return one item per ``div.mbpost`` when the URL contains 'all'.

        For any other URL the method returns ``None``.  A post without a
        ``p/a`` link gets ``'anon'`` as ``author_link``.  The date is taken
        from the third header text node when there are more than two nodes,
        otherwise from the first, and is cut just after the first occurrence
        of ``'on'``.  Only the first text node of ``div.msgContent`` is used
        as the message.  ``IndexError`` propagates on missing nodes.
        """
        if 'all' not in response.url:
            return None

        page_url = response.url
        topic_title = response.xpath(
            '//div[@class="contentText"]/h1/text()').extract()[0]

        collected = []
        for node in response.xpath('//div[@class="mbpost"]'):
            entry = PostItemsList()
            username = node.xpath(
                './/p/a/*[2]/text()|.//div[@class="author"]/p/span/text()'
            ).extract()[0]

            hrefs = node.xpath('.//p/a/@href').extract()
            profile_href = hrefs[0] if hrefs else 'anon'

            header_bits = node.xpath(
                './/div[@class="header"]/p//text()').extract()
            if len(header_bits) > 2:
                stamp = header_bits[2].strip()
            else:
                stamp = header_bits[0]
            posted_on = stamp[stamp.find('on') + 2:].strip()

            first_text = node.xpath(
                './/div[@class="msgContent"]//text()').extract()[0]
            body = u''.join(first_text).strip()

            for field, value in (
                    ('author', username),
                    ('author_link', profile_href),
                    ('create_date', posted_on),
                    ('post', body),
                    ('tag', 'epilepsy'),
                    ('topic', topic_title),
                    ('url', page_url)):
                entry[field] = value
            collected.append(entry)
        return collected
예제 #15
0
    def parsePostsList(self, response):
        """Return an item for every div whose id contains ``post-``.

        The topic is the fourth text node of ``div.breadcrumb``.  When a post
        has neither an author name nor an author link, both become
        ``u"anon"``.  ``IndexError`` propagates if the breadcrumb has fewer
        than four text nodes, if only one of name/link is present, or if the
        posted-on text is missing.
        """
        crumbs = response.xpath('//div[@class="breadcrumb"]/text()').extract()
        topic_title = crumbs[3]
        page_url = response.url

        collected = []
        for node in response.xpath('//div[contains(@id, "post-")]'):
            entry = PostItemsList()
            names = node.xpath(
                './/div[@class="author-pane-line author-name"]/a/text()'
            ).extract()
            hrefs = node.xpath(
                './/div[@class="author-pane-line author-name"]/a/@href'
            ).extract()

            # Fall back to a placeholder only when both lookups came up empty.
            if not names and not hrefs:
                names, hrefs = [u"anon"], [u"anon"]

            username, profile_href = names[0], hrefs[0]
            posted_on = node.xpath(
                './/div[@class="forum-posted-on"]/text()'
            ).extract()[0].strip()
            text_bits = node.xpath(
                './/div[@class="forum-post-content"]//text()').extract()
            body = cleanText(" ".join(text_bits))

            for field, value in (
                    ('author', username),
                    ('author_link', profile_href),
                    ('create_date', posted_on),
                    ('post', body),
                    ('tag', 'epilepsy'),
                    ('topic', topic_title),
                    ('url', page_url)):
                entry[field] = value

            collected.append(entry)
        return collected
예제 #16
0
 def parse(self,response):
     """Return one ``PostItemsList`` per ``Table.PostBox`` element.

     The author and profile link come from the second ``<a>`` inside
     ``.msgUser``.  The post body is the ``.PostMessageBody`` HTML converted
     to text with BeautifulSoup.  In both the date and the body every run of
     spaces and every newline, carriage return, tab, NUL, vertical tab or
     non-breaking space is replaced by a single space before stripping.

     ``create_date`` is read from the ``td.msgThreadInfo`` cell inside the
     current post.  Querying ``response`` instead always returned the first
     cell on the page, so every item carried the first post's date.

     Raises:
         IndexError: if the page title or one of a post's cells is missing.
     """
     logging.info(response)
     sel = Selector(response)
     posts = sel.css("Table.PostBox")
     items = []
     topic = response.xpath('//div[contains(@id,"PageTitle")]/h1/text()').extract()[0]
     url = response.url
     # Compile once; the same pattern is applied twice per post.
     blanks = re.compile(" +|\n|\r|\t|\0|\x0b|\xa0")
     for post in posts:
         item = PostItemsList()
         user_anchor = post.css('.msgUser').xpath("./a[2]")
         item['author'] = user_anchor.xpath("text()").extract()[0]
         item['author_link'] = post.css('.msgUser').xpath("./a[2]/@href").extract()[0]
         raw_date = post.css('td.msgThreadInfo').xpath('text()').extract()[0]
         item['create_date'] = blanks.sub(' ', raw_date).strip()
         post_html = post.css('.PostMessageBody').extract()[0]
         soup = BeautifulSoup(post_html, 'html.parser')
         post_msg = blanks.sub(' ', soup.get_text()).strip()
         item['post'] = post_msg
         item['tag'] = 'epilepsy'
         item['topic'] = topic
         item['url'] = url
         logging.info(post_msg)
         items.append(item)
     return items
    def topic_parse(self, response):
        """Yield the posts of a topic page, then a request for the next page.

        First yields a dict ``{"items": [...]}`` holding one
        ``PostItemsList`` per ``table[id*=msg_tbl]``.  If the page has at
        least one ``a.paging`` link, the ``href`` of the last one is resolved
        against the response URL and requested with this method as callback
        and the cookie ``KomenForumApptimefilter=0``.

        The follow-up request targets the resolved next-page URL.  Requesting
        ``response.url`` again, as the code did before, never advances the
        pagination.

        Raises:
            IndexError: if the title, or a post's author, link or date, is
                missing.
        """
        items = []
        subject = response.xpath(
            '//div[@class="for_title"]/text()').extract()[0].strip()
        next_page = response.xpath(
            '//a[@class="paging"][last()]/@href').extract()
        posts = response.xpath('//table[contains(@id, "msg_tbl")]')
        url = response.url
        for post in posts:
            item = PostItemsList()
            author = post.xpath('.//a[@class="titlehead"]/text()').extract()[0]
            author_link = post.xpath(
                './/a[@class="titlehead"]/@href').extract()[0]
            author_link = response.urljoin(author_link)

            message = "".join(
                post.xpath('.//div[@class="msg"]//text()').extract())
            message = message.strip()

            create_date = post.xpath(
                './/span[contains(@id, "date")]/text()').extract()[0].strip()

            item['author'] = author
            item['author_link'] = author_link
            item['create_date'] = create_date
            item['post'] = message
            item['tag'] = 'epilepsy'
            item['topic'] = subject
            item['url'] = url

            items.append(item)
        yield {"items": items}

        if next_page:
            next_page_url = response.urljoin(next_page[0])
            yield scrapy.Request(next_page_url,
                                 callback=self.topic_parse,
                                 cookies={"KomenForumApptimefilter": "0"})
예제 #18
0
 def parsePostsList(self, response):
     """Return one ``PostItemsList`` per ``usergroup<digit>`` author link.

     The n-th author link is paired with the n-th ``span`` whose id matches
     ``posted_date_.*`` and the n-th ``span`` whose id matches
     ``post_message.*``.  ``topic`` is stored as the list returned by
     ``extract()`` on the ``<b>`` element selected by the XPath below.

     Raises:
         IndexError: if there are fewer date or message spans than authors.
     """
     soup = BeautifulSoup(response.body)
     # Query the document once per kind of element instead of re-scanning
     # (and re-compiling the patterns) on every loop iteration.
     users = soup.findAll('a', {'class': re.compile(r'usergroup\d.*')})
     dates = soup.findAll('span', {'id': re.compile(r'posted_date_.*')})
     messages = soup.findAll('span', {'id': re.compile(r'post_message.*')})
     topic = response.xpath(
         '//tbody/tr[2]/td[2]/table/tbody/tr[1]/td/div/b').extract()
     url = response.url

     items = []
     for x, user in enumerate(users):
         item = PostItemsList()
         item['author'] = user.text
         item['author_link'] = user['href']
         item['create_date'] = dates[x].text
         item['post'] = messages[x].text
         item['tag'] = 'cancer'
         item['topic'] = topic
         item['url'] = url
         # Log the item itself; ``item.__str__`` without the call only
         # logged the bound-method object.
         logging.info("%s", item)
         items.append(item)
     return items
 def parsePostsList(self, response):
     """Return one ``PostItemsList`` per ``disc-forums disc-thread`` div.

     ``author``, ``author_link``, ``post`` and ``topic`` are stored as lists
     (the results of ``extract()`` / ``re()``); ``create_date`` is the
     ``datetime`` attribute of the first ``<time>`` found in the post.

     All per-post queries are relative to ``post``.  An XPath starting with
     ``//`` is evaluated against the whole document even when called on a
     sub-selector, which made every item carry the data of all posts on the
     page.  The ``a[parent::div/parent::div]`` form selects the same anchors
     as ``//div/div/a`` but restricted to descendants of the post.

     Raises:
         IndexError: if a post has no ``post-meta`` time element.
     """
     sel = Selector(response)
     posts = sel.xpath('//div[contains(@class,"disc-forums disc-thread")]')
     items = []
     topic = response.xpath("id('topic')/article/h1/text()").extract()
     url = response.url
     for post in posts:
         item = PostItemsList()
         item['author'] = post.xpath(
             './/a[parent::div/parent::div]/p/strong[2]/text()').extract()
         item['author_link'] = post.xpath(
             './/a[parent::div/parent::div]/@href').re(
                 '/forums/profiles.*')
         item['create_date'] = post.xpath(
             './/span[contains(@class,"post-meta")]/time/@datetime'
         )[0].extract()
         item['post'] = post.xpath(
             './/div[contains(@class,"post-content break-word")]/p[1]/text()'
         ).extract()
         item['tag'] = 'Breast Cancer and Screening'
         item['topic'] = topic
         item['url'] = url
         # Log the item itself; ``item.__str__`` without the call only
         # logged the bound-method object.
         logging.info("%s", item)
         items.append(item)
     return items
예제 #20
0
    def topic_parse(self, response):
        """Return one item per full-post container of a topic page.

        URLs containing ``'discussions'`` are skipped and yield ``None``.
        The author is the stripped second text node of the user-name link,
        the date is the text of the ``internal-link view-post`` anchor.
        ``IndexError`` propagates when the heading or one of those nodes is
        missing.
        """
        if 'discussions' in response.url:
            return None

        topic_title = response.xpath(
            '//div[@class="forum-stats-container"]/h1/text()'
        ).extract()[0].strip()
        page_url = response.url
        post_nodes = response.xpath(
            '//div[@class="full-post-container fiji-full-post-container evolution2-full-post-container"]')

        collected = []
        for node in post_nodes:
            entry = PostItemsList()
            username = node.xpath(
                './/span[@class="user-name"]/a/text()'
            ).extract()[1].strip()
            profile_href = node.xpath(
                './/span[@class="user-name"]/a/@href').extract()[0]
            posted_on = node.xpath(
                './/a[@class="internal-link view-post"]/text()'
            ).extract()[0]
            text_bits = node.xpath(
                './/div[@class="post-content user-defined-markup"]//text()'
            ).extract()
            body = ' '.join(text_bits).strip()

            for field, value in (
                    ('author', username),
                    ('author_link', profile_href),
                    ('create_date', posted_on),
                    ('post', body),
                    ('tag', 'epilepsy'),
                    ('topic', topic_title),
                    ('url', page_url)):
                entry[field] = value

            collected.append(entry)
        return collected
예제 #21
0
    def parse_item(self, response):
        """Return the original topic and each reply of a page as items.

        The first item comes from ``div.original-topic``; its ``create_date``
        is the raw list of text nodes under ``span.posted-time.left``.  One
        further item is produced for every ``div`` whose class is ``post`` or
        ``post secondary``.  Reply author links are made absolute with
        ``response.urljoin`` and reply messages go through ``cleanText``.

        Two defects of the earlier version are addressed:

        * every post gets its own ``PostItemsList``; re-using one instance
          made all list entries alias the data of the last reply;
        * the second alternative of the posts XPath now tests ``@class``;
          without the ``@`` it looked for a child element named ``class``
          and never matched.

        Raises:
            IndexError: if the breadcrumb, the original author, or a reply's
                author, link or date is missing.
        """
        def clean_date(date):
            # Third line of the first date fragment, stripped.
            # NOTE(review): this helper is not applied anywhere, so the
            # original post's date stays an unprocessed list - confirm
            # whether it should be used for ``original_create_date``.
            return date[0].split('\n')[2].strip()

        url = response.url
        subject = response.xpath(
            '//p[@id="crumbs"]/a[2]/text()').extract()[0].strip()

        def build_item(author, author_link, create_date, message):
            # Create a fresh item from one post's scraped fields.
            item = PostItemsList()
            item['author'] = author
            item['author_link'] = author_link
            item['create_date'] = create_date
            item['post'] = message
            item['tag'] = 'epilepsy'
            item['topic'] = subject
            item['url'] = url
            return item

        original_author = response.xpath(
            '//div[@class="original-topic"]//div[@class="user-post"]/p/strong/a/text()')\
            .extract()[0]
        original_author_link = response.xpath(
            '//div[@class="original-topic"]//div[@class="user-post"]/p/strong/a/@href')\
            .extract()[0]
        original_create_date = response.xpath(
            '//div[@class="original-topic"]//span[@class="posted-time left"]//text()'
        ).extract()
        original_message = "".join(
            response.xpath(
                '//div[@class="original-topic"]//div[@class="user-post"]//text()'
            ).extract()).strip()
        posts = response.xpath(
            '//div[@class="post"]|//div[@class="post secondary"]')

        items = [build_item(original_author, original_author_link,
                            original_create_date, original_message)]

        for post in posts:
            author = post.xpath(
                './/div[@class="user-info"]/a/text()').extract()[0]
            author_link = post.xpath(
                './/div[@class="user-info"]/a/@href').extract()[0]
            author_link = response.urljoin(author_link)
            create_date = post.xpath(
                './/p[@class="post-time"]/strong/text()').extract()[0]
            message = post.xpath(
                './/div[@class="user-post"]//p[not(@class="post-time")]//text()'
            ).extract()
            message = cleanText("".join(message).strip())

            items.append(
                build_item(author, author_link, create_date, message))

        return items