def parse_item(self, response):
    """Build one ``PostItemsList`` per forum post found on a thread page.

    Args:
        response: Response of the crawled page. It is only parsed when its
            URL contains ``"threads"``.

    Returns:
        A list of populated ``PostItemsList`` items for a thread page
        (possibly empty), or ``None`` for any other page.
    """
    # Non-thread pages are filtered here instead of in the crawl rules
    # (original note: this keeps the spider down to two rules).
    if "threads" not in response.url:
        return None

    def first_text(selector, query):
        """Return the first match of *query* under *selector*, or ""."""
        matches = selector.xpath(query).extract()
        return matches[0] if matches else u""

    url = response.url
    subject = first_text(response, '//span[@class="threadtitle"]//text()')

    items = []
    for post in response.xpath('//ol[@id="posts"]/li'):
        # A missing username anchor must not abort the whole thread, so
        # absent nodes degrade to empty strings instead of raising.
        author = first_text(post, './/a[contains(@class, "username")]//text()')
        author_link = first_text(post, './/a[contains(@class, "username")]/@href')

        date_parts = post.xpath('.//span[@class="date"]//text()').extract()
        create_date = u" ".join(part.strip() for part in date_parts)

        message_parts = post.xpath('.//div[@class="content"]//text()').extract()
        message = u"".join(part.strip() for part in message_parts)

        # A list entry with neither an author nor any text carries nothing
        # worth storing.
        if not author and not message:
            continue

        item = PostItemsList()
        item["author"] = author
        item["author_link"] = author_link
        item["create_date"] = create_date
        item["post"] = message
        item["tag"] = "epilepsy"
        item["topic"] = subject
        item["url"] = url
        logging.info("%s", item)
        items.append(item)
    return items
def get_sub_data(self, response):
    """Yield one ``PostItemsList`` built from a discussion-topic page.

    The author name, author link, posting date, post text and topic title
    are read from the page. If any of them cannot be found, a warning is
    logged and nothing is yielded.

    Args:
        response: Response of the discussion page to parse.

    Yields:
        A single populated ``PostItemsList`` item.
    """
    logging.info("get_sub_data")

    def first(query):
        """Return the first node matched by *query*, or None."""
        matches = response.xpath(query).extract()
        return matches[0] if matches else None

    author_name = first(
        "//table[@class='discussion_topic']//p[@class='username']/a/text()")
    author_link = first(
        "//table[@class='discussion_topic']//p[@class='username']/a/@href")
    author_posted = first(
        "//table[@class='discussion_topic']//div/span[@class='graytext']/text()")
    # NOTE(review): only the first text node of the post body is used; any
    # later text nodes of the same div are ignored. Confirm this is intended.
    author_all_text = first(
        "//table[@class='discussion_topic']"
        "//div[@class='discussion_text longtextfix485']/text()")
    topic = first(
        "//div[contains(@class,'discussion_topic_header_subject')]/text()")

    found = (author_name, author_link, author_posted, author_all_text, topic)
    if any(value is None for value in found):
        # Report the page explicitly instead of failing with a bare
        # IndexError on the first missing node.
        logging.warning(
            "get_sub_data: expected topic markup missing on %s", response.url)
        return

    author_name = str(author_name).replace("\t", "").replace(',', ' ')
    # The extracted href is appended to the site root.
    author_link = "http://www.dailystrength.org%s" % author_link
    author_posted = author_posted.replace(',', '').replace('Posted on', '')

    author_all_text = str(author_all_text)
    author_all_text = author_all_text.replace(',', '')
    author_all_text = author_all_text.replace('\t', '')
    # NOTE(review): this removes every single space from the post text.
    # Confirm that is intended (the literal may originally have been a
    # wider run of whitespace).
    author_all_text = author_all_text.replace(' ', '')
    author_all_text = author_all_text.replace('\n', '')

    item = PostItemsList()
    item['author'] = author_name
    item['author_link'] = author_link
    item['condition'] = "chronic lymphocytic leukemia"
    item['create_date'] = author_posted
    item['post'] = author_all_text
    item['topic'] = topic
    item['url'] = response.url
    logging.info("%s", item)
    yield item
def parse_item(self, response):
    """Build one ``PostItemsList`` per forum post found on a thread page.

    Each item's post text is passed through ``self.cleanText`` and its
    date string through ``self.getDate``. The domain field is the
    concatenation of ``self.allowed_domains``.

    Args:
        response: Response of the crawled page. It is only parsed when its
            URL contains ``"threads"``.

    Returns:
        A list of populated ``PostItemsList`` items for a thread page
        (possibly empty), or ``None`` for any other page.
    """
    # Non-thread pages are filtered here instead of in the crawl rules
    # (original note: this keeps the spider down to two rules).
    if "threads" not in response.url:
        return None

    def first_text(selector, query):
        """Return the first match of *query* under *selector*, or ""."""
        matches = selector.xpath(query).extract()
        return matches[0] if matches else u""

    condition = "breast cancer"
    url = response.url
    subject = first_text(response, '//span[@class="threadtitle"]//text()')

    items = []
    for post in response.xpath('//ol[@id="posts"]/li'):
        # A missing username anchor must not abort the whole thread, so
        # absent nodes degrade to empty strings instead of raising.
        author = first_text(post, './/a[contains(@class, "username")]//text()')
        author_link = first_text(post, './/a[contains(@class, "username")]/@href')

        date_parts = post.xpath('.//span[@class="date"]//text()').extract()
        create_date = u" ".join(part.strip() for part in date_parts)

        message_parts = post.xpath('.//div[@class="content"]//text()').extract()
        message = u"".join(part.strip() for part in message_parts)

        # A list entry with neither an author nor any text carries nothing
        # worth storing.
        if not author and not message:
            continue

        message = self.cleanText(message)

        item = PostItemsList()
        item['author'] = author
        item['author_link'] = author_link
        item['condition'] = condition
        item['create_date'] = self.getDate(create_date)
        item['domain'] = "".join(self.allowed_domains)
        item['post'] = message
        item['topic'] = subject
        item['url'] = url
        logging.info("%s", item)
        items.append(item)
    return items
def parse_item(self, response):
    """Build one ``PostItemsList`` per forum post found on a thread page.

    Args:
        response: Response of the crawled page. It is only parsed when its
            URL contains ``"threads"``.

    Returns:
        A list of populated ``PostItemsList`` items for a thread page
        (possibly empty), or ``None`` for any other page.
    """
    # Non-thread pages are filtered here instead of in the crawl rules
    # (original note: this keeps the spider down to two rules).
    if "threads" not in response.url:
        return None

    def first_text(selector, query):
        """Return the first match of *query* under *selector*, or ""."""
        matches = selector.xpath(query).extract()
        return matches[0] if matches else u""

    url = response.url
    subject = first_text(response, '//span[@class="threadtitle"]//text()')

    items = []
    for post in response.xpath('//ol[@id="posts"]/li'):
        # A missing username anchor must not abort the whole thread, so
        # absent nodes degrade to empty strings instead of raising.
        author = first_text(post, './/a[contains(@class, "username")]//text()')
        author_link = first_text(post, './/a[contains(@class, "username")]/@href')

        date_parts = post.xpath('.//span[@class="date"]//text()').extract()
        create_date = u" ".join(part.strip() for part in date_parts)

        message_parts = post.xpath('.//div[@class="content"]//text()').extract()
        message = u"".join(part.strip() for part in message_parts)

        # A list entry with neither an author nor any text carries nothing
        # worth storing.
        if not author and not message:
            continue

        item = PostItemsList()
        item['author'] = author
        item['author_link'] = author_link
        item['create_date'] = create_date
        item['post'] = message
        item['tag'] = 'epilepsy'
        item['topic'] = subject
        item['url'] = url
        logging.info("%s", item)
        items.append(item)
    return items