def parsePostsList(self, response):
    """Scrape the opening post and every comment from a thread page.

    Args:
        response: Page response; only ``response.url`` and
            ``response.xpath`` are used.

    Returns:
        list: One ``PostItemsList`` for the opening post, followed by one
        per ``<article>`` element whose class contains "comment".

    Raises:
        IndexError: If the title, byline, date, or a comment's author or
            date node is missing, because the first match is always taken.
    """
    url = response.url
    subject = response.xpath(
        '//h1[@class="node-title"]/text()').extract()[0]

    def build_item(author, create_date, message):
        # Every post gets its own item object.  Filling a single shared
        # instance inside the loop would leave the returned list holding
        # many references to one item carrying the last comment's values.
        item = PostItemsList()
        item['author'] = author
        # The author link is always stored as the literal "no" here.
        item['author_link'] = "no"
        item['create_date'] = create_date
        item['post'] = cleanText(message)
        item['tag'] = 'epilepsy'
        item['topic'] = subject
        item['url'] = url
        return item

    original_author = response.xpath(
        '//div[@class="node-byline"]/text()').extract()[0]
    original_create_date = response.xpath(
        '//span[@class="node-date"]/span/text()').extract()[0]
    original_message = " ".join(response.xpath(
        '//div[@class="field-item even"]//text()').extract())
    items = [
        build_item(original_author, original_create_date, original_message)
    ]

    for post in response.xpath('//article[contains(@class, "comment")]'):
        author = post.xpath(
            './/div[@class="node-author"]/span/text()').extract()[0]
        create_date = post.xpath(
            './/div[@class="node-date"]/time/text()').extract()[0]
        message = " ".join(post.xpath(
            './/div[@class="field-item even"]//text()').extract())
        items.append(build_item(author, create_date, message))
    return items
def parsePostsList(self, response):
    """Build one PostItemsList per reply block found on the page.

    Args:
        response: Page response; ``response.url``, ``response.xpath`` and
            ``response.urljoin`` are used.

    Returns:
        list: One ``PostItemsList`` per ``post_data has_bg_color`` block,
        skipping the first matched block.

    Raises:
        IndexError: If the URL has fewer than six "/"-separated pieces or
            a block lacks the author or date nodes that are indexed.
    """
    page_url = response.url
    # The [1:] slice drops the first matching block.
    blocks = response.xpath('//div[@class="post_data has_bg_color"]')[1:]
    # Topic comes from the sixth "/"-separated URL piece, dashes -> spaces.
    topic = response.url.split("/")[5].replace("-", " ")

    collected = []
    for block in blocks:
        record = PostItemsList()

        user_anchor = './/a[contains(@id, "user_")]'
        poster = block.xpath(user_anchor + '/text()').extract()[0]
        poster_href = block.xpath(user_anchor + '/@href').extract()[0]
        poster_url = response.urljoin(poster_href)

        # The date is the second-to-last text node inside the float_fix div.
        date_texts = block.xpath(
            './/div[@class="user_info user_info_comment"]'
            '/div[@class="float_fix"]//text()').extract()
        posted_on = date_texts[-2].strip()

        body_texts = block.xpath('.//div[@class="KonaBody"]//text()').extract()
        body = cleanText(" ".join(body_texts))

        for field, value in (
            ('author', poster),
            ('author_link', poster_url),
            ('create_date', posted_on),
            ('post', body),
            ('tag', 'epilepsy'),
            ('topic', topic),
            ('url', page_url),
        ):
            record[field] = value
        collected.append(record)
    return collected
def parsePostsList(self, response):
    """Convert every ``post bg1`` / ``post bg2`` block into a PostItemsList.

    Args:
        response: Page response; ``response.url`` and ``response.xpath``
            are used.

    Returns:
        list: One ``PostItemsList`` per matched post block.

    Raises:
        IndexError: If the page heading is missing, or a post lacks the
            author anchor or a second text node in its author paragraph.
    """
    post_nodes = response.xpath(
        '//div[@class="post bg2"] | //div[@class="post bg1"]')
    page_url = response.url
    topic = response.xpath(
        '//div[@id="page-body"]/h2/a/text()').extract()[0]

    def to_item(node):
        byline = './/p[@class="author"]'
        poster = node.xpath(byline + '/strong/a/text()').extract()[0]
        poster_link = node.xpath(byline + '/strong/a/@href').extract()[0]
        # The date is the second direct text node of the author paragraph;
        # the trailing " \xbb" marker is removed from it.
        posted_on = node.xpath(byline + '/text()').extract()[1]
        posted_on = posted_on.replace(u" \xbb", u"")
        body = cleanText(
            " ".join(node.xpath('.//div[@class="content"]//text()').extract()))

        record = PostItemsList()
        for field, value in (
            ("author", poster),
            ("author_link", poster_link),
            ("create_date", posted_on),
            ("post", body),
            ("tag", "epilepsy"),
            ("topic", topic),
            ("url", page_url),
        ):
            record[field] = value
        return record

    return [to_item(node) for node in post_nodes]
def parsePostsList(self, response):
    """Extract author, date and text from each post block on the page.

    Args:
        response: Page response; ``response.url`` and ``response.xpath``
            are used.

    Returns:
        list: One ``PostItemsList`` per ``post bg1`` / ``post bg2`` div.

    Raises:
        IndexError: If the page heading is missing, or a post lacks the
            author anchor or a second text node in its author paragraph.
    """
    matched = response.xpath(
        '//div[@class="post bg2"] | //div[@class="post bg1"]')
    source_url = response.url
    heading = response.xpath(
        '//div[@id="page-body"]/h2/a/text()').extract()[0]

    results = []
    for entry in matched:
        name = entry.xpath(
            './/p[@class="author"]/strong/a/text()').extract()[0]
        profile = entry.xpath(
            './/p[@class="author"]/strong/a/@href').extract()[0]
        # Second direct text node of the author paragraph holds the date;
        # the " \xbb" marker is removed from it.
        when = entry.xpath(
            './/p[@class="author"]/text()').extract()[1].replace(
                u" \xbb", u"")
        fragments = entry.xpath('.//div[@class="content"]//text()').extract()
        text = cleanText(" ".join(fragments))

        row = PostItemsList()
        row['author'] = name
        row['author_link'] = profile
        row['create_date'] = when
        row['post'] = text
        row['tag'] = 'epilepsy'
        row['topic'] = heading
        row['url'] = source_url
        results.append(row)
    return results
def parsePostsList(self, response):
    """Turn each matching post container on the page into a PostItemsList.

    Args:
        response: Page response; ``response.url``, ``response.xpath`` and
            ``response.urljoin`` are used.

    Returns:
        list: One ``PostItemsList`` per
        ``postbit postbitim postcontainer old`` list element.

    Raises:
        IndexError: If a post has no username anchor.
    """
    page_url = response.url
    # NOTE(review): no [0] is applied, so the topic stored on every item is
    # the whole list of matched strings, not a single string -- confirm
    # that this is intended.
    topic = response.xpath(
        '//li[@class="navbit lastnavbit"]/span/text()').extract()
    containers = response.xpath(
        '//li[@class="postbit postbitim postcontainer old"]')

    collected = []
    for container in containers:
        record = PostItemsList()

        poster = container.xpath(
            './/a[contains(@class, "username")]/strong/text()').extract()[0]
        relative_link = container.xpath(
            './/a[contains(@class, "username")]/@href').extract()[0]
        poster_url = response.urljoin(relative_link)

        date_parts = container.xpath(
            './/span[@class="date"]//text()').extract()
        posted_on = " ".join(date_parts)

        body_parts = container.xpath(
            './/div[contains(@id, "post_message_")]//text()').extract()
        body = cleanText(" ".join(body_parts))

        for field, value in (
            ('author', poster),
            ('author_link', poster_url),
            ('create_date', posted_on),
            ('post', body),
            ('tag', 'epilepsy'),
            ('topic', topic),
            ('url', page_url),
        ):
            record[field] = value
        collected.append(record)
    return collected
def parsePostsList(self, response):
    """Scrape the opening post and every comment from a thread page.

    Args:
        response: Page response; only ``response.url`` and
            ``response.xpath`` are used.

    Returns:
        list: One ``PostItemsList`` for the opening post, followed by one
        per ``<article>`` element whose class contains "comment".

    Raises:
        IndexError: If the title, byline, date, or a comment's author or
            date node is missing, because the first match is always taken.
    """
    url = response.url
    subject = response.xpath(
        '//h1[@class="node-title"]/text()').extract()[0]

    def new_item(author, create_date, message):
        # Allocate a new item for each post.  Previously the item created
        # for the opening post was overwritten on every loop pass, so all
        # list entries pointed at one object with the last comment's data.
        item = PostItemsList()
        item['author'] = author
        # The author link is always stored as the literal "no" here.
        item['author_link'] = "no"
        item['create_date'] = create_date
        item['post'] = cleanText(message)
        item['tag'] = 'epilepsy'
        item['topic'] = subject
        item['url'] = url
        return item

    items = [new_item(
        response.xpath(
            '//div[@class="node-byline"]/text()').extract()[0],
        response.xpath(
            '//span[@class="node-date"]/span/text()').extract()[0],
        " ".join(response.xpath(
            '//div[@class="field-item even"]//text()').extract()),
    )]

    for post in response.xpath('//article[contains(@class, "comment")]'):
        items.append(new_item(
            post.xpath(
                './/div[@class="node-author"]/span/text()').extract()[0],
            post.xpath(
                './/div[@class="node-date"]/time/text()').extract()[0],
            " ".join(post.xpath(
                './/div[@class="field-item even"]//text()').extract()),
        ))
    return items
def parse_item(self, response):
    """Scrape the original topic post and all replies from a thread page.

    Args:
        response: Page response; ``response.url``, ``response.xpath`` and
            ``response.urljoin`` are used.

    Returns:
        list: One ``PostItemsList`` for the original topic followed by one
        per reply ``div`` with class ``post`` or ``post secondary``.

    Raises:
        IndexError: If the breadcrumb, the original author anchor, or a
            reply's author or post-time node is missing.
    """
    url = response.url
    subject = response.xpath(
        '//p[@id="crumbs"]/a[2]/text()').extract()[0].strip()

    def build_item(author, author_link, create_date, message):
        # One new item per post: reusing a single instance would make every
        # list entry alias the same object holding the last reply's values.
        item = PostItemsList()
        item["author"] = author
        item["author_link"] = author_link
        item["create_date"] = create_date
        item["post"] = message
        item["tag"] = "epilepsy"
        item["topic"] = subject
        item["url"] = url
        return item

    original_author = response.xpath(
        '//div[@class="original-topic"]//div[@class="user-post"]'
        '/p/strong/a/text()').extract()[0]
    original_author_link = response.xpath(
        '//div[@class="original-topic"]//div[@class="user-post"]'
        '/p/strong/a/@href').extract()[0]
    # NOTE(review): the original post's date is stored as the raw list of
    # text nodes, and its message is only stripped, not passed through
    # cleanText, unlike the replies below -- confirm whether downstream
    # code expects that.
    original_create_date = response.xpath(
        '//div[@class="original-topic"]//span[@class="posted-time left"]'
        '//text()').extract()
    original_message = "".join(response.xpath(
        '//div[@class="original-topic"]//div[@class="user-post"]//text()'
    ).extract()).strip()

    items = [build_item(original_author, original_author_link,
                        original_create_date, original_message)]

    # The second branch must test the @class attribute; without "@" the
    # predicate looks for a child element named "class" and never matches.
    posts = response.xpath(
        '//div[@class="post"]|//div[@class="post secondary"]')
    for post in posts:
        author = post.xpath(
            './/div[@class="user-info"]/a/text()').extract()[0]
        author_link = response.urljoin(post.xpath(
            './/div[@class="user-info"]/a/@href').extract()[0])
        create_date = post.xpath(
            './/p[@class="post-time"]/strong/text()').extract()[0]
        message = "".join(post.xpath(
            './/div[@class="user-post"]//p[not(@class="post-time")]//text()'
        ).extract()).strip()
        items.append(
            build_item(author, author_link, create_date, cleanText(message)))
    return items
def parsePostsList(self, response):
    """Build one PostItemsList per forum post found on the page.

    Args:
        response: Page response; ``response.url`` and ``response.xpath``
            are used.

    Returns:
        list: One ``PostItemsList`` per ``div`` whose id contains "post-".
        A missing author name or author link is stored as ``u"anon"``.

    Raises:
        IndexError: If the breadcrumb has fewer than four text nodes or a
            post has no ``forum-posted-on`` text.
    """
    # The topic is the fourth direct text node of the breadcrumb div.
    subject = response.xpath(
        '//div[@class="breadcrumb"]/text()').extract()[3]
    url = response.url

    items = []
    for post in response.xpath('//div[contains(@id, "post-")]'):
        author = post.xpath(
            './/div[@class="author-pane-line author-name"]/a/text()'
        ).extract()
        author_link = post.xpath(
            './/div[@class="author-pane-line author-name"]/a/@href'
        ).extract()
        create_date = post.xpath(
            './/div[@class="forum-posted-on"]/text()').extract()[0].strip()
        message = " ".join(post.xpath(
            './/div[@class="forum-post-content"]//text()').extract())

        item = PostItemsList()
        # Fall back independently: the old combined check only applied the
        # default when BOTH were missing, so a post with just one of them
        # absent raised IndexError on the [0] lookup.
        item['author'] = author[0] if author else u"anon"
        item['author_link'] = author_link[0] if author_link else u"anon"
        item['create_date'] = create_date
        item['post'] = cleanText(message)
        item['tag'] = 'epilepsy'
        item['topic'] = subject
        item['url'] = url
        items.append(item)
    return items
def parsePostsList(self, response):
    """Collect every forum post on the page as a PostItemsList.

    Args:
        response: Page response; ``response.url`` and ``response.xpath``
            are used.

    Returns:
        list: One ``PostItemsList`` per ``div`` whose id contains "post-".
        A missing author name or author link is stored as ``u"anon"``.

    Raises:
        IndexError: If the breadcrumb has fewer than four text nodes or a
            post has no ``forum-posted-on`` text.
    """
    breadcrumb_texts = response.xpath(
        '//div[@class="breadcrumb"]/text()').extract()
    # The topic is the fourth direct text node of the breadcrumb div.
    subject = breadcrumb_texts[3]
    url = response.url

    def first_or_anon(values):
        # Each field gets its own fallback.  The previous check required
        # both lists to be empty, so a post missing only one of them
        # crashed with IndexError when [0] was taken.
        return values[0] if values else u"anon"

    items = []
    for post in response.xpath('//div[contains(@id, "post-")]'):
        author = first_or_anon(post.xpath(
            './/div[@class="author-pane-line author-name"]/a/text()'
        ).extract())
        author_link = first_or_anon(post.xpath(
            './/div[@class="author-pane-line author-name"]/a/@href'
        ).extract())
        create_date = post.xpath(
            './/div[@class="forum-posted-on"]/text()').extract()[0].strip()
        message = cleanText(" ".join(post.xpath(
            './/div[@class="forum-post-content"]//text()').extract()))

        item = PostItemsList()
        item['author'] = author
        item['author_link'] = author_link
        item['create_date'] = create_date
        item['post'] = message
        item['tag'] = 'epilepsy'
        item['topic'] = subject
        item['url'] = url
        items.append(item)
    return items
def parse_item(self, response):
    """Scrape the original topic post and all replies from a thread page.

    Args:
        response: Page response; ``response.url``, ``response.xpath`` and
            ``response.urljoin`` are used.

    Returns:
        list: One ``PostItemsList`` for the original topic followed by one
        per reply ``div`` with class ``post`` or ``post secondary``.

    Raises:
        IndexError: If the breadcrumb, the original author anchor, or a
            reply's author or post-time node is missing.
    """
    url = response.url
    subject = response.xpath(
        '//p[@id="crumbs"]/a[2]/text()').extract()[0].strip()

    def make_item(author, author_link, create_date, message):
        # Allocate a new item for every post.  Mutating the original
        # topic's item inside the reply loop made all list entries point
        # at one object that ended up with the last reply's values.
        item = PostItemsList()
        item['author'] = author
        item['author_link'] = author_link
        item['create_date'] = create_date
        item['post'] = message
        item['tag'] = 'epilepsy'
        item['topic'] = subject
        item['url'] = url
        return item

    # NOTE(review): for the original post the date is kept as the raw list
    # of text nodes and the message is only stripped (no cleanText), unlike
    # the replies -- confirm whether downstream code expects that.
    items = [make_item(
        response.xpath(
            '//div[@class="original-topic"]//div[@class="user-post"]'
            '/p/strong/a/text()').extract()[0],
        response.xpath(
            '//div[@class="original-topic"]//div[@class="user-post"]'
            '/p/strong/a/@href').extract()[0],
        response.xpath(
            '//div[@class="original-topic"]'
            '//span[@class="posted-time left"]//text()').extract(),
        "".join(response.xpath(
            '//div[@class="original-topic"]//div[@class="user-post"]'
            '//text()').extract()).strip(),
    )]

    # "@class" is required in the second branch: a bare "class" is read as
    # a child element name, so that branch could never select anything.
    replies = response.xpath(
        '//div[@class="post"]|//div[@class="post secondary"]')
    for post in replies:
        author = post.xpath(
            './/div[@class="user-info"]/a/text()').extract()[0]
        author_link = post.xpath(
            './/div[@class="user-info"]/a/@href').extract()[0]
        author_link = response.urljoin(author_link)
        create_date = post.xpath(
            './/p[@class="post-time"]/strong/text()').extract()[0]
        message = post.xpath(
            './/div[@class="user-post"]//p[not(@class="post-time")]//text()'
        ).extract()
        message = cleanText("".join(message).strip())
        items.append(make_item(author, author_link, create_date, message))
    return items