def parse_item(self, response):
    """Extract every post from a vBulletin thread page.

    Non-thread URLs yield nothing — the crawl rules are broad, so the
    "threads" check filters here.
    """
    if "threads" not in response.url:
        return
    items = []
    url = response.url
    subject = response.xpath('//span[@class="threadtitle"]//text()').extract()[0]
    for post in response.xpath('//ol[@id="posts"]/li'):
        item = PostItemsList()
        item["author"] = post.xpath(
            './/a[contains(@class, "username")]//text()').extract()[0]
        item["author_link"] = post.xpath(
            './/a[contains(@class, "username")]/@href').extract()[0]
        # Date is scattered across text nodes; strip each and re-join.
        raw_date = post.xpath('.//span[@class="date"]//text()').extract()
        item["create_date"] = u" ".join(part.strip() for part in raw_date)
        raw_body = post.xpath('.//div[@class="content"]//text()').extract()
        item["post"] = u"".join(part.strip() for part in raw_body)
        item["tag"] = "epilepsy"
        item["topic"] = subject
        item["url"] = url
        logging.info(item.__str__())
        items.append(item)
    return items
def parsePostsList(self, response):
    """Collect one item per post on a phpBB-style topic page."""
    url = response.url
    subject = response.xpath('//div[@id="page-body"]/h2/a/text()').extract()[0]
    collected = []
    for post in response.xpath(
            '//div[@class="post bg2"] | //div[@class="post bg1"]'):
        entry = PostItemsList()
        entry['author'] = post.xpath(
            './/p[@class="author"]/strong/a/text()').extract()[0]
        entry['author_link'] = post.xpath(
            './/p[@class="author"]/strong/a/@href').extract()[0]
        # Second text node of the author line holds the date; drop the "»".
        stamp = post.xpath('.//p[@class="author"]/text()').extract()[1]
        entry['create_date'] = stamp.replace(u" \xbb", u"")
        body = " ".join(post.xpath('.//div[@class="content"]//text()').extract())
        entry['post'] = cleanText(body)
        entry['tag'] = 'epilepsy'
        entry['topic'] = subject
        entry['url'] = url
        collected.append(entry)
    return collected
def parsePostsList(self, response):
    """Build one item per username anchor, pairing by index with post bodies.

    Fixes:
    * ``soup.findAll`` was re-run for every field on every iteration,
      re-scanning the whole document; each collection is now queried once.
    * ``logging.info(item.__str__)`` logged the bound method object, not
      the item text; it now logs ``str(item)``.
    """
    soup = BeautifulSoup(response.body)
    users = soup.findAll('a', {'class': 'username'})
    contents = soup.findAll('div', {'class': 'post-content-inner'})
    topic = response.xpath('//h1/text()').extract()
    url = response.url
    items = []
    for user, content in zip(users, contents):
        item = PostItemsList()
        item['author'] = user.text
        item['author_link'] = user['href']
        # First 11 chars of the span text hold the date portion.
        item['create_date'] = content.span.text[0:11]
        item['post'] = content.find('div', {'class': 'field-item even'}).text
        item['tag'] = 'cancer'
        item['topic'] = topic
        item['url'] = url
        logging.info(str(item))
        items.append(item)
    return items
def parsePostsList(self, response):
    """Build an item per vBulletin post container on a thread page."""
    url = response.url
    subject = response.xpath(
        '//li[@class="navbit lastnavbit"]/span/text()').extract()
    results = []
    for post in response.xpath(
            '//li[@class="postbit postbitim postcontainer old"]'):
        entry = PostItemsList()
        entry['author'] = post.xpath(
            './/a[contains(@class, "username")]/strong/text()').extract()[0]
        rel_link = post.xpath(
            './/a[contains(@class, "username")]/@href').extract()[0]
        entry['author_link'] = response.urljoin(rel_link)
        entry['create_date'] = " ".join(
            post.xpath('.//span[@class="date"]//text()').extract())
        body = " ".join(
            post.xpath('.//div[contains(@id, "post_message_")]//text()').extract())
        entry['post'] = cleanText(body)
        entry['tag'] = 'epilepsy'
        entry['topic'] = subject
        entry['url'] = url
        results.append(entry)
    return results
def topic_parse(self, response):
    """Scrape author/date/message from each post table of a forum topic."""
    print(response.url)
    url = response.url
    subject = response.xpath(
        '//div[@class="navbar"]/strong/text()').extract()[0].strip()
    results = []
    for post in response.xpath('//table[contains(@id, "post")]'):
        entry = PostItemsList()
        entry['author'] = post.xpath(
            './/div[contains(@id, "postmenu")]/text()').extract()[0].strip()
        entry['author_link'] = "*"  # this board exposes no profile links here
        entry['create_date'] = post.xpath(
            './/td[@class="thead"]//text()').extract()[1].strip()
        entry['post'] = ''.join(post.xpath(
            './/div[contains(@id, "post_message_")]//text()').extract()).strip()
        entry['tag'] = 'epilepsy'
        entry['topic'] = subject
        entry['url'] = url
        results.append(entry)
    return results
def topic_parse(self, response):
    """Parse every other table row (the post rows) of a forum topic page."""
    url = response.url
    # Page <title> is "<subject> | <site>"; keep only the subject part.
    subject = response.xpath('//title/text()').extract()[0].split('|')[0]
    rows = response.xpath('//table//tr')[1:-1:2]  # skip header/footer rows
    results = []
    author_anchor = './/td[@class="xar-norm author"]/*/a'
    for row in rows:
        entry = PostItemsList()
        entry['author'] = row.xpath(author_anchor + '/text()').extract()[0]
        entry['author_link'] = row.xpath(author_anchor + '/@href').extract()[0]
        entry['create_date'] = row.xpath(
            './/span[@class="xar-sub"][contains(text(), "Posted")]/text()'
        ).extract()[0]
        entry['post'] = ''.join(row.xpath('.//div[2]/p//text()').extract())
        entry['tag'] = 'epilepsy'
        entry['topic'] = subject
        entry['url'] = url
        results.append(entry)
    return results
def parse_item(self, response):
    """Return the opening node post followed by its comments as items."""
    url = response.url
    subject = response.xpath('//div[@class="left-corner"]/h2/text()').extract()[0]

    def build(author, created, body):
        # All posts on this board share the same static fields.
        entry = PostItemsList()
        entry['author'] = author
        entry['author_link'] = '*'  # no profile links captured on this board
        entry['create_date'] = created
        entry['post'] = body
        entry['tag'] = 'epilepsy'
        entry['topic'] = subject
        entry['url'] = url
        return entry

    node = response.xpath('//table[@class="node node-forum"]')
    items = [build(
        node.xpath('.//div[@class="author"]/text()').extract()[0],
        u''.join(node.xpath('.//div[@class="date"]//text()').extract()).strip(),
        u''.join(node.xpath('.//div[@class="content"]//text()').extract()),
    )]
    for comment in response.xpath('//table[@class="comment comment-forum"]'):
        items.append(build(
            comment.xpath('.//div[@class="author"]/text()').extract()[0],
            comment.xpath('.//div[@class="date"]//text()').extract()[0],
            u''.join(
                comment.xpath('.//div[@class="content"]//text()').extract()
            ).strip(),
        ))
    return items
def get_sub_data(self,response):
    """Yield one PostItemsList built from a dailystrength.org topic page.

    Pulls the author name/link, posted date, and body text out of the
    ``discussion_topic`` table, normalises the strings, and yields the
    populated item.
    """
    logging.info("get_sub_data")
    # XPaths into the discussion_topic table.
    author_name_xpath = "//table[@class='discussion_topic']//p[@class='username']/a/text()"
    author_link_xpath = "//table[@class='discussion_topic']//p[@class='username']/a/@href"
    author_posted_xpath = "//table[@class='discussion_topic']//div/span[@class='graytext']/text()"
    author_all_text_xpath = "//table[@class='discussion_topic']//div[@class='discussion_text longtextfix485']/text()"
    author_name = response.xpath(author_name_xpath).extract()
    author_name = str(author_name[0])
    author_name = author_name.replace("\t","")
    # Commas are replaced in text fields — presumably to keep a downstream
    # CSV export clean; TODO confirm.
    author_name = author_name.replace(',',' ')
    author_link = response.xpath(author_link_xpath).extract()
    author_link = author_link[0]
    # The href is site-relative; prefix the host to make it absolute.
    author_link = "http://www.dailystrength.org%s"%author_link
    author_posted = response.xpath(author_posted_xpath).extract()
    author_posted = author_posted[0]
    author_posted = author_posted.replace(',','')
    author_posted = author_posted.replace('Posted on','')
    author_all_text = response.xpath(author_all_text_xpath).extract()
    author_all_text = str(author_all_text[0])
    # Normalise the body text: drop commas, tabs, repeated spaces, newlines.
    author_all_text = author_all_text.replace(',','')
    author_all_text = author_all_text.replace('\t','')
    author_all_text = author_all_text.replace('  ','')
    author_all_text = author_all_text.replace('\n','')
    topic = response.xpath("//div[contains(@class,'discussion_topic_header_subject')]/text()").extract()[0]
    item = PostItemsList()
    item['author'] = author_name
    item['author_link'] = author_link
    item['condition']="chronic lymphocytic leukemia"
    item['create_date'] = author_posted
    item['post'] = author_all_text
    item['topic'] = topic
    item['url'] = response.url
    print(author_all_text)
    logging.info(item.__str__())
    yield item
def parse_item(self, response):
    """Parse a vBulletin thread page into items (non-thread URLs skipped)."""
    if "threads" not in response.url:  # only two broad rules; filter here
        return
    url = response.url
    condition = "breast cancer"
    domain = "".join(self.allowed_domains)
    subject = response.xpath('//span[@class="threadtitle"]//text()').extract()[0]
    results = []
    for post in response.xpath('//ol[@id="posts"]/li'):
        entry = PostItemsList()
        entry['author'] = post.xpath(
            './/a[contains(@class, "username")]//text()').extract()[0]
        entry['author_link'] = post.xpath(
            './/a[contains(@class, "username")]/@href').extract()[0]
        date_bits = post.xpath('.//span[@class="date"]//text()').extract()
        entry['create_date'] = self.getDate(
            u" ".join(bit.strip() for bit in date_bits))
        body_bits = post.xpath('.//div[@class="content"]//text()').extract()
        entry['post'] = self.cleanText(
            u"".join(bit.strip() for bit in body_bits))
        entry['condition'] = condition
        entry['domain'] = domain
        entry['topic'] = subject
        entry['url'] = url
        logging.info(str(entry))
        results.append(entry)
    return results
def parsePostsList(self, response):
    """Parse a node page: the original post plus each comment.

    Bug fix: the original created a single ``PostItemsList`` before the
    comment loop and kept mutating/appending the same object, so the
    returned list held N references to one item, all showing the last
    comment's data. A fresh item is now created per comment.
    """
    items = []
    url = response.url
    subject = response.xpath('//h1[@class="node-title"]/text()').extract()[0]

    first = PostItemsList()
    first['author'] = response.xpath(
        '//div[@class="node-byline"]/text()').extract()[0]
    first['author_link'] = "no"  # board exposes no author profile link
    first['create_date'] = response.xpath(
        '//span[@class="node-date"]/span/text()').extract()[0]
    first['post'] = cleanText(" ".join(response.xpath(
        '//div[@class="field-item even"]//text()').extract()))
    first['tag'] = 'epilepsy'
    first['topic'] = subject
    first['url'] = url
    items.append(first)

    for post in response.xpath('//article[contains(@class, "comment")]'):
        item = PostItemsList()  # fresh item per comment (was reusing one)
        item['author'] = post.xpath(
            './/div[@class="node-author"]/span/text()').extract()[0]
        item['author_link'] = "no"
        item['create_date'] = post.xpath(
            './/div[@class="node-date"]/time/text()').extract()[0]
        item['post'] = cleanText(" ".join(post.xpath(
            './/div[@class="field-item even"]//text()').extract()))
        item['tag'] = 'epilepsy'
        item['topic'] = subject
        item['url'] = url
        items.append(item)
    return items
def parse_item(self, response):
    """Extract posts from a vBulletin thread page (non-thread URLs skipped)."""
    if "threads" not in response.url:  # crawl rules are broad; filter here
        return
    collected = []
    topic = response.xpath('//span[@class="threadtitle"]//text()').extract()[0]
    page_url = response.url
    for post in response.xpath('//ol[@id="posts"]/li'):
        date_bits = post.xpath('.//span[@class="date"]//text()').extract()
        body_bits = post.xpath('.//div[@class="content"]//text()').extract()
        fields = {
            'author': post.xpath(
                './/a[contains(@class, "username")]//text()').extract()[0],
            'author_link': post.xpath(
                './/a[contains(@class, "username")]/@href').extract()[0],
            'create_date': u" ".join(b.strip() for b in date_bits),
            'post': u"".join(b.strip() for b in body_bits),
            'tag': 'epilepsy',
            'topic': topic,
            'url': page_url,
        }
        item = PostItemsList()
        for key, value in fields.items():
            item[key] = value
        logging.info(item.__str__())
        collected.append(item)
    return collected
def parsePostsList(self, response):
    """One item per ``.vt_post_holder`` post on the page.

    Bug fixes:
    * ``create_date`` was built with ``list.extend()``, which returns None,
      so every item's create_date was None; the two timestamp lists are now
      concatenated instead.
    * ``logging.info(item.__str__)`` logged the bound method object; it now
      logs the item's string form.
    """
    sel = Selector(response)
    items = []
    topic = response.css('h1.caps').xpath('text()').extract()[0]
    url = response.url
    for post in sel.css(".vt_post_holder"):
        item = PostItemsList()
        item['author'] = post.css(
            '.vt_asked_by_user').xpath("./a").xpath("text()").extract()[0]
        item['author_link'] = post.css(
            '.vt_asked_by_user').xpath("./a").xpath("@href").extract()[0]
        # NOTE(review): reply timestamps are taken page-wide (response), not
        # per post — looks suspicious but preserved; confirm against the site.
        item['create_date'] = (
            post.css('.vt_first_timestamp').xpath('text()').extract()
            + response.css('.vt_reply_timestamp').xpath('text()').extract())
        item['post'] = re.sub(r'\s+', ' ', " ".join(
            post.css('.vt_post_body').xpath('text()').extract()
        ).replace("\t", "").replace("\n", "").replace("\r", ""))
        item['tag'] = 'epilepsy'
        item['topic'] = topic
        item['url'] = url
        logging.info(str(item))
        items.append(item)
    return items
def parse_item(self, response):
    """Scrape a Lithium-style board thread: one item per message view."""
    url = response.url
    subject = ''.join(
        part.strip() for part in
        response.xpath('//div[@class="lia-message-subject"]//text()').extract())
    results = []
    for post in response.xpath('//div[@class="lia-linear-display-message-view"]'):
        entry = PostItemsList()
        entry['author'] = post.xpath(
            './/a[contains(@class, "lia-user-name-link")]//text()').extract()[0]
        link = post.xpath(
            './/a[contains(@class, "lia-user-name-link")]/@href').extract()[0]
        entry['author_link'] = response.urljoin(link)
        # The date carries a left-to-right mark (U+200E); strip it, then
        # join date and time into one string.
        day = post.xpath('.//span[@class="local-date"]/text()').extract()[1]
        clock = post.xpath('.//span[@class="local-time"]//text()').extract()[0]
        entry['create_date'] = u' '.join([day.replace(u'\u200e', ''), clock])
        entry['post'] = ''.join(post.xpath(
            './/div[@class="lia-message-body-content"]//text()'
        ).extract()).strip()
        entry['tag'] = 'epilepsy'
        entry['topic'] = subject
        entry['url'] = url
        results.append(entry)
    return results
def parse_item(self, response):
    """Parse message-board posts from an 'all' listing page."""
    results = []
    if 'all' not in response.url:
        return results
    url = response.url
    subject = response.xpath('//div[@class="contentText"]/h1/text()').extract()[0]
    for post in response.xpath('//div[@class="mbpost"]'):
        entry = PostItemsList()
        entry['author'] = post.xpath(
            './/p/a/*[2]/text()|.//div[@class="author"]/p/span/text()'
        ).extract()[0]
        links = post.xpath('.//p/a/@href').extract()
        entry['author_link'] = links[0] if links else 'anon'
        # Header text reads "... on <date>"; pick the right fragment, then
        # keep everything after 'on'.
        header_bits = post.xpath('.//div[@class="header"]/p//text()').extract()
        raw = header_bits[2].strip() if len(header_bits) > 2 else header_bits[0]
        entry['create_date'] = raw[raw.find('on') + 2:].strip()
        entry['post'] = u''.join(
            post.xpath('.//div[@class="msgContent"]//text()').extract()[0]
        ).strip()
        entry['tag'] = 'epilepsy'
        entry['topic'] = subject
        entry['url'] = url
        results.append(entry)
    return results
def parsePostsList(self, response):
    """One item per forum post div on a topic page.

    Robustness fix: the original defaulted author/author_link to "anon"
    only when BOTH were missing (an ``and`` test), so a post missing just
    one of the two raised IndexError on the other. Each field now falls
    back to "anon" independently.
    """
    items = []
    subject = response.xpath('//div[@class="breadcrumb"]/text()').extract()[3]
    url = response.url
    for post in response.xpath('//div[contains(@id, "post-")]'):
        item = PostItemsList()
        author = post.xpath(
            './/div[@class="author-pane-line author-name"]/a/text()').extract()
        author_link = post.xpath(
            './/div[@class="author-pane-line author-name"]/a/@href').extract()
        item['author'] = author[0] if author else u"anon"
        item['author_link'] = author_link[0] if author_link else u"anon"
        item['create_date'] = post.xpath(
            './/div[@class="forum-posted-on"]/text()').extract()[0].strip()
        message = " ".join(post.xpath(
            './/div[@class="forum-post-content"]//text()').extract())
        item['post'] = cleanText(message)
        item['tag'] = 'epilepsy'
        item['topic'] = subject
        item['url'] = url
        items.append(item)
    return items
def parse(self, response):
    """Parse posts from Table.PostBox elements on a thread page.

    Improvement: the whitespace-normalising regex was recompiled (via
    ``re.sub`` with a string pattern) for every use inside the loop; it is
    now compiled once and reused.
    """
    logging.info(response)
    sel = Selector(response)
    # One compiled pattern covering spaces, control chars, and NBSP.
    ws_re = re.compile(" +|\n|\r|\t|\0|\x0b|\xa0")
    topic = response.xpath(
        '//div[contains(@id,"PageTitle")]/h1/text()').extract()[0]
    url = response.url
    items = []
    for post in sel.css("Table.PostBox"):
        item = PostItemsList()
        item['author'] = post.css(
            '.msgUser').xpath("./a[2]").xpath("text()").extract()[0]
        item['author_link'] = post.css('.msgUser').xpath("./a[2]/@href").extract()[0]
        # NOTE(review): date comes from the page-level msgThreadInfo cell,
        # so every post gets the same timestamp — confirm this is intended.
        item['create_date'] = ws_re.sub(
            ' ', response.css('td.msgThreadInfo').xpath('text()').extract()[0]
        ).strip()
        soup = BeautifulSoup(post.css('.PostMessageBody').extract()[0],
                             'html.parser')
        post_msg = ws_re.sub(' ', soup.get_text()).strip()
        item['post'] = post_msg
        item['tag'] = 'epilepsy'
        item['topic'] = topic
        item['url'] = url
        logging.info(post_msg)
        items.append(item)
    return items
def topic_parse(self, response):
    """Yield items for one topic page, then follow pagination.

    Bug fix: the pagination request was built from the *current* page URL
    instead of the resolved next-page link, so the spider re-fetched the
    same page forever. It now requests the resolved ``next_page``.
    """
    items = []
    subject = response.xpath(
        '//div[@class="for_title"]/text()').extract()[0].strip()
    next_page = response.xpath('//a[@class="paging"][last()]/@href').extract()
    url = response.url
    for post in response.xpath('//table[contains(@id, "msg_tbl")]'):
        item = PostItemsList()
        item['author'] = post.xpath('.//a[@class="titlehead"]/text()').extract()[0]
        item['author_link'] = response.urljoin(
            post.xpath('.//a[@class="titlehead"]/@href').extract()[0])
        item['post'] = "".join(
            post.xpath('.//div[@class="msg"]//text()').extract()).strip()
        item['create_date'] = post.xpath(
            './/span[contains(@id, "date")]/text()').extract()[0].strip()
        item['tag'] = 'epilepsy'
        item['topic'] = subject
        item['url'] = url
        items.append(item)
    yield {"items": items}
    if next_page:
        yield scrapy.Request(
            response.urljoin(next_page[0]),  # was `url`: re-requested same page
            callback=self.topic_parse,
            cookies={"KomenForumApptimefilter": "0"})
def parsePostsList(self, response):
    """Pair usergroup author anchors with date/message spans by index.

    Fixes:
    * ``soup.findAll`` for dates and messages was re-run on every loop
      iteration (re-scanning the whole document); each collection is now
      queried once up front.
    * ``logging.info(item.__str__)`` logged the bound method object; it
      now logs the item text.
    * Regex patterns use raw strings.
    """
    soup = BeautifulSoup(response.body)
    users = soup.findAll('a', {'class': re.compile(r'usergroup\d.*')})
    dates = soup.findAll('span', {'id': re.compile(r'posted_date_.*')})
    messages = soup.findAll('span', {'id': re.compile(r'post_message.*')})
    topic = response.xpath(
        '//tbody/tr[2]/td[2]/table/tbody/tr[1]/td/div/b').extract()
    url = response.url
    items = []
    for x in range(len(users)):
        item = PostItemsList()
        item['author'] = users[x].text
        item['author_link'] = users[x]['href']
        item['create_date'] = dates[x].text
        item['post'] = messages[x].text
        item['tag'] = 'cancer'
        item['topic'] = topic
        item['url'] = url
        logging.info(str(item))
        items.append(item)
    return items
def parsePostsList(self, response):
    """One item per thread post div.

    Bug fixes:
    * The XPaths inside the loop began with ``//``, which in Scrapy selects
      from the whole document rather than the current ``post`` node, so
      every item repeated the same page-wide data; they are now relative
      (``.//``).
    * ``logging.info(item.__str__)`` logged the bound method object; it
      now logs ``str(item)``.
    """
    sel = Selector(response)
    topic = response.xpath("id('topic')/article/h1/text()").extract()
    url = response.url
    items = []
    for post in sel.xpath('//div[contains(@class,"disc-forums disc-thread")]'):
        item = PostItemsList()
        item['author'] = post.xpath('.//div/div/a/p/strong[2]/text()').extract()
        item['author_link'] = post.xpath('.//div/div/a/@href').re(
            '/forums/profiles.*')
        item['create_date'] = post.xpath(
            './/span[contains(@class,"post-meta")]/time/@datetime')[0].extract()
        item['post'] = post.xpath(
            './/div[contains(@class,"post-content break-word")]/p[1]/text()'
        ).extract()
        item['tag'] = 'Breast Cancer and Screening'
        item['topic'] = topic
        item['url'] = url
        logging.info(str(item))
        items.append(item)
    return items
def topic_parse(self, response):
    """Parse full-post containers on a forum topic page.

    Pages whose URL contains 'discussions' are listing pages and are
    skipped.
    """
    if 'discussions' in response.url:
        return
    url = response.url
    subject = response.xpath(
        '//div[@class="forum-stats-container"]/h1/text()').extract()[0].strip()
    container = ('//div[@class="full-post-container fiji-full-post-container '
                 'evolution2-full-post-container"]')
    results = []
    for post in response.xpath(container):
        entry = PostItemsList()
        # The second anchor under user-name carries the display name.
        entry['author'] = post.xpath(
            './/span[@class="user-name"]/a/text()').extract()[1].strip()
        entry['author_link'] = post.xpath(
            './/span[@class="user-name"]/a/@href').extract()[0]
        entry['create_date'] = post.xpath(
            './/a[@class="internal-link view-post"]/text()').extract()[0]
        body = ' '.join(post.xpath(
            './/div[@class="post-content user-defined-markup"]//text()'
        ).extract())
        entry['post'] = body.strip()
        entry['tag'] = 'epilepsy'
        entry['topic'] = subject
        entry['url'] = url
        results.append(entry)
    return results
def parse_item(self, response):
    """Build items for the original topic post plus each reply.

    Bug fixes:
    * A single ``PostItemsList`` was reused and re-appended inside the
      reply loop, so the returned list held N references to one object
      (all showing the last reply); a fresh item is now created per reply.
    * The reply selector ``//div[class="post secondary"]`` was missing the
      ``@`` and matched nothing; corrected to ``@class``.
    """
    items = []

    def clean_date(date):
        # NOTE(review): never called in the original — probably intended
        # for original_create_date below; confirm before wiring it in.
        return date[0].split('\n')[2].strip()

    url = response.url
    subject = response.xpath('//p[@id="crumbs"]/a[2]/text()').extract()[0].strip()

    first = PostItemsList()
    first['author'] = response.xpath(
        '//div[@class="original-topic"]//div[@class="user-post"]/p/strong/a/text()'
    ).extract()[0]
    first['author_link'] = response.xpath(
        '//div[@class="original-topic"]//div[@class="user-post"]/p/strong/a/@href'
    ).extract()[0]
    first['create_date'] = response.xpath(
        '//div[@class="original-topic"]//span[@class="posted-time left"]//text()'
    ).extract()
    first['post'] = "".join(response.xpath(
        '//div[@class="original-topic"]//div[@class="user-post"]//text()'
    ).extract()).strip()
    first['tag'] = 'epilepsy'
    first['topic'] = subject
    first['url'] = url
    items.append(first)

    posts = response.xpath('//div[@class="post"]|//div[@class="post secondary"]')
    for post in posts:
        item = PostItemsList()  # fresh item per reply (was reusing one object)
        item['author'] = post.xpath(
            './/div[@class="user-info"]/a/text()').extract()[0]
        item['author_link'] = response.urljoin(
            post.xpath('.//div[@class="user-info"]/a/@href').extract()[0])
        item['create_date'] = post.xpath(
            './/p[@class="post-time"]/strong/text()').extract()[0]
        item['post'] = cleanText("".join(post.xpath(
            './/div[@class="user-post"]//p[not(@class="post-time")]//text()'
        ).extract()).strip())
        item['tag'] = 'epilepsy'
        item['topic'] = subject
        item['url'] = url
        items.append(item)
    return items