def parse(self, response):
    """Extract water-disruption notices from the table rows of *response*.

    Every ``//table/tr`` row that yields both a date (first
    ``td/strong`` text node) and a body (first ``td/p`` element, markup
    included) becomes one record. Records are numbered in document order
    starting at 1, wrapped in the SAINS category envelope, serialized with
    ``json.dumps`` and passed to ``save_to_DB``. Rows missing either part
    are skipped and do not consume a number.

    Args:
        response: Object exposing ``xpath()`` and ``url``
            (presumably a Scrapy response -- confirm against the spider).
    """
    page_url = response.url
    counter = 0
    for row in response.xpath("//table/tr[*]"):
        # Run each query once and reuse the result for both the presence
        # check and the record itself.
        date = row.xpath("td[*]/strong/text()").extract_first()
        text = row.xpath("td[*]/p").extract_first()
        if date is None or text is None:
            continue

        counter += 1
        # Key order is kept stable because it determines the JSON output.
        item = {
            'title': 'Maklumat Gangguan Bekalan Air (' + str(counter) + ') ' + date,
            'text': text,
            'page_link': page_url,
            'file_link': '',
            'date': date,
        }
        main = {
            'category': 'SAINS',
            'cat_desc': 'Syarikat Air Negeri Sembilan',
            'content': item,
        }
        # send content to Cache
        save_to_DB(json.dumps(main))
def parse_full_article(self, response):
    """Scrape a single article page and pass it to ``save_to_DB`` as JSON.

    The body text is the concatenation of the first text node of every
    ``//article/div/p`` paragraph; paragraphs without a text node are
    skipped. The title is the first ``//article/h1`` text node with runs
    of newlines and tabs removed, or an empty string when the page has no
    such node.

    Args:
        response: Object exposing ``xpath()`` and ``url``
            (presumably a Scrapy response -- confirm against the spider).
    """
    # Collect the pieces first and join once instead of growing a string
    # inside the loop.
    fragments = []
    for paragraph in response.xpath("//article/div/p"):
        text_nodes = paragraph.xpath("text()").extract()
        if text_nodes:
            fragments.append(text_nodes[0])
    text = "".join(fragments)

    # extract_first() gives None when nothing matches, and re.sub() raises
    # TypeError on None, so fall back to an empty title.
    raw_title = response.xpath("//article/h1/text()").extract_first()
    title = re.sub(pattern=r'\n+|\t+', repl='', string=raw_title or '')

    # Key order is kept stable because it determines the JSON output.
    item = {
        'title': title,
        'text': text,
        'page_link': response.url,
        'file_link': '',
        'date': '',
    }
    main = {
        'category': 'SAINS',
        'cat_desc': 'Syarikat Air Negeri Sembilan',
        'content': item,
    }
    # send content to Cache
    save_to_DB(json.dumps(main))
# All <article> elements of the parsed page; only the first one is used.
articles = soup.find_all('article')
# The title is the text of the `p` attribute of the first article.
title = str(articles[0].p.get_text())


def get_content(soupObject):
    """Return *soupObject* pretty-printed as HTML with every newline removed."""
    pretty_html = soupObject.prettify(formatter="html")
    return re.sub(r'\n', '', pretty_html)


# Single record for this page; key order determines the JSON output.
item = {
    'date': '',
    'title': title,
    'text': get_content(articles[0]),
    'file_link': '',
    'page_link': urlSAP,
}
contents = [item]

# Wrap each record in the SAP category envelope and serialize it.
for content in contents:
    main = {
        'category': 'SAP',
        'cat_desc': 'Syarikat Air Perlis',
        'content': content,
    }
    jsonstr = json.dumps(main)
    # send content to Cache
    save_to_DB(jsonstr)