def parse(self, response):
    """Parse one page of a paginated JSON listing.

    On page 1, schedules requests for the remaining pages (20 entries per
    page per the `total / 20` arithmetic below). Yields one CrawlingItem per
    entry in `data['items']`. If the body is not the expected JSON payload,
    re-queues the same URL and stops.
    """
    content = response.text
    try:
        data = json.loads(content)
        items = data['items']
    # Narrowed from a bare `except:`: only malformed JSON (ValueError /
    # json.JSONDecodeError) or a missing 'items' key mean "retry".
    except (ValueError, KeyError):
        # dont_filter=True so the dupe filter does not drop the re-queued URL.
        yield Request(response.url,
                      meta={'key': response.meta['key']},
                      callback=self.parse,
                      dont_filter=True)
        return
    # URL is assumed to end with "...=<page_num>"; `equal` is the index just
    # past the last '=' so the page number can be swapped out below.
    equal = response.url.rfind('=') + 1
    page_num = int(response.url.split('=')[-1])
    if page_num == 1:
        # Schedule pages 2 .. total//20 + 1 (20 entries per page).
        for i in range(2, int(data['total'] / 20) + 2):
            yield Request(response.url[0:equal] + str(i),
                          meta={'key': response.meta['key']},
                          callback=self.parse,
                          dont_filter=True)
    for entry in items:
        item = CrawlingItem()
        item._values = entry
        item._values['key'] = response.meta['key']
        item._values['collection'] = 'temp_pdd'
        yield item
def parse(self, response):
    """Parse a JSON API response into a single CrawlingItem.

    Retries the same URL when the body is not valid JSON; otherwise yields
    one item carrying the decoded payload plus the propagated 'key'.
    """
    content = response.text
    try:
        data = json.loads(content)
    # Narrowed from a bare `except:` to the JSON decode failure it guards.
    except ValueError:
        # dont_filter=True added: without it scrapy's dupe filter silently
        # drops the retry of an already-seen URL, making the retry a no-op
        # (the sibling parsers already pass it).
        yield Request(response.url,
                      meta={'key': response.meta['key']},
                      callback=self.parse,
                      dont_filter=True)
        return
    item = CrawlingItem()
    item._values = data
    item._values['key'] = response.meta['key']
    yield item
def parse_detail(self, response):
    """Parse a SKU detail JSON response into a CrawlingItem.

    Retries the same URL (dont_filter=True) when the body is not valid
    JSON; otherwise tags the decoded dict with the propagated 'key' and
    the 'temp_pdd_sku' collection and yields it.
    """
    content = response.text
    try:
        data = json.loads(content)
    # Narrowed from `except Exception as e` (the bound `e` was unused):
    # only a JSON decode failure should trigger the retry.
    except ValueError:
        yield Request(response.url,
                      meta={'key': response.meta['key']},
                      callback=self.parse_detail,
                      dont_filter=True)
        return
    data['key'] = response.meta['key']
    item = CrawlingItem()
    item._values = data
    item._values['collection'] = 'temp_pdd_sku'
    yield item
def page_content(self, response):
    """Yield one item per table row (rows 4, 7, ..., 31 of the fixture table).

    'team' is None when the row has no team cell; 'date' is read
    unconditionally (a missing date cell raises, matching the original
    control flow where the date lookup sat outside the try).
    """
    for row in range(4, 34, 3):
        # Bug fix: the original created ONE CrawlingItem before the loop and
        # mutated it after each yield, so every yielded item aliased the same
        # object. A fresh item per row keeps earlier yields intact.
        item = CrawlingItem()
        team_xpath = ('//*[@id="container"]/div[2]/div[1]/div[1]/table/tbody'
                      '/tr[{}]/td/a/span/text()'.format(row))
        date_xpath = ('//*[@id="container"]/div[2]/div[1]/div[1]/table/tbody'
                      '/tr[{}]/td[4]/span/text()'.format(row))
        try:
            item["team"] = response.xpath(team_xpath)[0].extract()
        # Narrowed from a bare `except:`: an empty selector list is the
        # failure mode being guarded.
        except IndexError:
            item["team"] = None
        item["date"] = response.xpath(date_xpath)[0].extract()
        yield item
def parse_minister(self, response):
    """Scrape one MP/minister profile page into a schema-validated item.

    Reads the name from ".MProwD strong" and the labelled fields from the
    ".MPinfo li" entries (labels are in Bulgarian), then validates the
    assembled dict against the local schemas.json before returning it.
    Fields absent from the page stay as empty strings.
    """
    person = CrawlingItem()
    separator = " "
    main_information = response.css(".MPinfo li")
    name = separator.join(
        response.css(".MProwD strong::text").getall()).title()
    birth_date = ""
    birth_place = ""
    profession = ""
    languages = ""
    political_force = ""
    email = ""
    for info in main_information:
        if "Дата на раждане" in str(info):  # "Date of birth"
            # Word 5 is the date; everything after it is the birth place.
            birth_date = info.css("::text").get().split()[4]
            birth_place = separator.join(
                info.css("::text").get().split()[5:])
        if "Професия" in str(info):  # "Profession"
            profession = ", ".join(
                info.css("::text").get().split()[1].split(";")
                [:-1]).title()
        if "Езици" in str(info):  # "Languages"
            languages = ", ".join(
                info.css("::text").get().split()[1].split(";")
                [:-1]).title()
        if "политическа сила" in str(info):  # "political force"
            political_force = separator.join(
                info.css("::text").get().split()[4:-1])
        if "E-mail" in str(info):
            email = info.css("::text")[1].get()
    person["name"] = name
    person["birth_date"] = birth_date
    person["birth_place"] = birth_place
    person["profession"] = profession
    person["languages"] = languages
    person["political_force"] = political_force
    person["email"] = email
    # Raw string fixes the invalid '\s' escape in the original literal
    # (DeprecationWarning today, error in future Python); the path bytes
    # are unchanged. NOTE(review): a backslash path is Windows-only —
    # consider os.path.join / pathlib.
    with open(r'crawling\schemas.json') as json_file:
        data = json.load(json_file)
    jsonschema.validate(dict(person), data)
    return person
def parse_urlInfo(self, response):
    """Extract source, title, date and body paragraphs from an article page."""

    def first(xp):
        # First match of an xpath expression (raises IndexError if absent,
        # matching the original [0] indexing).
        return response.xpath(xp).extract()[0]

    item = CrawlingItem()
    item['source'] = first('//*[@id="cSub"]/div[1]/em/a/img/@alt')
    item['title'] = first('//*[@id="cSub"]/div[1]/h3/text()')
    # Keep only the leading YYYYMMDD of the og:regDate meta content.
    item['date'] = first(
        '/html/head/meta[contains(@property, "og:regDate")]/@content')[:8]
    div_texts = response.xpath(
        '//*[@id="harmonyContainer"]/section/div[contains(@dmcf-ptype, "general")]/text()'
    ).extract()
    p_texts = response.xpath(
        '//*[@id="harmonyContainer"]/section/p[contains(@dmcf-ptype, "general")]/text()'
    ).extract()
    item['contents'] = div_texts + p_texts
    # Attempt crawling every 5 seconds (throttle between pages).
    time.sleep(5)
    yield item
def parse(self, response):
    """Render the page with Selenium, scrape each asset-table row, insert
    it into MySQL, and yield the item.

    Opens (and now reliably closes) one DB connection per row, matching
    the original per-row connect behavior.
    """
    self.browser.get(response.url)
    html = self.browser.find_element_by_xpath('//*').get_attribute(
        'outerHTML')
    self.browser.close()
    selector = Selector(text=html)
    for sel in selector.xpath('//table[@id="tableAsset"]/tbody/tr'):
        item = CrawlingItem()
        # Static cell: coin name.
        item['coinName'] = sel.xpath(
            'td[@class="click left_l"]/text()').extract()[0]
        # Dynamically rendered cell: current price (second text node).
        item['coinPrice'] = sel.xpath(
            'td[@class="right click padding_right50 line_td"]/strong/text()'
        ).extract()[1]
        item['date'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
        # NOTE(review): credentials are hard-coded; move to settings/env.
        self.conn = mysql.connector.connect(
            user='******',
            password='******',
            host='127.0.0.1',
            database='wordpress',
        )
        try:
            self.cursor = self.conn.cursor()
            # Parameterized query: the original interpolated scraped text
            # straight into the SQL string (injection / quoting bugs when a
            # value contains a quote). The driver now escapes the values.
            self.cursor.execute(
                "INSERT INTO cc VALUES(%s, %s, %s)",
                (item['coinName'], item['coinPrice'], item['date']))
            self.conn.commit()
            self.cursor.close()
        finally:
            # Close even when the insert raises, so connections don't leak.
            self.conn.close()
        print("==================")
        yield item
        print("==================")