def get_domain_name(self, response):
    """Parse a whois detail page: pull the domain name from <title> into
    the item, or — if the site served a rate-limit/block page — retry the
    same URL with fresh cookies.
    """
    request_state = self.if_too_many_request(response.body, 'name')
    item = response.meta['item']
    title = None
    if not request_state:
        # extract_first() returns None instead of raising IndexError when
        # the page has no usable <title> (e.g. an unexpected block page);
        # that case falls through to the retry branch below.
        title = response.xpath("//head/title/text()").extract_first()
    if title is not None:
        # The page title ends with a fixed site suffix after the domain
        # name; strip it from the UTF-8 encoding.
        # NOTE(review): slicing the *encoded* bytes assumes the suffix is
        # always exactly 105 bytes — confirm against live pages.
        item['domain'] = title.encode('utf8')[:-105]
        yield item
    else:
        # Rate-limited (or malformed page): re-request the same URL with a
        # freshly obtained cookie set.
        url = response.request.url
        cookie = get_cookie()
        yield scrapy.Request(
            url,
            headers=self.head,
            meta={'cookie': cookie, 'item': item},
            cookies={
                "__cfduid": cookie[1],
                "cf_clearance": cookie[2],
                "BenmiUserInfo2": "Benmi-UN=hahaha321",
                "SITEINFO": "66b/UN0Nvf1MujwHhivXoluFewMFC48CdOZ9YpNXKEg=; ",
            },
            callback=self.get_domain_name,
            dont_filter=True)
def start_requests(self):
    """Kick off the crawl: request the reverse-whois (by phone) search
    page for the first pending registrant.

    Returns a list of requests; an empty list when no registrant is
    pending (the original implicitly returned None, which Scrapy cannot
    iterate).
    """
    registrant = self.get_registrant()
    if registrant is None:
        return []
    cookie = get_cookie()
    url = "https://www.benmi.com/rwhois?q=" + registrant + "&t=tel"
    return [
        scrapy.Request(
            url,
            headers=self.head,
            meta={'registrant': registrant, 'cookie': cookie},
            cookies={
                "__cfduid": cookie[1],
                "cf_clearance": cookie[2],
                "BenmiUserInfo2": "Benmi-UN=hahaha321",
                "SITEINFO": "66b/UN0Nvf1MujwHhivXoluFewMFC48CdOZ9YpNXKEg=; ",
            })
    ]
def get_first_page(self, response):
    """Parse one page of reverse-whois search results: queue a whois
    detail request for every domain row in the results grid.

    A rate-limited response yields nothing (no retry here, matching the
    original behavior for this page type).
    """
    request_state = self.if_too_many_request(response.body, 'first_page')
    registrant = response.meta['registrant']
    if not request_state:
        sel = Selector(text=response.body)
        # Each result row's "..." thumbnail link carries the detail-page
        # href. The original built this list twice (a dead first pass plus
        # a per-row loop); a single XPath query yields the same hrefs and
        # skips rows that lack the link instead of raising IndexError.
        href_xpath = u'//table[@class="sf-grid" and @id = "sf-grid"]/tr/td[@class = "lf"]/a/img[@alt="..."]/../@href'
        for href in sel.xpath(href_xpath).extract():
            cookie = get_cookie()
            item = RwhoisRegistrantItem()
            item['registrant'] = registrant
            yield scrapy.Request(
                "https://www.benmi.com" + href,
                headers=self.head,
                meta={'cookie': cookie, 'item': item},
                cookies={
                    "__cfduid": cookie[1],
                    "cf_clearance": cookie[2],
                    "BenmiUserInfo2": "Benmi-UN=hahaha321",
                    "SITEINFO": "66b/UN0Nvf1MujwHhivXoluFewMFC48CdOZ9YpNXKEg=; ",
                },
                callback=self.get_domain_name,
                dont_filter=True)
def parse(self, response):
    """Entry parser for a registrant's first search-results page.

    On success: queue every numbered result page for this registrant,
    mark the registrant finished, then chain to the next pending
    registrant (if any). On a rate-limit page: retry the same URL with
    fresh cookies.
    """
    request_state = self.if_too_many_request(response.body, 'parse')
    registrant = response.meta['registrant']
    if not request_state:
        all_page_num = self.get_page_num(response.body)
        # Queue every result page 1..all_page_num (for-range replaces the
        # original manual while/counter).
        for num in range(1, all_page_num + 1):
            url = ("https://www.benmi.com/rwhois?p=" + str(num) +
                   "&q=" + registrant + "&t=tel")
            print(url)  # parenthesized form is valid on Python 2 and 3
            cookie = get_cookie()
            yield scrapy.Request(
                url,
                headers=self.head,
                meta={'cookie': cookie, 'registrant': registrant},
                cookies={
                    "__cfduid": cookie[1],
                    "cf_clearance": cookie[2],
                    "BenmiUserInfo2": "Benmi-UN=hahaha321",
                    "SITEINFO": "66b/UN0Nvf1MujwHhivXoluFewMFC48CdOZ9YpNXKEg=; ",
                },
                callback=self.get_first_page,
                dont_filter=True)
        self.finish_registrant(registrant)
        # Chain to the next pending registrant, restarting the cycle.
        registrant = self.get_registrant()
        if registrant is not None:
            cookie = get_cookie()
            url = "https://www.benmi.com/rwhois?q=" + registrant + "&t=tel"
            yield scrapy.Request(
                url,
                headers=self.head,
                meta={'registrant': registrant, 'cookie': cookie},
                cookies={
                    "__cfduid": cookie[1],
                    "cf_clearance": cookie[2],
                    "BenmiUserInfo2": "Benmi-UN=hahaha321",
                    "SITEINFO": "66b/UN0Nvf1MujwHhivXoluFewMFC48CdOZ9YpNXKEg=; ",
                },
                callback=self.parse,
                dont_filter=True)
    else:
        # Rate-limited: retry the same URL with freshly obtained cookies.
        url = response.request.url
        cookie = get_cookie()
        yield scrapy.Request(
            url,
            headers=self.head,
            meta={'registrant': registrant, 'cookie': cookie},
            cookies={
                "__cfduid": cookie[1],
                "cf_clearance": cookie[2],
                "BenmiUserInfo2": "Benmi-UN=hahaha321",
                "SITEINFO": "66b/UN0Nvf1MujwHhivXoluFewMFC48CdOZ9YpNXKEg=; ",
            },
            callback=self.parse,
            dont_filter=True)