Example #1
def get_domain_name(self, response):
    # Check whether the site served its "too many requests" page instead of results.
    request_state = self.if_too_many_request(response.body, 'name')
    item = response.meta['item']
    if not request_state:
        # The page title holds the domain followed by the site's fixed suffix;
        # encode and strip that suffix, keeping only the domain name.
        domain = response.xpath("//head/title/text()").extract()[0]
        domain = domain.encode('utf8')[:-105]
        item['domain'] = domain
        yield item
    else:
        # Rate-limited: fetch fresh cookies and retry the same URL.
        url = response.request.url
        cookie = get_cookie()
        yield scrapy.Request(
            url,
            headers=self.head,
            meta={'cookie': cookie, 'item': item},
            cookies={
                "__cfduid": cookie[1],
                "cf_clearance": cookie[2],
                "BenmiUserInfo2": "Benmi-UN=hahaha321",
                "SITEINFO": "66b/UN0Nvf1MujwHhivXoluFewMFC48CdOZ9YpNXKEg=; "
            },
            callback=self.get_domain_name,
            dont_filter=True)
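
All four examples are methods of the same Scrapy spider and rely on pieces that are not shown here: a head attribute holding the request headers, an if_too_many_request() method that detects the site's rate-limit page, and a module-level get_cookie() whose result is indexed as cookie[1] and cookie[2] for the __cfduid and cf_clearance values. A minimal sketch of that surrounding code, with every name and return shape assumed rather than taken from the original project:

import scrapy

def get_cookie():
    # Assumed helper: returns a sequence whose slots 1 and 2 hold fresh
    # Cloudflare cookie values; slot 0 is unused by the snippets here.
    return (None, 'fresh-__cfduid-value', 'fresh-cf_clearance-value')

class BenmiRwhoisSpider(scrapy.Spider):
    # Hypothetical spider skeleton; the class name is invented.
    name = 'benmi_rwhois'
    head = {'User-Agent': 'Mozilla/5.0'}  # passed as headers=self.head

    def if_too_many_request(self, body, stage):
        # Assumed check: True when the response body is the anti-bot /
        # rate-limit page rather than real results.
        return b'too many request' in body.lower()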
Example #2
def start_requests(self):
    registrant = self.get_registrant()
    # registrant = '%2B86.18197565656'
    if registrant is not None:
        cookie = get_cookie()
        # url = "https://www.benmi.com/whoishistory/" + domain + ".html"
        url = "https://www.benmi.com/rwhois?q=" + registrant + "&t=tel"
        return [
            scrapy.Request(
                url,
                headers=self.head,
                meta={'registrant': registrant, 'cookie': cookie},
                cookies={
                    "__cfduid": cookie[1],
                    "cf_clearance": cookie[2],
                    "BenmiUserInfo2": "Benmi-UN=hahaha321",
                    "SITEINFO": "66b/UN0Nvf1MujwHhivXoluFewMFC48CdOZ9YpNXKEg=; "
                })
        ]
    return []  # no registrant left to crawl
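
The commented-out sample value '%2B86.18197565656' shows the registrant already percent-encoded (a literal '+' would otherwise be read as a space in the query string). If get_registrant() handed back raw values instead, the query could be built with the standard library; this is a sketch under that assumption, not part of the original spider:

from urllib.parse import quote

registrant = '+86.18197565656'  # raw value, assumed for illustration
url = "https://www.benmi.com/rwhois?q=" + quote(registrant, safe='') + "&t=tel"
# -> https://www.benmi.com/rwhois?q=%2B86.18197565656&t=tel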
Example #3
def get_first_page(self, response):
    request_state = self.if_too_many_request(response.body, 'first_page')
    registrant = response.meta['registrant']
    if not request_state:
        s = Selector(text=response.body)
        # Each row of the result grid links to one domain's detail page.
        rows = s.xpath(u'//table[@class="sf-grid" and @id="sf-grid"]/tr')
        domain_url_list = []
        for row in rows:
            href = row.xpath(
                'td[@class="lf"]/a/img[@alt="..."]/../@href').extract()[0]
            domain_url_list.append(href)
        for url in domain_url_list:
            cookie = get_cookie()
            url = "https://www.benmi.com" + url
            item = RwhoisRegistrantItem()
            item['registrant'] = registrant
            yield scrapy.Request(
                url,
                headers=self.head,
                meta={'cookie': cookie, 'item': item},
                cookies={
                    "__cfduid": cookie[1],
                    "cf_clearance": cookie[2],
                    "BenmiUserInfo2": "Benmi-UN=hahaha321",
                    "SITEINFO": "66b/UN0Nvf1MujwHhivXoluFewMFC48CdOZ9YpNXKEg=; "
                },
                callback=self.get_domain_name,
                dont_filter=True)
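
Example #3 instantiates RwhoisRegistrantItem, which is defined elsewhere in the project. Judging by the fields the callbacks assign (registrant here, domain in Example #1), its definition is roughly the following; the sketch is inferred, not copied from the original items.py:

import scrapy

class RwhoisRegistrantItem(scrapy.Item):
    registrant = scrapy.Field()  # phone number the reverse-whois search ran on
    domain = scrapy.Field()      # domain name taken from the detail page title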
Example #4
def parse(self, response):
    request_state = self.if_too_many_request(response.body, 'parse')
    registrant = response.meta['registrant']
    if not request_state:
        # Queue one request per result page for the current registrant.
        all_page_num = self.get_page_num(response.body)
        num = 1
        while num <= all_page_num:
            url = "https://www.benmi.com/rwhois?p=" + \
                  str(num) + "&q=" + registrant + "&t=tel"
            print(url)
            cookie = get_cookie()
            num = num + 1
            yield scrapy.Request(
                url,
                headers=self.head,
                meta={'cookie': cookie, 'registrant': registrant},
                cookies={
                    "__cfduid": cookie[1],
                    "cf_clearance": cookie[2],
                    "BenmiUserInfo2": "Benmi-UN=hahaha321",
                    "SITEINFO": "66b/UN0Nvf1MujwHhivXoluFewMFC48CdOZ9YpNXKEg=; "
                },
                callback=self.get_first_page,
                dont_filter=True)
        # Mark this registrant as done and start on the next one, if any.
        self.finish_registrant(registrant)
        registrant = self.get_registrant()
        if registrant is not None:
            cookie = get_cookie()
            # url = "https://www.benmi.com/whoishistory/" + domain + ".html"
            url = "https://www.benmi.com/rwhois?q=" + registrant + "&t=tel"
            yield scrapy.Request(
                url,
                headers=self.head,
                meta={'registrant': registrant, 'cookie': cookie},
                cookies={
                    "__cfduid": cookie[1],
                    "cf_clearance": cookie[2],
                    "BenmiUserInfo2": "Benmi-UN=hahaha321",
                    "SITEINFO": "66b/UN0Nvf1MujwHhivXoluFewMFC48CdOZ9YpNXKEg=; "
                },
                callback=self.parse,
                dont_filter=True)
    else:
        # Rate-limited: refresh cookies and retry this results page.
        url = response.request.url
        cookie = get_cookie()
        yield scrapy.Request(
            url,
            headers=self.head,
            meta={'registrant': registrant, 'cookie': cookie},
            cookies={
                "__cfduid": cookie[1],
                "cf_clearance": cookie[2],
                "BenmiUserInfo2": "Benmi-UN=hahaha321",
                "SITEINFO": "66b/UN0Nvf1MujwHhivXoluFewMFC48CdOZ9YpNXKEg=; "
            },
            callback=self.parse,
            dont_filter=True)