def get_token(self):
    """
    Generate the token that will be used for the case detail api.

    Steps:
      1. fetch the google recaptcha site key from ``self.SITE_KEY_URL``
      2. obtain a recaptcha answer for ``self.SEARCH_URL`` through
         ``get_recaptcha_answer``
      3. replace ``self.GLOBAL_SESSION`` with a new ``InitializedSession``
         that carries the answer in a ``recaptcha`` header
      4. request ``self.TOKEN_URL`` with that session

    This function returns an object:
      {'token': <value>}  when the response is a JSON object with 'token'
      {'token': ''}       when the response is valid JSON without a token
      {'error': <text>}   when a request cannot connect or the token
                          response is not valid JSON
    """
    # get site_key for google recaptcha
    try:
        site_key_response = self.GLOBAL_SESSION.get(self.SITE_KEY_URL)
        site_key = site_key_response.text.replace('"', '').strip()
    except requests.ConnectionError as e:
        print("Connection failure : " + str(e))
        print("Verification with InsightFinder credentials Failed")
        return {'error': str(e)}
    print(site_key)

    # get recaptcha_answer with site_key and the page url that shows the CAPTCHA
    recaptcha_answer = get_recaptcha_answer(site_key, self.SEARCH_URL)
    print(recaptcha_answer)

    # get case_token with recaptcha_answer
    self.GLOBAL_SESSION = InitializedSession(
        headers={'recaptcha': recaptcha_answer})
    try:
        r = self.GLOBAL_SESSION.get(self.TOKEN_URL)
    except requests.ConnectionError as e:
        print("Connection failure : " + str(e))
        print("Verification with InsightFinder credentials Failed")
        return {'error': str(e)}

    # decode the body once; a non-JSON body (e.g. an HTML error page) is
    # reported through the same 'error' shape instead of raising
    try:
        payload = json.loads(r.text)
    except ValueError as e:
        print("Invalid token response : " + str(e))
        return {'error': str(e)}

    # only a JSON object can carry the token; lists/strings/numbers cannot
    if isinstance(payload, dict) and 'token' in payload:
        return {'token': payload['token']}
    return {'token': ''}
def get_cookie(self, input_string):
    """
    Get the cookies set by the website after submitting the search form.

    input_string (firstName+lastName) is written into the search input of
    ``self.HOME_URL`` in a Selenium-driven Chrome. The google recaptcha on
    that page is answered through ``get_recaptcha_answer`` using the
    site key read from the page, and the submit button is clicked.

    According to the original author, the website does not receive the
    search input as form data or parameters; it reads it from the cookie
    returned by the server when the form is submitted.

    The browser is always closed before returning, including when any
    step raises.

    This function returns an object mapping each cookie name to its value.
    """
    driver = webdriver.Chrome('./chromedriver.exe')
    # alternative: webdriver.Chrome(ChromeDriverManager().install())
    try:
        driver.get(self.HOME_URL)

        # fill up the search input with input_string; the value is passed
        # as a script argument so quotes in a name (O'Brien) cannot break
        # or alter the injected JavaScript
        search_form = driver.find_element_by_css_selector(
            '#SearchCriteriaContainer input.form-control')
        driver.execute_script(
            "arguments[0].value = arguments[1]", search_form, input_string)

        # get site_key for google recaptcha
        site_key = driver.find_element_by_class_name(
            'g-recaptcha').get_attribute('data-sitekey')

        # get recaptcha_answer for this page
        recaptcha_answer = get_recaptcha_answer(site_key, self.HOME_URL)
        print(recaptcha_answer)

        # fill up the recaptcha_response textarea to overcome the recaptcha
        recaptcha_response = driver.find_element_by_class_name(
            'g-recaptcha-response')
        driver.execute_script(
            "arguments[0].innerHTML = arguments[1]",
            recaptcha_response, recaptcha_answer)

        # go to the search results page by clicking submit button
        driver.find_element_by_css_selector('#btnSSSubmit').click()

        # build the cookie dict while the browser session is still alive
        cookies = {
            cookie['name']: cookie['value']
            for cookie in driver.get_cookies()
        }
    finally:
        # release the Chrome/chromedriver processes on every path
        driver.quit()

    print(cookies)
    return cookies
def get_case_detail(self, case_number, page_number, last_name, first_name):
    """
    Get every information of case detail with given case number and page number.

    A recaptcha answer is requested for ``self.CASE_DETAIL_URL`` and a Lua
    script is posted to the Splash endpoint. The script searches by name
    (sent as "last_name/first_name"), optionally clicks the pager link for
    the requested page (sent as page_number + 1), opens the row whose case
    number matches, fills the captcha response into '#Form1' and submits it.

    This function returns an object:
      the result of ``self.parse_case_detail`` when the reply has 'html'
      {}                  when the reply has no 'html'
      {'error': <text>}   when the request cannot connect

    NOTE(review): a second ``get_case_detail(self, case_number)`` is defined
    later in this file; if both live in the same class the later definition
    replaces this one - confirm which is intended.
    """
    recaptcha_answer = get_recaptcha_answer(
        self.SITE_KEY, self.CASE_DETAIL_URL)
    print(recaptcha_answer)

    # the script text is sent verbatim to Splash
    lua_source = ''' function main(splash) assert(splash:autoload("https://code.jquery.com/jquery-2.1.3.min.js")) treat = require("treat") local url = splash.args.url local case_number = splash.args.case_number local page = splash.args.page local captcha_response = splash.args.captcha_response assert(splash:go(url)) assert(splash:wait(5)) local form = splash:select('#frmDefault') local values = assert(form:form_values()) values.txtDKTNAME = splash.args.name assert(form:fill(values)) local element = splash:select('#btnGO') local bounds = element:bounds() assert(element:mouse_click{x=bounds.width/3, y=bounds.height/3}) assert(splash:wait(1)) local page_links = splash:select_all('tr.mypager:nth-child(1) td') local cases = {} local case_count = 0 if page > 1 then local element = splash:select('tr.mypager:nth-child(1) td:nth-child('..page..') a') if element then local bounds = element:bounds() assert(element:mouse_click{x=bounds.width/3, y=bounds.height/3}) assert(splash:wait(1)) end end local rows = splash:select_all('table#gvDocket > tbody > tr') for j, row in ipairs(rows) do local case_number_element = splash:select('table#gvDocket > tbody > tr:nth-child('..j..') > td:nth-child(2) span') if case_number_element then if case_number_element:text() == case_number then local view_button = splash:select('table#gvDocket > tbody > tr:nth-child('..j..') > td:nth-child(1) input') local bounds = view_button:bounds() assert(view_button:mouse_click{x=bounds.width/3, y=bounds.height/3}) assert(splash:wait(1)) end end end local sitekey = splash:select('div.g-recaptcha'):getAttribute('data-sitekey') assert(splash:wait(1)) local form1 = splash:select('#Form1') local values1 = form1:form_values() 
values1['g-recaptcha-response'] = captcha_response assert(form1:fill(values1)) assert(form1:submit()) assert(splash:wait(1)) return { url = splash:url(), html = splash:html(), sitekey=sitekey, values=values1 } end '''

    try:
        splash_args = {
            'url': self.SEARCH_URL,
            'lua_source': lua_source,
            'case_number': case_number,
            'page': page_number + 1,
            'captcha_response': recaptcha_answer,
            'name': last_name + '/' + first_name,
        }
        response = self.GLOBAL_SESSION.post(
            SPLASH_URL,
            auth=(SPLASH_USERNAME, SPLASH_PASSWORD),
            json=splash_args)

        splash_result = json.loads(response.text)
        if 'html' not in splash_result:
            return {}

        detail_page = BeautifulSoup(
            splash_result['html'], features="html.parser")
        return self.parse_case_detail(detail_page)
    except requests.ConnectionError as e:
        print("Connection failure : " + str(e))
        print("Verification with InsightFinder credentials Failed")
        return {'error': str(e)}
def search_in_orange_fl(self, first_name, last_name, dob):
    """
    Scrape the web site using the given search criteria.

    The names are normalized with ``NameNormalizer`` and, together with the
    stripped dob, stored on ``self.FIRST_NAME`` / ``self.LAST_NAME`` /
    ``self.DOB``. The recaptcha site key found on ``self.SEARCH_URL`` is
    stored on ``self.SITE_KEY``. A Lua script is then posted to the Splash
    endpoint to submit the search form, and every case parsed from the
    result page gets a 'case_detail' entry from ``self.get_case_detail``.

    This function either returns an object with a field called "result"
    which is an array of cases, or an object with a field called "error"
    with a error string
    e.g.
    { "result": [...] } or { "error": "..." }

    A search page without the expected form or recaptcha site key yields
    { "error": "Server Error" }; a Splash reply that is not a JSON object
    containing 'html' yields { "error": "Internal Server Error" }.
    """
    first_name = NameNormalizer(first_name).normalized()
    last_name = NameNormalizer(last_name).normalized()
    if dob:
        dob = dob.strip()

    # NOTE(review): a falsy dob (None / '') is stored and forwarded as-is;
    # confirm the Lua send_text call below tolerates that.
    self.FIRST_NAME = first_name
    self.LAST_NAME = last_name
    self.DOB = dob

    try:
        r = self.GLOBAL_SESSION.get(self.SEARCH_URL)
        soup = BeautifulSoup(r.text, features="html.parser")

        # the form and the recaptcha widget must both be present; a page
        # missing either is reported as an error instead of raising
        search_form = soup.find('form', class_='form-horizontal')
        recaptcha_div = soup.find('div', class_='g-recaptcha')
        if (search_form is None or recaptcha_div is None
                or 'data-sitekey' not in recaptcha_div.attrs):
            return {'error': 'Server Error'}

        self.SITE_KEY = recaptcha_div.attrs['data-sitekey']
        captcha_response = get_recaptcha_answer(
            self.SITE_KEY, self.SEARCH_URL)

        # NOTE(review): the script concatenates the names and captcha
        # answer into JavaScript source; a '"' in any of them would break
        # the generated statement.
        LUA_SCRIPT = ''' function main(splash) assert(splash:autoload("https://code.jquery.com/jquery-2.1.3.min.js")) treat = require("treat") assert(splash:go(splash.args.url)) assert(splash:wait(2)) assert(splash:runjs('$("input[name=FirstName]").val("'..splash.args.first_name..'")')) assert(splash:runjs('$("input[name=LastName]").val("'..splash.args.last_name..'")')) assert(splash:runjs('$("textarea[name=g-recaptcha-response]").val("'..splash.args.captcha_response..'")')) assert(splash:wait(1)) local form = splash:select('.form-horizontal') local values = assert(form:form_values()) assert(form:submit()) assert(splash:wait(5)) local search_input = splash:select('input[type=search]') search_input:send_text(splash.args.dob) assert(splash:wait(3)) return { url = splash:url(), html = splash:html(), values = values } end '''

        r = self.GLOBAL_SESSION.post(
            SPLASH_URL,
            auth=(SPLASH_USERNAME, SPLASH_PASSWORD),
            json={
                'url': self.SEARCH_URL,
                'lua_source': LUA_SCRIPT,
                'first_name': self.FIRST_NAME,
                'last_name': self.LAST_NAME,
                'dob': self.DOB,
                'captcha_response': captcha_response
            })
        print(r.text)

        # decode once; anything that is not a JSON object with 'html'
        # (including a non-JSON body) is an error
        try:
            splash_result = json.loads(r.text)
        except ValueError:
            return {'error': "Internal Server Error"}
        if not isinstance(splash_result, dict) or 'html' not in splash_result:
            return {'error': "Internal Server Error"}

        cases = self.parse_search_results(
            BeautifulSoup(splash_result['html'], features="html.parser"))
        for case in cases:
            case['case_detail'] = self.get_case_detail(case['case_number'])
    except requests.ConnectionError as e:
        print("Connection failure : " + str(e))
        print("Verification with InsightFinder credentials Failed")
        return {'error': str(e)}

    return {'result': cases}
def get_case_detail(self, case_number):
    """
    Get every information of case detail with given case number.

    A recaptcha answer is requested for ``self.SEARCH_URL`` and a Lua script
    is posted to the Splash endpoint. The script repeats the search with
    ``self.FIRST_NAME`` / ``self.LAST_NAME`` / ``self.DOB``, then clicks the
    case link whose text equals case_number and returns the page html.

    This function returns an object:
      the result of ``self.parse_case_detail`` when the reply has 'html'
      {}  when the reply has no 'html' or the request cannot connect
    """
    try:
        captcha_answer = get_recaptcha_answer(self.SITE_KEY, self.SEARCH_URL)

        # the script text is sent verbatim to Splash
        lua_source = ''' function main(splash) assert(splash:autoload("https://code.jquery.com/jquery-2.1.3.min.js")) treat = require("treat") assert(splash:go(splash.args.url)) assert(splash:wait(2)) assert(splash:runjs('$("input[name=FirstName]").val("'..splash.args.first_name..'")')) assert(splash:runjs('$("input[name=LastName]").val("'..splash.args.last_name..'")')) assert(splash:runjs('$("textarea[name=g-recaptcha-response]").val("'..splash.args.captcha_response..'")')) assert(splash:wait(1)) local form = splash:select('.form-horizontal') local values = assert(form:form_values()) assert(form:submit()) assert(splash:wait(5)) local search_input = splash:select('input[type=search]') search_input:send_text(splash.args.dob) assert(splash:wait(3)) local rows = splash:select_all('table#caseList tbody tr') local case_number = '' for j, row in ipairs(rows) do local case_number_element = splash:select('table#caseList tbody tr:nth-child('..j..') a.caseLink') if case_number_element then if case_number_element:text() == splash.args.case_number then case_number = case_number_element:text() local bounds = case_number_element:bounds() assert(case_number_element:mouse_click{x=bounds.width/3, y=bounds.height/3}) assert(splash:wait(5)) end end end return { url = splash:url(), html = splash:html(), case_number = case_number } end '''

        splash_args = {
            'url': self.SEARCH_URL,
            'lua_source': lua_source,
            'first_name': self.FIRST_NAME,
            'last_name': self.LAST_NAME,
            'dob': self.DOB,
            'captcha_response': captcha_answer,
            'case_number': case_number,
        }
        response = self.GLOBAL_SESSION.post(
            SPLASH_URL,
            auth=(SPLASH_USERNAME, SPLASH_PASSWORD),
            json=splash_args)
        print(case_number)

        splash_result = json.loads(response.text)
        if 'html' not in splash_result:
            # no page came back: dump the raw reply and give up
            print(response.text)
            return {}

        page_html = splash_result['html']
        print(page_html)
        print(splash_result['case_number'])
        return self.parse_case_detail(
            BeautifulSoup(page_html, features="html.parser"))
    except requests.ConnectionError as e:
        print("Connection failure : " + str(e))
        print("Verification with InsightFinder credentials Failed")
        return {}