def scrap_document_number(graduate):
    """Scrape the records for ``graduate`` and persist the outcome.

    Asks ``get_proxy`` for a 'sunedu' proxy, opens a driver through it, runs
    ``scrap_and_recognize`` and stores the resulting status on ``graduate``
    plus one ``GraduateRecord`` per scraped row.

    The scrape is attempted once. Any exception is logged and swallowed so
    the caller never sees it; the driver is always closed afterwards.

    Returns:
        None. Returns early (after logging) when no proxy is available.
    """
    logger.info('\tStarting')
    logger.info('\tQuerying {}'.format(graduate.id))

    proxy_server = get_proxy(service='sunedu')
    if not proxy_server:
        logger.info('\tError: Out of proxies!')
        logger.info('\tFinished')
        return

    # Bound up front so the cleanup below can tell "never opened" apart from
    # "opened and must be closed".
    driver = None
    try:
        logger.info('\tUsing proxy {}'.format(proxy_server))
        driver = get_driver(proxy_server)
        result = scrap_and_recognize(driver, graduate)
        if result['id'] != graduate.id:
            result['status'] = 2  # Invalid

        # Update graduate status and create graduate records
        graduate.status = result['status']
        graduate.save()
        for entry in result['records'] or []:
            GraduateRecord.create(
                graduate=graduate,
                name=entry['name'],
                grade=entry['grade'],
                institution=entry['institution'],
            )
        logger.info('\tProcessed')
    except Exception as e:
        # NOTE(review): unlike scrap_claro_lines, a failure here does not set
        # graduate.status to 3 (Error) -- confirm whether that is intended.
        logger.info('\tError: {}'.format(e))
    finally:
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                # Best effort: a failed shutdown must not mask the result.
                logger.info('\tError: could not close driver: {}'.format(e))

    logger.info('\tFinished')
def _solve_sunedu_captcha(image_b64, max_polls=60):
    """Send a base64 captcha image to 2Captcha and return the decoded text.

    Args:
        image_b64: Base64 screenshot of the captcha image.
        max_polls: Maximum number of status checks before giving up.

    Returns:
        The captcha text, or None (after logging the reason) when the image
        is not accepted, 2Captcha reports an error, or no answer arrives
        within ``max_polls`` checks.
    """
    api_key = app.config['CAPTCHA_KEY']
    in_response = requests.post('https://2captcha.com/in.php', params={
        'method': 'base64',
        'key': api_key,
        'body': image_b64,
        'regsense': 1,
        'numeric': 4,
        'min_len': 5,
        'max_len': 5,
    })
    logger.info('\tSolving captcha')
    # A successful submission answers "OK|<id>"; anything else is an error
    # code and must not be used as a captcha id.
    if not in_response.ok or not in_response.text.startswith('OK|'):
        logger.info('\tError: captcha not accepted: {}'.format(
            in_response.text))
        return None
    captcha_id = in_response.text.split('|', 1)[1]

    res_params = {'key': api_key, 'action': 'get', 'id': captcha_id}
    time.sleep(5)
    for _ in range(max_polls):
        res_response = requests.get('https://2captcha.com/res.php',
                                    params=res_params)
        if not res_response.ok:
            logger.info('\tError: captcha status request failed')
            return None
        if res_response.text != 'CAPCHA_NOT_READY':
            break
        logger.info('\tCaptcha status: {}'.format(res_response.text))
        time.sleep(2)
    else:
        logger.info('\tError: captcha not ready after {} checks'.format(
            max_polls))
        return None

    # A solved captcha answers "OK|<text>"; error codes carry no "OK|".
    if not res_response.text.startswith('OK|'):
        logger.info('\tError: captcha not solved: {}'.format(
            res_response.text))
        return None
    return res_response.text.split('|', 1)[1]


def scrap_and_recognize(driver, graduate, max_captcha_attempts=5):
    """Query https://enlinea.sunedu.gob.pe/ for ``graduate.id``.

    Opens the '#modalConstancia' dialog, types ``graduate.id`` into the
    'doc' field, solves the image captcha through 2Captcha and reads up to
    10 rows from the 'finalData' table body.

    Args:
        driver: Selenium driver used for the whole interaction.
        graduate: Object whose ``id`` is the document number to look up.
        max_captcha_attempts: How many captcha rounds to try before failing.

    Returns:
        dict with 'id' (``graduate.id``), 'records' (list of dicts with
        'name', 'grade', 'institution') and 'status' (1, Processed).

    Raises:
        RuntimeError: if no captcha round succeeds within
            ``max_captcha_attempts``.
    """
    driver.set_window_size(1400, 800)
    driver.get('https://enlinea.sunedu.gob.pe/')

    # Click the entry tile and wait for the lookup dialog to become visible
    driver.find_element_by_xpath(
        '//div[contains(@class, "img_publica")]').click()
    modal_selector = '#modalConstancia'
    WebDriverWait(driver, 5).until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, modal_selector)))
    modal = driver.find_element_by_css_selector(modal_selector)
    modal.click()
    modal.find_elements_by_id('doc')[0].send_keys(graduate.id)

    captcha_image_selector = '#consultaForm #captchaImg img'
    for _ in range(max_captcha_attempts):
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, captcha_image_selector)))
        time.sleep(2)
        captcha_image = driver.find_element_by_css_selector(
            captcha_image_selector)

        # Use 2Captcha API to solve it
        captcha = _solve_sunedu_captcha(captcha_image.screenshot_as_base64)
        if captcha is None:
            continue

        # Fill the captcha input and submit. Clearing first keeps a retry
        # from appending to the previous (wrong) answer.
        captcha_input = modal.find_elements_by_id('captcha')[0]
        captcha_input.clear()
        captcha_input.send_keys(captcha)
        modal.find_elements_by_id('buscar')[0].click()
        time.sleep(2)

        error_body = modal.find_element_by_id('frmError_Body')
        if not error_body.is_displayed():
            logger.info('\tCaptcha solved: {}'.format(captcha))
            break
        if error_body.text.strip() == 'No se encontraron resultados.':
            # The "no results" notice is shown in the error box too, but it
            # means the captcha itself was accepted.
            logger.info('\tCaptcha solved: {}'.format(captcha))
            logger.info('\tNo records found')
            break

        logger.info('\tIncorrect captcha: {}'.format(captcha))
        modal.find_element_by_xpath(
            '//button[@id = "closeModalError"]/span').click()
    else:
        raise RuntimeError('Captcha not solved after {} attempts'.format(
            max_captcha_attempts))

    # Search for results
    trows = modal.find_elements_by_xpath('//tbody[@id = "finalData"]/tr')
    records = []
    for trow in trows[:10]:  # Limiting to 10 records
        tds = trow.find_elements_by_tag_name('td')
        records.append({
            'name': tds[0].text,
            'grade': tds[1].text,
            'institution': tds[2].text,
        })

    return {
        'id': graduate.id,
        'records': records,
        'status': 1,  # Processed
    }
def _solve_claro_captcha(image_b64, max_polls=60):
    """Send a base64 captcha image to 2Captcha and return the decoded text.

    Args:
        image_b64: Base64 screenshot of the captcha image.
        max_polls: Maximum number of status checks before giving up.

    Returns:
        The captcha text, or None (after logging the reason) when the image
        is not accepted, 2Captcha reports an error, or no answer arrives
        within ``max_polls`` checks.
    """
    api_key = app.config['CAPTCHA_KEY']
    in_response = requests.post('https://2captcha.com/in.php', params={
        'method': 'base64',
        'key': api_key,
        'body': image_b64,
        'regsense': 1,
        'numeric': 4,
        'min_len': 5,
        'max_len': 5,
    })
    logger.info('\tSolving captcha')
    # A successful submission answers "OK|<id>"; anything else is an error
    # code and must not be used as a captcha id.
    if not in_response.ok or not in_response.text.startswith('OK|'):
        logger.info('\tError: captcha not accepted: {}'.format(
            in_response.text))
        return None
    captcha_id = in_response.text.split('|', 1)[1]

    res_params = {'key': api_key, 'action': 'get', 'id': captcha_id}
    time.sleep(5)
    for _ in range(max_polls):
        res_response = requests.get('https://2captcha.com/res.php',
                                    params=res_params)
        if not res_response.ok:
            logger.info('\tError: captcha status request failed')
            return None
        if res_response.text != 'CAPCHA_NOT_READY':
            break
        logger.info('\tCaptcha status: {}'.format(res_response.text))
        time.sleep(2)
    else:
        logger.info('\tError: captcha not ready after {} checks'.format(
            max_polls))
        return None

    # A solved captcha answers "OK|<text>"; error codes carry no "OK|".
    if not res_response.text.startswith('OK|'):
        logger.info('\tError: captcha not solved: {}'.format(
            res_response.text))
        return None
    return res_response.text.split('|', 1)[1]


def scrap_and_recognize(driver, rrll, max_captcha_attempts=5):
    """Query the Claro line-lookup form for ``rrll.ruc`` / ``rrll.dni``.

    Fills the form at https://aplicaciones.claro.com.pe/ClienteLineasWeb/,
    solves the image captcha through 2Captcha, and reads one record per row
    of the result table unless the page reports that nothing was found.

    Args:
        driver: Selenium driver used for the whole interaction.
        rrll: Object providing ``id``, ``ruc`` and ``dni``.
        max_captcha_attempts: How many captcha rounds to try before failing.

    Returns:
        dict with 'id', 'ruc', 'dni' (copied from ``rrll``), 'records'
        (list of dicts with 'modality' and 'telephone') and 'status' (1).

    Raises:
        RuntimeError: if no captcha round succeeds within
            ``max_captcha_attempts``.
    """
    no_result_messages = (
        'El número de documento del Representante Legal no se encuentran asociado al número de RUC.',
        'No se encontraron resultados para tu búsqueda.',
    )

    driver.set_window_size(1400, 800)
    driver.get('https://aplicaciones.claro.com.pe/ClienteLineasWeb/')
    driver.find_element_by_xpath(
        "//select[@id='iddoc']/option[text()='RUC']").click()
    driver.find_elements_by_id('numdoc')[0].send_keys(rrll.ruc)
    driver.find_element_by_xpath(
        "//select[@id='iddoclegal']/option[text()='DNI']").click()
    driver.find_elements_by_id('numdoclegal')[0].send_keys(rrll.dni)

    result_void = False
    captcha_image_selector = '#token'
    for _ in range(max_captcha_attempts):
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, captcha_image_selector)))
        time.sleep(2)
        captcha_image = driver.find_element_by_css_selector(
            captcha_image_selector)

        # Use 2Captcha API to solve it
        captcha = _solve_claro_captcha(captcha_image.screenshot_as_base64)
        if captcha is None:
            continue

        # Fill the captcha input and submit. Clearing first keeps a retry
        # from appending to the previous (wrong) answer.
        logger.info('\tCaptcha solved: {}'.format(captcha))
        captcha_input = driver.find_elements_by_id('captcha')[0]
        captcha_input.clear()
        captcha_input.send_keys(captcha)
        driver.find_elements_by_class_name('btn-turquesa')[0].click()
        time.sleep(3)

        # find_elements_* returns an empty list when nothing matches, while
        # find_element_* raises; only the former lets us detect a missing
        # result box and try another captcha.
        result_boxes = driver.find_elements_by_xpath(
            '//div[contains(@class, "box-text-result")]')
        if not result_boxes:
            logger.info('\tIncorrect captcha')
            continue

        # NOTE(review): '//p' is evaluated against the whole document, not
        # only inside the result box -- confirm whether './/p' was intended.
        text_p = result_boxes[0].find_element_by_xpath('//p').text.strip()
        if text_p in no_result_messages:
            result_void = True
            logger.info('\tCaptcha solved but no records found')
        else:
            logger.info('\tScraping solved')
        break
    else:
        raise RuntimeError('Captcha not solved after {} attempts'.format(
            max_captcha_attempts))

    records = []
    if not result_void:
        driver.find_element_by_id('showme').click()
        # Search for results
        logger.info('\tClick on Showme.')
        div = driver.find_element_by_xpath(
            '//div[contains(@class, "body-table")]')
        # NOTE(review): '//table/tbody/tr' also matches rows outside `div`.
        trows = div.find_elements_by_xpath('//table/tbody/tr')
        logger.info('\tFinding <tr> on table')
        for trow in trows:
            tds = trow.find_elements_by_tag_name('td')
            records.append({
                'modality': tds[1].text,
                'telephone': tds[0].text,
            })

    return {
        'id': rrll.id,
        'ruc': rrll.ruc,
        'dni': rrll.dni,
        'records': records,
        'status': 1,
    }
def scrap_claro_lines(rrll):
    """Scrape the telephone lines for ``rrll`` and persist the outcome.

    Asks ``get_proxy`` for a 'claro_line' proxy, opens a driver through it,
    runs ``scrap_and_recognize`` and stores the resulting status on ``rrll``
    plus one ``TelephoneLine`` per scraped record.

    The scrape is attempted once. On any exception the error is logged and
    ``rrll.status`` is saved as 3 (Error); the driver is always closed.

    Returns:
        None. Returns early (after logging) when no proxy is available.
    """
    logger.info('\tStarting')
    logger.info('\tQuerying {}'.format(rrll.ruc))

    proxy_server = get_proxy(service='claro_line')
    if not proxy_server:
        logger.info('\tError: Out of proxies!')
        logger.info('\tFinished')
        return

    # Bound up front so the cleanup below can tell "never opened" apart from
    # "opened and must be closed".
    driver = None
    try:
        logger.info('\tUsing proxy {}'.format(proxy_server))
        driver = get_driver(proxy_server)
        v_json = scrap_and_recognize(driver, rrll)
        # The result is invalid as soon as either identifier differs from
        # the one that was requested.
        if v_json['ruc'] != rrll.ruc or v_json['dni'] != rrll.dni:
            v_json['status'] = 2  # Invalid

        # Update rrll status and create telephone line records
        rrll.status = v_json['status']
        rrll.save()
        for line in v_json['records'] or []:
            TelephoneLine.create(
                rrll=rrll,
                modality=line['modality'],
                telephone=line['telephone'],
            )
        logger.info('\tProcessed')
    except Exception as e:
        logger.info('\tError: {}'.format(e))
        rrll.status = 3  # Error
        rrll.save()
    finally:
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                # Best effort: a failed shutdown must not mask the result.
                logger.info('\tError: could not close driver: {}'.format(e))

    logger.info('\tFinished')
def scrap_plate_number(vehicle):
    """Scrape the registry data for ``vehicle`` and persist the outcome.

    Runs ``scrap_and_recognize`` through a 'sunarp' proxy and copies the
    returned fields onto ``vehicle``. Failures are retried up to three
    times, counting each failed attempt exactly once:

    * ``NoSuchElementException`` with a 'MainContent_lblWarning' label
      containing 'número máximo': the proxy is recorded via ``Proxy.create``
      and a new one is requested; without a replacement the function stops.
    * ``NoSuchElementException`` with any other warning label: retry.
    * ``NoSuchElementException`` without a warning label: ``vehicle.status``
      is saved as 3 (Error) and the function stops.
    * ``JavascriptException`` / ``AttributeError``: retry.

    When the retries are exhausted ``vehicle.status`` is saved as 3 (Error).

    Returns:
        None. Returns early (after logging) when no proxy is available.
    """
    max_retries = 3
    # Copied from the scraped record onto the vehicle, in this order.
    record_fields = (
        'plate_number',
        'serial_number',
        'vin_number',
        'engine_number',
        'color',
        'make',
        'model',
        'valid_plate_number',
        'previous_plate_number',
        'state',
        'notes',
        'branch',
        'owners',
        'image_path',
        'status',
    )

    logger.info('\tStarting')
    logger.info('\tQuerying {}'.format(vehicle.id))

    proxy_server = get_proxy(service='sunarp')
    if not proxy_server:
        logger.info('\tError: Out of proxies!')
        logger.info('\tFinished')
        return

    processed = False
    retries = 0
    while not processed:
        # ">=" rather than "==" so the limit cannot be stepped over.
        if retries >= max_retries:
            logger.info('\tGiving up!')
            # Update vehicle status record
            vehicle.status = 3  # Error
            vehicle.save()
            break

        # Reset per attempt so cleanup never touches a previous driver.
        driver = None
        try:
            logger.info('\tUsing proxy {}'.format(proxy_server))
            driver = get_driver(proxy_server)
            record = scrap_and_recognize(driver, vehicle)
            if record['plate_number'] != vehicle.id:
                record['status'] = 2  # Invalid

            # Update vehicle fields
            for field in record_fields:
                setattr(vehicle, field, record[field])
            vehicle.save()
            processed = True
            logger.info('\tProcessed')
        except NoSuchElementException:
            logger.info('\tError: Element not found')
            try:
                label = driver.find_element_by_xpath(
                    '//span[contains(@id, "MainContent_lblWarning")]')
            except NoSuchElementException:
                logger.info('\tError: Scraping problem')
                vehicle.status = 3  # Error
                vehicle.save()
                processed = True
            else:
                if 'número máximo' in label.text:
                    logger.info('\tError: Max queries reached for {}'.format(
                        proxy_server))
                    # Save this invalid proxy in table and ask for another one
                    Proxy.create(service='sunarp', ip=proxy_server)
                    proxy_server = get_proxy(service='sunarp')
                    if not proxy_server:
                        logger.info('\tError: Out of proxies!')
                        processed = True
                if not processed:
                    retries += 1
                    logger.info('\tRetrying...')
        except JavascriptException:
            logger.info('\tError: Javascript')
            logger.info('\tRetrying...')
            retries += 1
        except AttributeError:
            # Probably something wrong with the license image
            logger.info('\tError: Invalid image')
            logger.info('\tRetrying...')
            retries += 1
        finally:
            if driver is not None:
                try:
                    driver.quit()
                except Exception as e:
                    # Best effort: a failed shutdown must not mask the result.
                    logger.info(
                        '\tError: could not close driver: {}'.format(e))

    logger.info('\tFinished')
def scrap_cmp(doctor):
    """Scrape the registry data for ``doctor`` and persist the outcome.

    Asks ``get_proxy`` for a 'cmp' proxy, opens a driver through it, runs
    ``scrap_and_recognize``, copies the returned fields onto ``doctor`` and
    creates one ``DoctorSpecialty`` per scraped specialty.

    The scrape is attempted once. Any exception is logged and swallowed so
    the caller never sees it; the driver is always closed afterwards.

    Returns:
        None. Returns early (after logging) when no proxy is available.
    """
    # Copied from the scraped record onto the doctor, in this order.
    record_fields = (
        'name',
        'surname',
        'state',
        'email',
        'region',
        'notes',
        'image_path',
        'status',
    )

    logger.info('\tStarting')
    logger.info('\tQuerying {}'.format(doctor.id))

    proxy_server = get_proxy(service='cmp')
    if not proxy_server:
        logger.info('\tError: Out of proxies!')
        logger.info('\tFinished')
        return

    # Bound up front so the cleanup below can tell "never opened" apart from
    # "opened and must be closed".
    driver = None
    try:
        logger.info('\tUsing proxy {}'.format(proxy_server))
        driver = get_driver(proxy_server)
        record = scrap_and_recognize(driver, doctor)
        if record['cmp'] != doctor.id:
            record['status'] = 2  # Invalid

        # Update doctor fields and create specialties if applicable
        for field in record_fields:
            setattr(doctor, field, record[field])
        doctor.save()
        for specialty in record['specialties'] or []:
            DoctorSpecialty.create(
                doctor=doctor,
                name=specialty['name'],
                type=specialty['type'],
                code=specialty['code'],
                end_date=specialty['end_date'],
            )
        logger.info('\tProcessed')
    except Exception as e:
        # NOTE(review): unlike scrap_claro_lines, a failure here does not set
        # doctor.status to 3 (Error) -- confirm whether that is intended.
        logger.info('\tError: {}'.format(e))
    finally:
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                # Best effort: a failed shutdown must not mask the result.
                logger.info('\tError: could not close driver: {}'.format(e))

    logger.info('\tFinished')
def _solve_movistar_recaptcha(site_key, page_url, max_polls=90):
    """Ask 2Captcha for a reCAPTCHA token and return it.

    Args:
        site_key: Value sent as 'googlekey'.
        page_url: Value sent as 'pageurl'.
        max_polls: Maximum number of status checks before giving up.

    Returns:
        The token string, or None (after logging the reason) when the
        request is not accepted, 2Captcha reports an error, or no answer
        arrives within ``max_polls`` checks.
    """
    api_key = app.config['CAPTCHA_KEY']
    in_response = requests.post('https://2captcha.com/in.php', params={
        'method': 'userrecaptcha',
        'googlekey': site_key,
        'key': api_key,
        'pageurl': page_url,
        'json': 1,
    })
    logger.info('\tSolving captcha')
    if not in_response.ok:
        logger.info('\tError: captcha submission failed')
        return None
    # With json=1 the reply carries "status" (1 on success) and "request"
    # (the captcha id on success, an error code otherwise).
    submitted = json.loads(in_response.text)
    if submitted.get('status') != 1:
        logger.info('\tError: captcha not accepted: {}'.format(
            submitted.get('request')))
        return None
    id_in = submitted['request']
    logger.info('\tUsing id_in: {}'.format(id_in))

    res_params = {
        'key': api_key,
        'action': 'get',
        'id': id_in,
        'json': 1,
    }
    time.sleep(5)
    for _ in range(max_polls):
        res_response = requests.get('https://2captcha.com/res.php',
                                    params=res_params)
        if not res_response.ok:
            logger.info('\tError: captcha status request failed')
            return None
        answer = json.loads(res_response.text)
        if answer.get('status') == 1:
            return answer['request']
        if answer.get('request') != 'CAPCHA_NOT_READY':
            logger.info('\tError: captcha not solved: {}'.format(
                answer.get('request')))
            return None
        logger.info('\tCaptcha status: {}'.format(answer['request']))
        time.sleep(2)

    logger.info('\tError: captcha not ready after {} checks'.format(
        max_polls))
    return None


def scrap_and_recognize(driver, rrll, max_captcha_attempts=3):
    """Query the Movistar mobile-number form for ``rrll.ruc`` / ``rrll.dni``.

    Fills the form at
    https://www.movistar.com.pe/movil/conoce-tus-numeros-moviles, obtains a
    reCAPTCHA token through 2Captcha, submits, and collects the result rows
    with ``scraping_with_pagination`` (one call per result page).

    Args:
        driver: Selenium driver used for the whole interaction.
        rrll: Object providing ``id``, ``ruc`` and ``dni``.
        max_captcha_attempts: How many captcha rounds to try before failing.

    Returns:
        dict with 'id', 'ruc', 'dni' (copied from ``rrll``), 'records'
        (list filled by ``scraping_with_pagination``) and 'status' (1).

    Raises:
        RuntimeError: if no captcha round succeeds within
            ``max_captcha_attempts``, or if the result area has an
            unexpected number of child divs.
    """
    url_form = 'https://www.movistar.com.pe/movil/conoce-tus-numeros-moviles'
    portlet = '_consultmobilenumbers_WAR_consultmobilenumbersportlet_'

    driver.set_window_size(1400, 800)
    driver.get(url_form)

    # The <select> elements are forced to display:block before an option is
    # clicked.
    select_ruc = driver.find_element_by_css_selector(
        '#' + portlet + 'documentType')
    driver.execute_script("arguments[0].style.display = 'block';", select_ruc)
    driver.find_element_by_xpath(
        "//select[@id='" + portlet + "documentType']/option[text()=' RUC']"
    ).click()
    driver.find_elements_by_id(
        portlet + 'documentNumber')[0].send_keys(rrll.ruc)

    select_dni = driver.find_element_by_css_selector(
        '#' + portlet + 'documentTypeRpstative')
    driver.execute_script("arguments[0].style.display = 'block';", select_dni)
    driver.find_element_by_xpath(
        "//select[@id='" + portlet
        + "documentTypeRpstative']/option[text()=' DNI']"
    ).click()
    driver.find_elements_by_id(
        portlet + 'documentNumberRpstative')[0].send_keys(rrll.dni)

    result_void = False
    for _ in range(max_captcha_attempts):
        # Use 2Captcha API to solve it
        resp_res = _solve_movistar_recaptcha(
            '6LeJaRUUAAAAAA2iQkfdLPkK-vJ2mBqs8j_XA2-t', url_form)
        if resp_res is None:
            continue

        # Fill the captcha response and submit. The token is passed as a
        # script argument instead of being spliced into the JS source.
        logger.info('\tCaptcha solved: {}'.format(resp_res))
        txt_response = driver.find_element_by_css_selector(
            "#g-recaptcha-response")
        driver.execute_script(
            "arguments[0].innerHTML = arguments[1];", txt_response, resp_res)
        time.sleep(1)
        driver.find_element_by_id(portlet + 'btnSubmit').click()
        time.sleep(3)

        # find_elements_* returns an empty list when nothing matches, while
        # find_element_* raises; only the former lets us detect a missing
        # result area and try another captcha.
        result_boxes = driver.find_elements_by_xpath(
            '//div[contains(@class, "content_result")]')
        if not result_boxes:
            logger.info('\tIncorrect captcha')
            continue

        # NOTE(review): '//center' is evaluated against the whole document,
        # not only inside the result area -- confirm that is intended.
        if len(result_boxes[0].find_elements_by_xpath('//center')) == 1:
            result_void = True
            logger.info('\tCaptcha solved but no records found')
        else:
            logger.info('\tScraping solved')
        break
    else:
        raise RuntimeError('Captcha not solved after {} attempts'.format(
            max_captcha_attempts))

    records = []
    if not result_void:
        div_size = len(
            driver.find_elements_by_xpath(
                '//div[contains(@class, "content_result")]/div'))
        logger.info('\tHow many divs "content_result": {}'.format(div_size))
        if div_size == 1:
            scraping_with_pagination(driver, records)
        elif div_size == 2:
            number_of_pages = int(
                driver.find_element_by_xpath(
                    '//div[@id = "tblData_paginate"]/span/a[last()]').text)
            logger.info('\tNro Pages: {}'.format(number_of_pages))
            for j in range(number_of_pages):
                scraping_with_pagination(driver, records)
                # Advance only while there is a following page.
                if j != number_of_pages - 1:
                    logger.info('\tContinue scraping per page')
                    driver.find_element_by_xpath(
                        '//a[@id = "tblData_next"]').click()
                    time.sleep(1)
        else:
            # Fail with a clear message instead of an unbound `records`.
            raise RuntimeError(
                'Unexpected result layout: {} divs in "content_result"'
                .format(div_size))

    return {
        'id': rrll.id,
        'ruc': rrll.ruc,
        'dni': rrll.dni,
        'records': records,
        'status': 1,
    }