예제 #1
0
def scrap_document_number(graduate):
    """Scrape the records of ``graduate`` and persist the outcome.

    Requests a proxy for the ``sunedu`` service, opens a driver through it,
    delegates the scraping to ``scrap_and_recognize`` and stores the returned
    status on ``graduate``. One ``GraduateRecord`` is created per entry in the
    returned ``records`` list.

    Args:
        graduate: Object exposing ``id``, ``status`` and ``save()``.

    Returns:
        None. Results are written through ``graduate.save()`` and
        ``GraduateRecord.create()``. Any exception raised while scraping is
        logged and ends the processing of this graduate.
    """
    max_retries = 3
    logger.info('\tStarting')
    logger.info('\tQuerying {}'.format(graduate.id))
    proxy_server = get_proxy(service='sunedu')
    if not proxy_server:
        logger.info('\tError: Out of proxies!')
        logger.info('\tFinished')
        return

    processed = False
    retries = 0
    while not processed:
        # NOTE: every path through the loop body sets ``processed`` and
        # ``retries`` is never incremented, so this guard does not fire as
        # the code is written today.
        if retries == max_retries:
            logger.info('\tGiving up!')
            # Update graduate status record
            graduate.status = 3  # Error
            graduate.save()
            break

        # Bound before the try so the cleanup below never touches an unbound
        # (or stale) driver when get_driver() itself fails.
        driver = None
        try:
            logger.info('\tUsing proxy {}'.format(proxy_server))
            driver = get_driver(proxy_server)
            result = scrap_and_recognize(driver, graduate)
            if result['id'] != graduate.id:
                result['status'] = 2  # Invalid

            # Update graduate fields and create graduate records
            graduate.status = result['status']
            graduate.save()
            if result['records']:
                # A distinct loop name keeps the result dict intact instead
                # of rebinding it to each row.
                for row in result['records']:
                    GraduateRecord.create(
                        graduate=graduate,
                        name=row['name'],
                        grade=row['grade'],
                        institution=row['institution'],
                    )

            processed = True
            logger.info('\tProcessed')
        except Exception as e:
            logger.info('\tError: {}'.format(e))
            processed = True
        finally:
            if driver is not None:
                try:
                    driver.quit()
                except Exception:
                    # Best-effort cleanup: a failing quit must not mask the
                    # outcome of the scrape.
                    pass

    logger.info('\tFinished')
예제 #2
0
def scrap_and_recognize(driver, graduate):
    """Submit ``graduate.id`` to the enlinea.sunedu.gob.pe form and scrape it.

    Opens the public consultation modal, types the document number, and then
    loops: screenshot the captcha image, send it to the 2Captcha service,
    poll for the answer, submit the form. The loop ends when the page shows
    either results or the "No se encontraron resultados." message; any other
    error message is treated as a wrong captcha and the loop starts over.

    Args:
        driver: Selenium WebDriver used to drive the page.
        graduate: Object exposing ``id`` (typed into the ``doc`` input).

    Returns:
        dict with keys ``id`` (``graduate.id``), ``records`` (list of dicts
        with ``name``, ``grade`` and ``institution``, at most 10) and
        ``status`` (always ``1``, i.e. processed).
    """
    driver.set_window_size(1400, 800)
    driver.get('https://enlinea.sunedu.gob.pe/')

    # Link that opens the public consultation modal
    driver.find_element_by_xpath(
        '//div[contains(@class, "img_publica")]').click()

    modal_selector = '#modalConstancia'
    WebDriverWait(driver, 5).until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, modal_selector)))
    modal = driver.find_element_by_css_selector(modal_selector)
    modal.click()
    modal.find_elements_by_id('doc')[0].send_keys(graduate.id)

    captcha_image_selector = '#consultaForm #captchaImg img'
    solved = False
    while not solved:
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, captcha_image_selector)))
        time.sleep(2)
        captcha_image = driver.find_element_by_css_selector(
            captcha_image_selector)
        body = captcha_image.screenshot_as_base64

        # Use the 2Captcha API to solve it
        # NOTE(review): the payload travels as query-string ``params`` on a
        # POST request; confirm against the 2Captcha API docs whether
        # ``data=`` was intended.
        in_params = {
            'method': 'base64',
            'key': app.config['CAPTCHA_KEY'],
            'body': body,
            'regsense': 1,
            'numeric': 4,
            'min_len': 5,
            'max_len': 5
        }
        in_response = requests.post('https://2captcha.com/in.php',
                                    params=in_params)
        logger.info('\tSolving captcha')
        if not in_response.ok:
            continue

        captcha_id = in_response.text.split('|')[-1]
        res_params = {
            'key': app.config['CAPTCHA_KEY'],
            'action': 'get',
            'id': captcha_id
        }
        time.sleep(5)
        res_response = requests.get('https://2captcha.com/res.php',
                                    params=res_params)
        if not res_response.ok:
            continue

        # Poll until the service stops answering "not ready"
        while res_response.text == 'CAPCHA_NOT_READY':
            logger.info('\tCaptcha status: {}'.format(res_response.text))
            time.sleep(2)
            res_response = requests.get('https://2captcha.com/res.php',
                                        params=res_params)
        captcha = res_response.text.split('|')[-1]

        # Fill the captcha input and submit
        modal.find_elements_by_id('captcha')[0].send_keys(captcha)
        modal.find_elements_by_id('buscar')[0].click()
        time.sleep(2)

        error_body = modal.find_element_by_id('frmError_Body')
        if error_body.is_displayed():
            error_message = error_body.text.strip()
            if error_message == 'No se encontraron resultados.':
                # The captcha was accepted; the document simply has no rows
                solved = True
                logger.info('\tCaptcha solved: {}'.format(captcha))
                logger.info('\tNo records found')
            else:
                logger.info('\tIncorrect captcha: {}'.format(captcha))
            modal.find_element_by_xpath(
                '//button[@id = "closeModalError"]/span').click()
        else:
            solved = True
            logger.info('\tCaptcha solved: {}'.format(captcha))

    # Search for results
    trows = modal.find_elements_by_xpath('//tbody[@id = "finalData"]/tr')
    records = []
    for trow in trows[:10]:  # Limiting to 10 records
        tds = trow.find_elements_by_tag_name('td')
        records.append({
            'name': tds[0].text,
            'grade': tds[1].text,
            'institution': tds[2].text,
        })

    return {
        'id': graduate.id,
        'records': records,
        'status': 1,  # Processed
    }
예제 #3
0
def scrap_and_recognize(driver, rrll):
    """Query the Claro "ClienteLineasWeb" form for ``rrll`` and scrape lines.

    Fills in the RUC and the legal representative's DNI, then loops:
    screenshot the ``#token`` captcha, send it to the 2Captcha service, poll
    for the answer and submit the form. The loop ends once the result box is
    present on the page; if it is missing the captcha is considered wrong and
    a new attempt is made.

    Args:
        driver: Selenium WebDriver used to drive the page.
        rrll: Object exposing ``id``, ``ruc`` and ``dni``.

    Returns:
        dict with keys ``id``, ``ruc``, ``dni`` (copied from ``rrll``),
        ``records`` (list of dicts with ``modality`` and ``telephone``; empty
        when the page reports no results) and ``status`` (always ``1``).
    """
    driver.set_window_size(1400, 800)
    driver.get('https://aplicaciones.claro.com.pe/ClienteLineasWeb/')

    driver.find_element_by_xpath("//select[@id='iddoc']/option[text()='RUC']").click()
    driver.find_elements_by_id('numdoc')[0].send_keys(rrll.ruc)
    driver.find_element_by_xpath("//select[@id='iddoclegal']/option[text()='DNI']").click()
    driver.find_elements_by_id('numdoclegal')[0].send_keys(rrll.dni)

    captcha_image_selector = '#token'
    # Messages shown in the result box when the query itself found nothing
    no_result_messages = (
        'El número de documento del Representante Legal no se encuentran asociado al número de RUC.',
        'No se encontraron resultados para tu búsqueda.',
    )

    solved = False
    result_void = False
    while not solved:
        WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CSS_SELECTOR, captcha_image_selector)))
        time.sleep(2)
        captcha_image = driver.find_element_by_css_selector(captcha_image_selector)
        body = captcha_image.screenshot_as_base64

        # Use the 2Captcha API to solve it
        in_params = {
            'method': 'base64',
            'key': app.config['CAPTCHA_KEY'],
            'body': body,
            'regsense': 1,
            'numeric': 4,
            'min_len': 5,
            'max_len': 5
        }
        in_response = requests.post('https://2captcha.com/in.php', params=in_params)
        logger.info('\tSolving captcha')
        if not in_response.ok:
            continue

        captcha_id = in_response.text.split('|')[-1]
        res_params = {
            'key': app.config['CAPTCHA_KEY'],
            'action': 'get',
            'id': captcha_id
        }
        time.sleep(5)
        res_response = requests.get('https://2captcha.com/res.php', params=res_params)
        if not res_response.ok:
            continue

        # Poll until the service stops answering "not ready"
        while res_response.text == 'CAPCHA_NOT_READY':
            logger.info('\tCaptcha status: {}'.format(res_response.text))
            time.sleep(2)
            res_response = requests.get('https://2captcha.com/res.php', params=res_params)
        captcha = res_response.text.split('|')[-1]

        # Fill the captcha input and submit
        logger.info('\tCaptcha solved: {}'.format(captcha))
        captcha_input = driver.find_elements_by_id('captcha')[0]
        # send_keys appends to the current value, so clear the input first to
        # keep a retry from concatenating two answers.
        captcha_input.clear()
        captcha_input.send_keys(captcha)
        driver.find_elements_by_class_name('btn-turquesa')[0].click()

        time.sleep(3)
        # find_element_* raises instead of returning None, which made the
        # "Incorrect captcha" branch unreachable; find_elements_* returns an
        # empty list when the result box is absent.
        result_boxes = driver.find_elements_by_xpath('//div[contains(@class, "box-text-result")]')
        if not result_boxes:
            logger.info('\tIncorrect captcha')
            continue

        # NOTE(review): an XPath starting with "//" is evaluated against the
        # whole document, not only this element; confirm whether ".//p" was
        # intended.
        text_p = result_boxes[0].find_element_by_xpath('//p').text.strip()
        solved = True
        if text_p in no_result_messages:
            result_void = True
            logger.info('\tCaptcha solved but no records found')
        else:
            logger.info('\tScraping solved')

    records = []
    if not result_void:
        driver.find_element_by_id('showme').click()
        # Search for results
        logger.info('\tClick on Showme.')
        div = driver.find_element_by_xpath('//div[contains(@class, "body-table")]')
        # NOTE(review): same document-wide "//" caveat as above.
        trows = div.find_elements_by_xpath('//table/tbody/tr')
        logger.info('\tFinding <tr> on table')
        for trow in trows:
            tds = trow.find_elements_by_tag_name('td')
            records.append({
                'modality': tds[1].text,
                'telephone': tds[0].text
            })

    return {
        'id': rrll.id,
        'ruc': rrll.ruc,
        'dni': rrll.dni,
        'records': records,
        'status': 1
    }
예제 #4
0
def scrap_claro_lines(rrll):
    """Scrape the telephone lines of ``rrll`` and persist the outcome.

    Requests a proxy for the ``claro_line`` service, opens a driver through
    it, delegates the scraping to ``scrap_and_recognize`` and stores the
    returned status on ``rrll``. One ``TelephoneLine`` is created per entry
    in the returned ``records`` list.

    Args:
        rrll: Object exposing ``ruc``, ``dni``, ``status`` and ``save()``.

    Returns:
        None. Results are written through ``rrll.save()`` and
        ``TelephoneLine.create()``. Any exception raised while scraping is
        logged and marks ``rrll`` with status ``3`` (error).
    """
    max_retries = 3
    logger.info('\tStarting')
    logger.info('\tQuerying {}'.format(rrll.ruc))
    proxy_server = get_proxy(service='claro_line')
    if not proxy_server:
        logger.info('\tError: Out of proxies!')
        logger.info('\tFinished')
        return

    processed = False
    retries = 0
    while not processed:
        # NOTE: every path through the loop body sets ``processed`` and
        # ``retries`` is never incremented, so this guard does not fire as
        # the code is written today.
        if retries == max_retries:
            logger.info('\tGiving up!')
            # Update rrll status record
            rrll.status = 3  # Error
            rrll.save()
            break

        # Bound before the try so the cleanup below never touches an unbound
        # (or stale) driver when get_driver() itself fails.
        driver = None
        try:
            logger.info('\tUsing proxy {}'.format(proxy_server))
            driver = get_driver(proxy_server)
            v_json = scrap_and_recognize(driver, rrll)
            # A mismatch in either identifier makes the record invalid; the
            # previous ``and`` required both to differ.
            if v_json['ruc'] != rrll.ruc or v_json['dni'] != rrll.dni:
                v_json['status'] = 2  # Invalid

            # Update rrll fields and create rrll records
            rrll.status = v_json['status']
            rrll.save()
            if v_json['records']:
                for record in v_json['records']:
                    TelephoneLine.create(
                        rrll=rrll,
                        modality=record['modality'],
                        telephone=record['telephone'],
                    )

            processed = True
            logger.info('\tProcessed')
        except Exception as e:
            logger.info('\tError: {}'.format(e))
            rrll.status = 3  # Error
            rrll.save()
            processed = True
        finally:
            if driver is not None:
                try:
                    driver.quit()
                except Exception:
                    # Best-effort cleanup: a failing quit must not mask the
                    # outcome of the scrape.
                    pass

    logger.info('\tFinished')
예제 #5
0
def scrap_plate_number(vehicle):
    """Scrape the data of ``vehicle`` and persist it, retrying on failure.

    Requests a proxy for the ``sunarp`` service, opens a driver through it
    and delegates the scraping to ``scrap_and_recognize``; the returned
    fields are copied onto ``vehicle`` and saved.

    Failure handling, per attempt:

    * ``NoSuchElementException``: if the page shows a warning label whose
      text contains "número máximo", the current proxy is recorded through
      ``Proxy.create`` and a new one is requested (processing stops when no
      proxy is left). If the warning label itself is missing, the vehicle is
      marked with status ``3`` (error) and processing stops. Otherwise the
      attempt is retried.
    * ``JavascriptException`` / ``AttributeError``: the attempt is retried.

    After ``max_retries`` retries the vehicle is marked with status ``3``.

    Args:
        vehicle: Object exposing ``id``, ``save()`` and the attributes
            assigned below.

    Returns:
        None. Results are written through ``vehicle.save()``.
    """
    max_retries = 3
    logger.info('\tStarting')
    logger.info('\tQuerying {}'.format(vehicle.id))
    proxy_server = get_proxy(service='sunarp')
    if not proxy_server:
        logger.info('\tError: Out of proxies!')
        logger.info('\tFinished')
        return

    processed = False
    retries = 0
    while not processed:
        # ``>=`` rather than ``==`` so the loop still terminates should the
        # counter ever move by more than one.
        if retries >= max_retries:
            logger.info('\tGiving up!')
            # Update vehicle status record
            vehicle.status = 3  # Error
            vehicle.save()
            break

        # Bound before the try so the cleanup below never touches an unbound
        # driver, nor re-quits the one left over from the previous attempt.
        driver = None
        try:
            logger.info('\tUsing proxy {}'.format(proxy_server))
            driver = get_driver(proxy_server)
            record = scrap_and_recognize(driver, vehicle)
            if record['plate_number'] != vehicle.id:
                record['status'] = 2  # Invalid

            # Update vehicle fields
            vehicle.plate_number = record['plate_number']
            vehicle.serial_number = record['serial_number']
            vehicle.vin_number = record['vin_number']
            vehicle.engine_number = record['engine_number']
            vehicle.color = record['color']
            vehicle.make = record['make']
            vehicle.model = record['model']
            vehicle.valid_plate_number = record['valid_plate_number']
            vehicle.previous_plate_number = record['previous_plate_number']
            vehicle.state = record['state']
            vehicle.notes = record['notes']
            vehicle.branch = record['branch']
            vehicle.owners = record['owners']
            vehicle.image_path = record['image_path']
            vehicle.status = record['status']
            vehicle.save()

            processed = True
            logger.info('\tProcessed')
        except NoSuchElementException:
            logger.info('\tError: Element not found')
            try:
                label = driver.find_element_by_xpath(
                    '//span[contains(@id, "MainContent_lblWarning")]')
            except NoSuchElementException:
                logger.info('\tError: Scraping problem')
                vehicle.status = 3  # Error
                vehicle.save()
                processed = True
            else:
                if 'número máximo' in label.text:
                    logger.info('\tError: Max queries reached for {}'.format(
                        proxy_server))
                    # Save this invalid proxy in table and ask for another one
                    Proxy.create(service='sunarp', ip=proxy_server)
                    proxy_server = get_proxy(service='sunarp')
                    if not proxy_server:
                        logger.info('\tError: Out of proxies!')
                        processed = True
                # Count the retry exactly once. Previously the proxy-rotation
                # path incremented twice, which could step over
                # ``max_retries``, and "Retrying..." was logged even when no
                # proxy was left.
                if not processed:
                    retries += 1
                    logger.info('\tRetrying...')
        except JavascriptException:
            logger.info('\tError: Javascript')
            logger.info('\tRetrying...')
            retries += 1
        except AttributeError:
            # Probably something wrong with the license image
            logger.info('\tError: Invalid image')
            logger.info('\tRetrying...')
            retries += 1
        finally:
            if driver is not None:
                try:
                    driver.quit()
                except Exception:
                    # Best-effort cleanup: a failing quit must not mask the
                    # outcome of the attempt.
                    pass

    logger.info('\tFinished')
예제 #6
0
def scrap_cmp(doctor):
    """Scrape the data of ``doctor`` and persist the outcome.

    Requests a proxy for the ``cmp`` service, opens a driver through it,
    delegates the scraping to ``scrap_and_recognize`` and copies the returned
    fields onto ``doctor``. One ``DoctorSpecialty`` is created per entry in
    the returned ``specialties`` list.

    Args:
        doctor: Object exposing ``id``, ``save()`` and the attributes
            assigned below.

    Returns:
        None. Results are written through ``doctor.save()`` and
        ``DoctorSpecialty.create()``. Any exception raised while scraping is
        logged and ends the processing of this doctor.
    """
    max_retries = 3
    logger.info('\tStarting')
    logger.info('\tQuerying {}'.format(doctor.id))
    proxy_server = get_proxy(service='cmp')
    if not proxy_server:
        logger.info('\tError: Out of proxies!')
        logger.info('\tFinished')
        return

    processed = False
    retries = 0
    while not processed:
        # NOTE: every path through the loop body sets ``processed`` and
        # ``retries`` is never incremented, so this guard does not fire as
        # the code is written today.
        if retries == max_retries:
            logger.info('\tGiving up!')
            # Update doctor status record
            doctor.status = 3  # Error
            doctor.save()
            break

        # Bound before the try so the cleanup below never touches an unbound
        # (or stale) driver when get_driver() itself fails.
        driver = None
        try:
            logger.info('\tUsing proxy {}'.format(proxy_server))
            driver = get_driver(proxy_server)
            record = scrap_and_recognize(driver, doctor)
            if record['cmp'] != doctor.id:
                record['status'] = 2  # Invalid

            # Update doctor fields and create specialties if applicable
            doctor.name = record['name']
            doctor.surname = record['surname']
            doctor.state = record['state']
            doctor.email = record['email']
            doctor.region = record['region']
            doctor.notes = record['notes']
            doctor.image_path = record['image_path']
            doctor.status = record['status']
            doctor.save()
            if record['specialties']:
                for specialty in record['specialties']:
                    DoctorSpecialty.create(
                        doctor=doctor,
                        name=specialty['name'],
                        type=specialty['type'],
                        code=specialty['code'],
                        end_date=specialty['end_date'],
                    )

            processed = True
            logger.info('\tProcessed')
        except Exception as e:
            logger.info('\tError: {}'.format(e))
            processed = True
        finally:
            if driver is not None:
                try:
                    driver.quit()
                except Exception:
                    # Best-effort cleanup: a failing quit must not mask the
                    # outcome of the scrape.
                    pass

    logger.info('\tFinished')
예제 #7
0
def scrap_and_recognize(driver, rrll):
    """Query the Movistar "conoce tus números móviles" form for ``rrll``.

    Selects RUC / DNI as document types, types ``rrll.ruc`` and ``rrll.dni``,
    then loops: request a reCAPTCHA token from the 2Captcha service, poll for
    it, inject it into ``#g-recaptcha-response`` and submit. The loop ends
    once the ``content_result`` container is present; if it is missing the
    captcha is considered wrong and a new token is requested.

    Args:
        driver: Selenium WebDriver used to drive the page.
        rrll: Object exposing ``id``, ``ruc`` and ``dni``.

    Returns:
        dict with keys ``id``, ``ruc``, ``dni`` (copied from ``rrll``),
        ``records`` (list filled by ``scraping_with_pagination``; empty when
        nothing was found) and ``status`` (always ``1``).
    """
    url_form = 'https://www.movistar.com.pe/movil/conoce-tus-numeros-moviles'
    driver.set_window_size(1400, 800)
    driver.get(url_form)

    # The selects are forced visible before an option is clicked
    select_ruc = driver.find_element_by_css_selector(
        "#_consultmobilenumbers_WAR_consultmobilenumbersportlet_documentType")
    driver.execute_script("arguments[0].style.display = 'block';", select_ruc)
    driver.find_element_by_xpath(
        "//select[@id='_consultmobilenumbers_WAR_consultmobilenumbersportlet_documentType']/option[text()=' RUC']"
    ).click()
    driver.find_elements_by_id(
        '_consultmobilenumbers_WAR_consultmobilenumbersportlet_documentNumber'
    )[0].send_keys(rrll.ruc)
    select_dni = driver.find_element_by_css_selector(
        "#_consultmobilenumbers_WAR_consultmobilenumbersportlet_documentTypeRpstative"
    )
    driver.execute_script("arguments[0].style.display = 'block';", select_dni)
    driver.find_element_by_xpath(
        "//select[@id='_consultmobilenumbers_WAR_consultmobilenumbersportlet_documentTypeRpstative']/option[text()=' DNI']"
    ).click()
    driver.find_elements_by_id(
        '_consultmobilenumbers_WAR_consultmobilenumbersportlet_documentNumberRpstative'
    )[0].send_keys(rrll.dni)

    solved = False
    result_void = False
    while not solved:
        # Use the 2Captcha API to solve it
        in_params = {
            'method': 'userrecaptcha',
            'googlekey': '6LeJaRUUAAAAAA2iQkfdLPkK-vJ2mBqs8j_XA2-t',
            'key': app.config['CAPTCHA_KEY'],
            'pageurl': url_form,
            'json': 1
        }
        in_response = requests.post('https://2captcha.com/in.php',
                                    params=in_params)
        logger.info('\tSolving captcha')
        if not in_response.ok:
            continue

        id_in = json.loads(in_response.text)["request"]
        logger.info('\tUsing id_in: {}'.format(id_in))
        res_params = {
            'key': app.config['CAPTCHA_KEY'],
            'action': 'get',
            'id': id_in,
            'json': 1
        }
        time.sleep(5)
        res_response = requests.get('https://2captcha.com/res.php',
                                    params=res_params)
        if not res_response.ok:
            continue

        # Poll until the service stops answering "not ready"; each response
        # is parsed once.
        resp_res = json.loads(res_response.text)["request"]
        while resp_res == 'CAPCHA_NOT_READY':
            logger.info('\tCaptcha status: {}'.format(resp_res))
            time.sleep(2)
            res_response = requests.get('https://2captcha.com/res.php',
                                        params=res_params)
            resp_res = json.loads(res_response.text)["request"]

        # Fill the captcha input and submit
        logger.info('\tCaptcha solved: {}'.format(resp_res))
        txt_response = driver.find_element_by_css_selector(
            "#g-recaptcha-response")
        # Hand the token over as a script argument instead of splicing it
        # into the JavaScript source, so quotes in it cannot break the script.
        driver.execute_script(
            "arguments[0].innerHTML = arguments[1];", txt_response, resp_res)
        time.sleep(1)
        driver.find_element_by_id(
            '_consultmobilenumbers_WAR_consultmobilenumbersportlet_btnSubmit'
        ).click()

        time.sleep(3)
        # find_element_* raises instead of returning None, which made the
        # "Incorrect captcha" branch unreachable; find_elements_* returns an
        # empty list when the container is absent.
        result_containers = driver.find_elements_by_xpath(
            '//div[contains(@class, "content_result")]')
        if not result_containers:
            logger.info('\tIncorrect captcha')
            continue

        solved = True
        # NOTE(review): an XPath starting with "//" is evaluated against the
        # whole document, not only this element; confirm whether
        # ".//center" was intended.
        if len(result_containers[0].find_elements_by_xpath('//center')) == 1:
            result_void = True
            logger.info('\tCaptcha solved but no records found')
        else:
            logger.info('\tScraping solved')

    # Always bound, so an unexpected page layout yields an empty list
    # instead of an UnboundLocalError at the return below.
    records = []
    if not result_void:
        div_size = len(
            driver.find_elements_by_xpath(
                '//div[contains(@class, "content_result")]/div'))
        logger.info('\tHow many divs "content_result": {}'.format(div_size))
        if div_size == 1:
            # presumably appends the rows of the current page to ``records``
            # in place; verify against scraping_with_pagination
            scraping_with_pagination(driver, records)
        elif div_size == 2:
            number_of_pages = int(
                driver.find_element_by_xpath(
                    '//div[@id = "tblData_paginate"]/span/a[last()]').text)
            logger.info('\tNro Pages: {}'.format(number_of_pages))

            last_page = number_of_pages - 1
            for page_index in range(number_of_pages):
                scraping_with_pagination(driver, records)
                # Only advance while there is a next page; the previous
                # comparison against the ``range`` object was always true.
                if page_index != last_page:
                    logger.info('\tContinue scraping per page')
                    driver.find_element_by_xpath(
                        '//a[@id = "tblData_next"]').click()
                    time.sleep(1)

    return {
        'id': rrll.id,
        'ruc': rrll.ruc,
        'dni': rrll.dni,
        'records': records,
        'status': 1
    }