Example #1
from selenium.webdriver import PhantomJS


class Client:
    def __init__(self, ig_id):
        self.b = PhantomJS()
        self.ig_id = ig_id
        self.b.get('https://instagram.com/%s' % ig_id)

    def close(self):
        self.b.close()

    def get_media(self) -> list:
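        # Read Instagram's window._sharedData blob for the currently loaded post
        # and return its media entries, keeping only those whose accessibility
        # caption mentions a person/people.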
        js = self.b.execute_script('return window._sharedData;')
        ed = js['entry_data']
        pp = ed['PostPage'][0]
        g = pp['graphql']
        sc = g['shortcode_media']
        if sc['__typename'] == 'GraphSidecar':
            edges = sc['edge_sidecar_to_children']['edges']
            medias = list(
                map(
                    lambda x: {
                        'id': x['node']['id'],
                        'url': x['node']['display_url'],
                        'caption': x['node']['accessibility_caption']
                    }, edges))
        elif sc['__typename'] == 'GraphImage':
            medias = [{
                'id': sc['id'],
                'url': sc['display_url'],
                'caption': sc['accessibility_caption']
            }]
        else:
            medias = []  # e.g. a video post: nothing to collect here
        return list(
            filter(
                lambda x: x['caption'] and
                ('person' in x['caption'] or 'people' in x['caption']),
                medias))

    def get_user(self) -> dict:
        js = self.b.execute_script('return window._sharedData;')
        ed = js['entry_data']
        pp = ed['ProfilePage'][0]
        g = pp['graphql']
        return g['user']

    def get_posts(self) -> set:
        ps = self.b.find_elements_by_css_selector('a[href^="/p/"]')
        return set(map(lambda x: x.get_attribute('href'), ps))

    def scroll(self):
        self.b.execute_script('window.scroll(0, document.body.scrollHeight);')
Example #2
import json
import os
import subprocess
from random import random
from tempfile import mkdtemp, mkstemp

from PIL import Image
from selenium.webdriver import PhantomJS


def generate_image(structure):
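    # Render okc.html in PhantomJS, inject `structure` via the page's setText()
    # helper, optionally hide or uncheck the form, and screenshot the result.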
    image_path = os.path.join(mkdtemp(), 'okc.png')
    html_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'okc.html',
    )
    url = 'file://{}'.format(html_path)
    driver = PhantomJS(service_log_path=mkstemp()[1])
    driver.set_window_size(2000, 500)
    driver.get(url)
    driver.execute_script('setText({});'.format(json.dumps(structure)))

    if random() > 0.4:
        driver.execute_script('hideForm();')
    elif random() > 0.5:
        driver.execute_script('uncheckForm();')

    driver.set_window_size(*driver.execute_script('return getSize();'))
    driver.save_screenshot(image_path)

    # twitter's gonna make our beautiful screenshot a jpeg unless we make it
    # think that we're using transparency for a reason, so,,
    img = Image.open(image_path)
    origin = img.getpixel((0, 0))
    new_origin = origin[:3] + (254,)
    img.putpixel((0, 0), new_origin)
    img.save(image_path)

    subprocess.check_call(['optipng', '-quiet', image_path])

    return image_path
Example #3
class gmail(Thread):
    def __init__(self, account):
        name = account['name']

        super().__init__(name=name)  # Thread __init__

        lg.warning('{0[name]}, proxy: {0[Proxy]}'.format(account))

        self.account = account
        self.solved = 0

        if 0:  # Getting cookies snippet
            print(self.driver.get_cookies())
            cookies = {
                _['name']: _['value']
                for _ in self.driver.get_cookies()
            }
            with open('cookies.json', 'w') as f:
                dump(cookies, f, indent=4)

    def verify(self, el):
        '''Verifies the account. May be nontrivial :('''

        text = el.text  # get_attribute('value')
        lg.info('Text: {}'.format(text))
        if text == "Verify it's you":
            lg.debug('Verify')
            #el=self.driver.find_element_by_id('identifierNext')
            el = self.driver.find_element_by_xpath(
                '//div[.="Confirm your recovery email"]')
            print(el)
            el.click()
            el = WebDriverWait(self.driver, 3).until(
                EC.visibility_of_element_located(
                    (By.NAME, 'knowledgePreregisteredEmailResponse')))
            el.send_keys(self.account['Recovery Email'])  # recovery email (key name assumed; original referenced an undefined `account[2]`)

    def login(self):
        if 0:  # to test
            #'https://www.whoishostingthis.com/tools/user-agent/'
            self.driver.get('about:about')
            sleep(1000)
        #self.driver.get('https://mail.google.com')
        self.driver.get(
            'https://accounts.google.com/signin/v2/identifier?continue=https%3A%2F%2Fmail.google.com%2Fmail%2F&service=mail&sacu=1&rip=1&flowName=GlifWebSignIn&flowEntry=ServiceLogin'
        )
        prefilled = False

        lg.debug('Logging in with {}'.format(self.account))
        try:
            el = WebDriverWait(self.driver, 2).until(
                EC.visibility_of_element_located((By.ID, 'identifierId')))
        except TimeoutException:
            prefilled = True

        if prefilled:
            lg.info('Username prefilled already')
        else:
            lg.debug('Entering username')
            el.send_keys(self.account['name'])  # username
            nxt = self.driver.find_element_by_id('identifierNext')
            nxt.click()

        logged_in = False
        try:
            el = WebDriverWait(self.driver, 20).until(
                EC.visibility_of_element_located((By.NAME, 'password')))
        except TimeoutException:  # We're logged in?
            # TODO: Check for something visible after being logged in
            # Because we may genuinely be in timeout
            logged_in = True

        if logged_in:
            lg.info('Logged in already')
        else:
            lg.debug('Entering password')
            el.send_keys(self.account['Second Password'])
            nxt = WebDriverWait(self.driver, 5).until(
                EC.element_to_be_clickable((By.ID, 'passwordNext')))
            nxt.click()

            # WebDriverWait(self.driver, 60).until(
            #     EC.frame_to_be_available_and_switch_to_it((By.ID, 'tab1_1'))
            # )

            try:
                el = WebDriverWait(self.driver, 3).until(
                    EC.visibility_of_element_located((By.ID, 'headingText')))
                #open('1.html','w').write(self.driver.page_source)
                self.verify(el)
            except TimeoutException:  # We're in
                pass

    def screenshot(self, name):
        self.driver.save_screenshot('{}/{}-{}.png'.format(
            getcwd(), self.account['name'], name))

    def solve(self):
        '''Solve the captcha one time'''
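        # Switch into the reCAPTCHA widget iframe, click the checkbox, then poll
        # the anchor's aria-checked attribute; on timeout inspect the challenge
        # iframe (a real captcha means we should quit), otherwise submit the form.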
        WebDriverWait(self.driver, 30).until(
            EC.frame_to_be_available_and_switch_to_it(
                (By.XPATH, '//iframe[@title="recaptcha widget"]')))

        el = WebDriverWait(self.driver, 20).until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, 'div.recaptcha-checkbox-checkmark')))
        # lg.info(el)
        el.click()

        lg.debug('Clicked solve box')

        def check_style(driver, el):
            '''Now need to see what happened there. Check an attribute to see if we're successful.'''
            attr = el.get_attribute('aria-checked')
            lg.debug(attr)
            return attr == 'true'

        lg.debug('Before check_style')
        timeout = False
        try:
            WebDriverWait(self.driver, 20).until(lambda driver: check_style(
                driver, self.driver.find_element_by_id('recaptcha-anchor')))
        except TimeoutException:
            timeout = True  # Next (very soon) we'll see what happened

        lg.debug('Final: ' + self.driver.find_element_by_id(
            'recaptcha-anchor').get_attribute('aria-checked'))

        self.driver.switch_to.default_content()
        if timeout:
            lg.warning('Timeout')
            self.screenshot('timeout')
            el = self.driver.find_element_by_xpath(
                '//iframe[@title="recaptcha challenge"]')
            #set_trace()
            self.driver.switch_to.frame(el)
            l = len(self.driver.page_source)
            lg.debug(l)
            with open('recaptcha_main.html', 'w') as f:
                f.write(self.driver.page_source)
            if l > 10000:
                lg.warning('Captcha')
                self.screenshot('captcha')
                return True  # Need to quit
            self.driver.switch_to.default_content()
            self.driver.refresh()
        else:
            el = self.driver.find_element_by_id('submit')
            el.click()  # Submit button
            lg.info('Clicked submit')

            lg.debug('Before staleness')
            WebDriverWait(self.driver, 10,
                          poll_frequency=0.1).until(EC.staleness_of(el))
            lg.debug('After staleness')

    def create_driver(self):
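        # Toggle: the `if 1:` branch builds a Firefox driver using the per-account
        # profile; the else branch builds PhantomJS routed through the account's proxy.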
        if 1:
            caps = DesiredCapabilities().FIREFOX.copy()

            profile_path = path.expanduser(
                '~') + '/.mozilla/firefox/' + self.account['name']

            # caps['proxy'] = {
            caps['moz:firefoxOptions'] = {
                "args": ["-profile", profile_path],  # geckodriver 0.18+
            }

            profile = FirefoxProfile(profile_path)
            #profile.set_preference("general.useragent.override", 'Mozilla/5.0 (X11; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0')

            self.driver = Firefox(profile, capabilities=caps)
            #self.driver = Firefox(profile)
        else:  # PhantomJS
            # https://github.com/detro/ghostdriver
            caps = DesiredCapabilities().PHANTOMJS
            caps["phantomjs.page.settings.userAgent"] = \
                'Mozilla/5.0 (X11; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0'
            service_args = [
                '--proxy={}'.format(':'.join(
                    self.account['Proxy'].split(':')[:2])),
                '--proxy-type=http',
            ]
            print(service_args)
            self.driver = PhantomJS(service_args=service_args,
                                    capabilities=caps)
            self.driver.set_window_size(1120, 550)
            #profile.set_preference("general.useragent.override","Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16")
        #profile.set_preference("general.useragent.override","Mozilla/5.0 (X11; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0")
        # profile.set_preference("browser.startup.homepage_override.mstone", "ignore");
        # profile.set_preference("startup.homepage_welcome_url.additional",  "about:blank");
        # profile.set_preference("xpinstall.signatures.required", "false");
        # profile.set_preference("toolkit.telemetry.reportingpolicy.firstRun", "false");

    def run(self):
        '''Login and run in cycle'''

        self.create_driver()

        try:
            self.login()

            tosleep = datetime.combine(
                date.today(),
                dt_time(drophour, 0, 5, tzinfo=timezone.utc)
            ) - datetime.now(timezone.utc)
            tosleep = tosleep.seconds
            lg.info('Sleeping for {}'.format(tosleep))
            if '/pooh/' in path.expanduser('~'):
                tosleep = 0  # don't sleep on developer's host
            if not debug: sleep(tosleep)

            # Creating new window to work in (otherwise sometimes the page will ask whether we're ok to leave it)
            self.driver.execute_script(
                '''window.open('{}',"_blank");'''.format(solve_url))
            self.driver.switch_to.window(self.driver.window_handles[-1])
            lg.debug('Created new window')

            # Cycle here getting tokens until there are no more nocaptcha
            start_time = end_time = time()  # In case we have exception
            while True:
                #for i in range(1):
                if self.solve(): break
                self.solved += 1
            end_time = time()
        except:
            lg.exception('In run')
            self.screenshot('exception')
        finally:
            lg.warning('Closing driver')
            with suppress(WebDriverException):
                self.driver.quit()
        rate = (end_time - start_time) / self.solved if self.solved else 0
        lg.warning('Solved: {} ({:.2f})'.format(self.solved, rate))
Example #4
class CNStock(SentimentCrawler):
    def __init__(self):
        super().__init__(init=False)
        self.driver = PhantomJS()
        self.driver.maximize_window()
        self.wait = WebDriverWait(self.driver, 15)
        self.url = 'http://www.cnstock.com/'
        self.name = '中国证券网'

    def crawl_main_page(self, keyword):
        self.driver.set_page_load_timeout(10)
        try:
            self.driver.get(self.url)
        except TimeoutException:
            self.driver.execute_script('window.stop();')

        try:
            self.wait.until(
                ec.presence_of_element_located((By.ID, 'nav_keywords')))
        except TimeoutException:
            CustomLogging.log_to_file('中国证券网打开失败', LogType.ERROR)

        self.driver.find_element_by_id('nav_keywords').clear()
        self.driver.find_element_by_id('nav_keywords').send_keys(keyword +
                                                                 Keys.ENTER)

        return self.crawl_search_results()

    def crawl_search_results(self):
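        # Walk the search-result pages: parse each result's date/title/summary,
        # skip items outside the date range or already seen, download matches in
        # background threads, and follow the "下一页" (next page) link until done.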
        search_results = []
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.driver.maximize_window()

        exit_flag = 0
        while True:
            try:
                self.wait.until(
                    ec.presence_of_element_located(
                        (By.CLASS_NAME, 'result-cont')))
            except TimeoutException:
                CustomLogging.log_to_file('中国证券网搜索结果页错误', LogType.ERROR)
                break

            try:
                result_articles = self.driver.find_elements_by_class_name(
                    'result-article')

                for each_article in result_articles:
                    item = Entity()

                    publish_date = each_article.find_element_by_class_name(
                        'g').text
                    item.publish_date = re.search(
                        re.compile(
                            r'[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d'
                        ), publish_date).group()

                    if not in_date_range(
                            conv_pub_date(item.publish_date, 'cnstock'),
                            self.year_range):
                        exit_flag = 1
                        # 跳出for循环
                        break
                    item.short_description = each_article.find_element_by_class_name(
                        'des').text
                    item.title = each_article.find_element_by_tag_name(
                        'a').text
                    if self.keyword not in item.short_description and self.keyword not in item.title:
                        continue

                    if item.title in self.titles:
                        continue
                    else:
                        self.titles.append(item.title)

                    item.url = each_article.find_element_by_tag_name(
                        'a').get_attribute('href')
                    threading.Thread(target=super().download_and_save_item,
                                     args=(item, )).start()

                if exit_flag == 1:
                    break
            except NoSuchElementException:
                CustomLogging.log_to_file('没有搜索结果', LogType.INFO)
                break

            try:
                next_page = self.driver.find_element_by_xpath(
                    '//div[@class="pagination pagination-centered"]//a[contains(text(), "下一页")]'
                )
                self.driver.get(next_page.get_attribute('href'))
                # next_page.click()
            except NoSuchElementException:
                break

        return search_results

    def parse_html(self, url, html):
        bs = BeautifulSoup(html, 'lxml')
        try:
            full_content = bs.find('div', attrs={'id': 'qmt_content_div'}).text
            return full_content
        except Exception:
            CustomLogging.log_to_file('页面解析错误: {0}|{1}'.format(self.name, url),
                                      LogType.ERROR)
            pass
Example #5
from selenium.webdriver import PhantomJS

driver = PhantomJS(
    executable_path=
    r'E:\Documents\Apps\phantomjs-2.1.1-windows\bin\phantomjs.exe')
url = 'http://cxwh.kexing100.com:82/?app_act=detail&id=328&from=groupmessage'
driver.get(url)
while True:
    driver.refresh()
    print(driver.find_element_by_xpath("//div[@class='xinfo']").text)
    # driver.execute_script("return localStorage.setItem('toupiao','0')")
    driver.execute_script("return localStorage.removeItem('toupiao')")
    driver.delete_all_cookies()
    driver.refresh()
    driver.find_element_by_xpath("//span/input[@class='btn1']").click()
    # break
Example #6
    def get_applications_in_page(self, scroll_script):
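        # Launch PhantomJS with a random user agent and proxy, run the scroll
        # script, wait until the page reports loading has finished, then extract
        # every "card" element; on failure, retry up to self.retries times.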
        applications = []
        driver = None
        try:
            desired_capabilities = dict(DesiredCapabilities.PHANTOMJS)
            desired_capabilities["phantomjs.page.settings.userAgent"] = useragent.get_random_agent(google_prop.user_agent_list_url)
            service_args = ['--load-images=no', '--proxy=%s' % (proxy.get_random_proxy(google_prop.proxy_list_url))]
            driver = PhantomJS(desired_capabilities=desired_capabilities, service_args=service_args)
            # driver = Firefox(firefox_profile=self.fp, proxy=self.proxy)

            if self.proxy_test:
                driver.get('http://curlmyip.com/')
                ip = driver.find_element_by_xpath('//body//pre').text
                print('ip : [ ' + ip + ' ]')
            else:
                driver.get(self.url)
                driver.execute_script(scroll_script)

                acknowledge = 0
                done = False
                while not done:
                    scroll_finished = driver.execute_script("return scraperLoadCompleted")
                    if scroll_finished:
                        if acknowledge == self.acknowledgements:
                            done = driver.execute_script("return scraperLoadCompleted")
                        else:
                            acknowledge += 1
                    else:
                        acknowledge = 0
                    time.sleep(5)  # Wait before retry

                product_matrix = driver.find_elements_by_class_name("card")
                for application in product_matrix:
                    extracted_application = self.extract_application_data(application)
                    # if extracted_application['app_price'] != -1:
                    applications.append(extracted_application)

            driver.quit()

        except Exception as e:
            if driver is not None:
                driver.quit()

            if self.attempt < self.retries:
                self.attempt += 1
                time.sleep(10)
                print('retry : url [ ' + self.url + ' ] | attempt [ ' + str(self.attempt) + ' ] | error [ ' + str(e) + ' ]')
                applications = self.get_applications_in_page(scroll_script)
            else:
                print('fail : url [ ' + self.url + ' ] | error [ ' + str(e) + ' ]')
        return applications
Example #7
def get_url_files(retail, invoice_doc_type, invoice_id, invoice_date,
                  invoice_amount):
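    # Drive the retailer's invoice-lookup form: pick the document type, OCR the
    # reCAPTCHA image through VisionApi, fill and submit the form via JavaScript,
    # and return the XML and PDF download URLs (empty strings on any failure).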
    retail_invoice_url = RETAIL_INVOICE_URL[retail]

    driver = PhantomJS()
    driver.get(retail_invoice_url)

    # 1 Set doc_type 'select'
    try:
        select_doc_type = Select(driver.find_element_by_name('txtTipoDte'))
        value = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['value']
        select_doc_type.select_by_value(value)
        # name = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['name']
        # select_doc_type.select_by_visible_text(name)
    except Exception:
        print('ERROR: set doc_type select as Boleta')
        driver.save_screenshot('screen.png')
        return '', ''

    time.sleep(5)

    # 2 Get recaptcha img url
    try:
        recaptcha_img = driver.find_element_by_id('recaptcha_challenge_image')
        recaptcha_img_url = recaptcha_img.get_attribute('src')
    except Exception:
        print('ERROR: get recaptcha image url')
        driver.save_screenshot('screen.png')
        return '', ''

    # 3 Solve recaptcha
    v = VisionApi()
    recaptcha_value = v.detect_text_from_url(recaptcha_img_url)

    if recaptcha_value is None:
        print('ERROR: solving recaptcha image')
        driver.save_screenshot('screen.png')
        return '', ''

    # 4 Fill form
    script = u"""
        document.getElementsByName('txtFolio')[0].value = '{invoice_id}';
        document.getElementsByName('txtFechaEmision')[0].value = '{invoice_date}';
        document.getElementsByName('txtMontoTotal')[0].value = '{invoice_amount}';
        document.getElementsByName('recaptcha_response_field')[0].value = '{recaptcha_value}';
    """.format(
        invoice_id=invoice_id,
        invoice_date=invoice_date,
        invoice_amount=invoice_amount,
        recaptcha_value=recaptcha_value,
    )
    driver.execute_script(script)

    # 5 Submit form
    try:
        driver.find_element_by_name('frmDatos').submit()
    except Exception:
        print('ERROR: submitting form')
        driver.save_screenshot('screen.png')
        return '', ''

    # 6 Get url files
    try:
        xml_a_tag = driver.find_element_by_xpath(
            '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[2]')
        pdf_a_tag = driver.find_element_by_xpath(
            '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[1]')

        xml_url = xml_a_tag.get_attribute('href')
        pdf_url = pdf_a_tag.get_attribute('href')
    except Exception:
        print('ERROR: getting url files')
        driver.save_screenshot('screen.png')
        return '', ''

    # 8 Delete driver session
    driver.close()
    driver.quit()

    return xml_url, pdf_url
Example #8
class plugin:
    def __init__(self):
        APP_ROOT = os.path.dirname(os.path.abspath(__file__))
        print(APP_ROOT)
        self.req = 0
        self.driver = PhantomJS(APP_ROOT + "/phantomjs",
                                service_log_path=os.path.devnull)
        self.driver.implicitly_wait(3)

    def restart(self):
        self.__init__()

    def frame_search(self, path):
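        # Recursively walk every <frame> on the page and build a nested dict of
        # frame names, switching back through the parent frame chain after each child.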
        framedict = {}
        for child_frame in self.driver.find_elements_by_tag_name('frame'):
            child_frame_name = child_frame.get_attribute('name')
            framedict[child_frame_name] = {'framepath': path, 'children': {}}
            xpath = '//frame[@name="{}"]'.format(child_frame_name)
            self.driver.switch_to.frame(
                self.driver.find_element_by_xpath(xpath))
            framedict[child_frame_name]['children'] = self.frame_search(
                framedict[child_frame_name]['framepath'] + [child_frame_name])

            self.driver.switch_to.default_content()
            if len(framedict[child_frame_name]['framepath']) > 0:
                for parent in framedict[child_frame_name]['framepath']:
                    parent_xpath = '//frame[@name="{}"]'.format(parent)
                    self.driver.switch_to.frame(
                        self.driver.find_element_by_xpath(parent_xpath))
        return framedict

    def tmon(self):
        self.driver.get(
            "https://login.ticketmonster.co.kr/user/loginform?return_url=")
        self.driver.find_element_by_name('userid').send_keys(
            config['ACCOUNT']['tmon_id'])
        self.driver.find_element_by_name('password').send_keys(
            config['ACCOUNT']['tmon_pw'])
        self.driver.find_element_by_xpath('//*[@id="loginFrm"]/a[2]').click()
        self.driver.get(
            'http://m.benefit.ticketmonster.co.kr/promotions/page/attendance?view_mode=app'
        )
        self.driver.find_element_by_xpath(
            '//*[@id="attn_wrap"]/div/div/div[3]/div[2]/div[1]/button').click(
            )

        print(self.driver.find_element_by_class_name('content').text)
        self.tmon_ret = self.driver.find_element_by_class_name('content').text

    def ondisk(self):
        try:
            self.driver.get("http://ondisk.co.kr/index.php")
            self.driver.implicitly_wait(3)
            self.driver.find_element_by_xpath('//*[@id="mb_id"]').send_keys(
                config['ACCOUNT']['ondisk_id'])
            self.driver.find_element_by_xpath(
                '//*[@id="page-login"]/form/div[2]/p[2]/input').send_keys(
                    config['ACCOUNT']['ondisk_pw'])
            self.driver.find_element_by_xpath(
                '//*[@id="page-login"]/form/div[2]/p[3]/input').click()
            self.driver.get(
                "http://ondisk.co.kr/index.php?mode=eventMarge&sm=event&action=view&idx=746&event_page=1"
            )
            self.driver.switch_to.frame(1)
            self.driver.execute_script(
                "window.alert = function(msg){ window.msg = msg; };")
            self.driver.find_element_by_class_name('button').click()

            alert_text = self.driver.execute_script("return window.msg;")
            print(alert_text)
        except:
            alert_text = None
            print("ERR")
            print(self.driver.page_source)
        self.ondisk_ret = alert_text

    def ok_cash_bag(self):
        today = datetime.datetime.now().strftime("%Y%m%d")
        sess = requests.session()
        getdata = sess.get(
            "https://www.facebook.com/login.php?login_attempt=1&next=https%3A%2F%2Fwww.facebook.com%2Fv2.6%2Fdialog%2Foauth%3Fredirect_uri%3Dhttps%253A%252F%252Fmember.okcashbag.com%252Focb%252FsocialId%252FfacebookProcessor%26scope%3Dpublic_profile%252Cuser_birthday%252Cemail%26client_id%3D645711852239977%26ret%3Dlogin%26logger_id%3D91698e1d-fe1e-b325-4c13-b62636843a9e&lwv=101"
        )
        param = {
            "lsd": "AVpmy4vJ",
            "api_key": "645711852239977",
            "cancel_url":
            "https://member.okcashbag.com/ocb/socialId/facebookProcessor?error=access_denied&error_code=200&error_description=Permissions+error&error_reason=user_denied#_=_",
            "display": "page",
            "enable_profile_selector": "",
            "isprivate": "",
            "legacy_return": "0",
            "profile_selector_ids": "",
            "return_session": "",
            "skip_api_login": "******",
            "signed_next": "1",
            "trynum": "1",
            "timezone": "-540",
            "lgndim":
            "eyJ3IjoxOTIwLCJoIjoxMDgwLCJhdyI6MTkyMCwiYWgiOjEwNDAsImMiOjI0fQ==",
            "lgnrnd": "173648_UqkK",
            "lgnjs": "1528418208",
            "email": config['ACCOUNT']['fb_id'],
            "pass": config['ACCOUNT']['fb_pw'],
            "prefill_contact_point": config['ACCOUNT']['fb_id'],
            "prefill_source": "last_login",
            "prefill_type": "contact_point",
            "first_prefill_source": "last_login",
            "first_prefill_type": "contact_point",
            "had_cp_prefilled": "true",
            "had_password_prefilled": "false"
        }
        postdata = sess.post(
            "https://www.facebook.com/login.php?login_attempt=1&next=https%3A%2F%2Fwww.facebook.com%2Fv2.6%2Fdialog%2Foauth%3Fredirect_uri%3Dhttps%253A%252F%252Fmember.okcashbag.com%252Focb%252FsocialId%252FfacebookProcessor%26scope%3Dpublic_profile%252Cuser_birthday%252Cemail%26client_id%3D645711852239977%26ret%3Dlogin%26logger_id%3D91698e1d-fe1e-b325-4c13-b62636843a9e&lwv=101",
            data=param)

        # print(postdata.text)
        postdata = sess.post(
            "https://member.okcashbag.com//ocb/socialId/socialIdLoginProcess/42100/687474703A2F2F7777772e6f6b636173686261672e636f6d2F696e6465782e646f3F6c6f67696e3D59"
        )
        samlResponse = postdata.text.split("samlResponse.value = \"")[1].split(
            "\"")[0]
        # print(samlResponse)
        param = {"samlResponse": samlResponse, "sst_cd": "", "return_url": ""}
        postdata = sess.post("http://www.okcashbag.com/index.do?login=Y",
                             data=param)
        print(
            postdata.text.split('<span id="profileNickname" class="name">')
            [1].split("</span>")[0] + "님 로그인")
        print(
            postdata.text.split('<span id="spanUsablePoint">')[1].split(
                '</span>')[0] + "포인트")
        getdata = sess.get(
            "http://www.okcashbag.com/life/event/attend/attendMain.do")
        param = {"method": "", "myUrl": "", "recommUser": "", "today": today}
        postdata = sess.post(
            "http://www.okcashbag.com/life/event/attend/attend.do", data=param)
        print(postdata.text)
        if len(postdata.text.split('<i class="win-point">')) > 1:
            print(postdata.text.split('<i class="win-point">')[1] + "포인트 적립")
        elif len(postdata.text.split("success")) > 1:
            print("출석체크 완료 ")
            self.ok_ret = "출석체크 완료"
        else:
            print('이미 출석체크 완료')
            self.ok_ret = "이미 출석체크 완료"
Example #9
class HeadlessBrowser(object):
    def __init__(self):
        self.backend = ['chrome', 'phantomjs']
        self.driver = None
        atexit.register(self.cleanup)

    def __getattribute__(self, item):
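        # Wrap method access: backend factories (chrome/phantomjs) first clean up
        # any existing driver; every other method is skipped with a warning when
        # no driver has been initialized yet.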
        attr = object.__getattribute__(self, item)

        if hasattr(attr, '__call__'):
            func_name = attr.__name__

            if func_name in self.backend:
                def wrap_func(*args, **kwargs):
                    if self.driver is not None:
                        self.cleanup()

                    result = attr(*args, **kwargs)
                    return result
            else:
                def wrap_func(*args, **kwargs):
                    if self.driver is None:
                        logger.warning('Driver is NOT initialized, skip %s' % func_name)
                        return

                    result = attr(*args, **kwargs)
                    return result

            return wrap_func
        else:
            return attr

    def cleanup(self):
        if self.driver is not None:
            logger.info('CLEAN driver: %s' % self.driver)
            self.driver.quit()
            self.driver = None

    def chrome(self, chromedriver_path=None, disable_log=True, strip_ua4headless=True):
        """
        Better to place chromedriver and chrome/chromium binaries in the PATH,
            in this case, parameter chromedriver_path could be omitted and set as None
        Otherwise place them under the same directory and set parameter chromedriver_path
        ---------------------------------------------------------------------------------
        If chromedriver and chrome/chromium are in different path,
            beyond chromedriver_path setting, chrome/chromium path should be set as:
            options.binary_location = '/path'
        """
        options = ChromeOptions()
        options.add_argument('headless')
        options.add_argument('no-sandbox')

        if disable_log:
            options.add_argument('log-level=3')
            options.add_experimental_option('excludeSwitches', ['enable-logging'])

        try:
            if chromedriver_path:
                self.driver = Chrome(options=options,
                                     executable_path=chromedriver_path)
            else:
                self.driver = Chrome(options=options)
        except WebDriverException as e:
            logger.error(e.msg)
            self.driver = None
            return

        # self.driver.set_page_load_timeout(20)
        if strip_ua4headless:
            import re
            ua = re.sub('(?i)headless', '', self.ua())
            self.driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": ua})

    def phantomjs(self, exe_path=None, disable_log=True, log_path='logs/ghostdriver.log'):
        service_args = []
        if disable_log:
            service_args.append('--webdriver-loglevel=NONE')

        # I know phantomjs is deprecated, but I DO NOT LIKE the warnings...
        import warnings
        backup = warnings.warn
        warnings.warn = str

        try:
            if exe_path:
                self.driver = PhantomJS(executable_path=exe_path,
                                        service_args=service_args,
                                        service_log_path=log_path)
            else:
                self.driver = PhantomJS(service_args=service_args,
                                        service_log_path=log_path)
        except WebDriverException as e:
            logger.error(e.msg)
            self.driver = None
            return
        finally:
            warnings.warn = backup

    def get(self, url, report_html=False):
        if not urlparse(url).scheme:
            url = 'http://%s' % url

        self.driver.get(url)
        return self.driver.page_source if report_html else None

    def ua(self):
        return str(self.driver.execute_script("return navigator.userAgent"))

    def zoom(self, level=1):
        if isinstance(level, (int, float)):
            self.driver.execute_script("document.body.style.zoom = '%s'" % level)

    def capture(self, url, png_name=None, zoom_level=1):
        self.get(url)
        self.zoom(zoom_level)

        if png_name is None or not str(png_name).endswith('.png'):
            result = urlparse(url)
            if not result.scheme:
                result = urlparse('http://%s' % url)
            png_name = '%s.png' % result.netloc

        width = self.driver.execute_script(
            "return Math.max(document.body.scrollWidth, \
                             document.body.offsetWidth, \
                             document.documentElement.clientWidth, \
                             document.documentElement.scrollWidth, \
                             document.documentElement.offsetWidth);")

        height = self.driver.execute_script(
            "return Math.max(document.body.scrollHeight, \
                             document.body.offsetHeight, \
                             document.documentElement.clientHeight, \
                             document.documentElement.scrollHeight, \
                             document.documentElement.offsetHeight);")

        # resize
        self.driver.set_window_size(width, height)
        self.driver.save_screenshot(png_name)
Example #10
class LegacySensCritique(object):

    CHANGEPAGE_TIMEOUT = 20
    '''
    Interact with SensCritique website
    '''
    def __init__(self, login, password, userAgent=LINUX_USER_AGENT):
        '''
        Constructor

        :param login:
        :param password:
        '''

        self.login = login
        self.password = password

        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = (userAgent)
        self.driver = PhantomJS(desired_capabilities=dcap)
        self.driver.set_window_size(1366, 768)

    def sign_in(self):
        '''
        Sign-in to SensCritique using the given login details

        :rtype: bool
        :Return: true if login succeeded, false otherwise
        '''

        self.to(HomePage())

        self.page.alreadySuscribed().click()

        self.page.loginField().send_keys(self.login)
        self.page.passwordField().send_keys(self.password)

        self.page.submitLoginButton().click()

        #TODO changing page so wait or something
        currentUser = self.page.username(self.CHANGEPAGE_TIMEOUT)

        if currentUser is not None:
            self._currentUsername = currentUser.value()
            logging.warning("Logged in with user " + self._currentUsername)

            return True
        else:
            if self.page.loginError() is not None:
                logging.error("Couldn't login : "******"/") + 1:]

        return l

    def deleteList(self, l: sclist):
        self.to(ListCollectionPage(self._currentUsername))

        for module in self.page.lists():
            if l.id() in module.url():

                # Alert box will be auto-accepted. Needed as Phantomjs cannot handle them
                self.driver.execute_script(
                    "window.confirm = function(msg) { return true; };")

                delete_button = module.delete_button()

                delete_action = ActionChains(self.driver)
                delete_action.move_to_element(module.title_node())
                delete_action.move_to_element(delete_button)
                delete_action.click(delete_button)

                delete_action.perform()

    def addMovie(self, movie: Movie, l: SCList):
        self.to(ListPage(l))

        self.page.query_input().send_keys(movie.title())

        add_button = self.page.add_movie_button(0)
        if add_button is None:
            return False  # Movie already in list

        if movie.description():
            self.page.movie_description_field(0).send_keys(movie.description())

        add_button.click()
        return True

    def deleteMovies(self, movies_to_delete, l: SCList):
        self.to(ListPage(l))

        for movie in self.page.movies():
            try:
                movies_to_delete.remove(movie.title())

                delete = movie.delete_button()
                delete.click()

                movie.confirm_delete_button().click()
                self.page.wait_loading_finished()
            except Exception as e:
                logging.error("Fail to delete movie " + movie.title() + ". " +
                              format(e))

        return movies_to_delete

    def to(self, page):
        page.to(self.driver)
        self.page = page

    def createSCListFromListModule(self, module: ListModule):
        list = sclist.SCList(module.id())

        list.setTitle(module.title())
        list.setDescription(module.description())
        list.setType(None)  # TODO: parse the type

        return list
Example #12
, but this URL refuses to show its page source; further observation shows that prefixing the original URL with view-source:
reveals the source code, i.e. view-source:http://ac.qq.com/ComicView/index/id/521825/cid/1
"""

from selenium.webdriver import PhantomJS, DesiredCapabilities
import time
import re

header = DesiredCapabilities.CHROME.copy()  # DesiredCapabilities lets PhantomJS masquerade as Chrome
web = PhantomJS(desired_capabilities=header,
                executable_path='F:/phantomjs-2.1.1-windows/bin/phantomjs'
                )  # the PhantomJS path must be set, otherwise it cannot run
web.maximize_window()  # maximize the browser window
web.get('http://ac.qq.com/ComicView/index/id/521825/cid/1')  # load the page
web.get_screenshot_as_file(
    './abc.png')  # take a screenshot of the page and save it as abc.png at the given location

for page in range(1, 30):  # window.scrollTo(0,{}) scrolls down one screen at a time
    web.execute_script('window.scrollTo(0,{})'.format(
        1080 *
        page))  # run the scroll script: 1080*1 is the first screen, 1080*2 the second, and so on
    time.sleep(1)
web.get_screenshot_as_file('./abc.png')  # capture the last page

pat = 'https://manhua.qpic.cn/vertical/0/(.*?)"'  # extract the image addresses with a regex
ls = re.compile(pat, re.S).findall(web.page_source)  # web.page_source is the rendered page source

import urllib.request as r
for i in range(len(ls)):
    # Download each matched image; the regex above captures the path after
    # .../vertical/0/, so rebuild the full URL (the .jpg extension and the
    # F:/pa/ output directory are assumptions).
    r.urlretrieve('https://manhua.qpic.cn/vertical/0/' + ls[i],
                  filename='F:/pa/{}.jpg'.format(i))
Example #14
File: pages.py  Project: sYnHybrid/hyphe
class PagesCrawler(Spider):

    name = 'pages'
    link_extractor = RegexpLinkExtractor(canonicalize=False,
                                         deny_extensions=[])
    ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS])

    def __init__(self, **kwargs):
        mongo = MongoClient(MONGO_HOST, MONGO_PORT)[MONGO_DB][MONGO_JOBS_COL]
        job = mongo.find_one({"_id": kwargs["job_id"]})
        args = job["crawl_arguments"]
        self.args = args
        self.start_urls = to_list(args['start_urls'])
        self.maxdepth = int(args['max_depth'])
        self.follow_prefixes = to_list(args['follow_prefixes'])
        self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
        self.prefixes_trie = LRUTrie()
        for p in self.follow_prefixes:
            self.prefixes_trie.set_lru(p, True)
        for p in self.nofollow_prefixes:
            self.prefixes_trie.set_lru(p, False)
        self.discover_prefixes = [
            url_to_lru_clean(
                "http%s://%s" %
                (https, u.replace('http://', '').replace('https://', '')),
                TLDS_TREE) for u in to_list(args['discover_prefixes'])
            for https in ['', 's']
        ]
        self.resolved_links = {}
        self.user_agent = args['user_agent']
        self.phantom = 'phantom' in args and args[
            'phantom'] and args['phantom'].lower() != "false"
        self.cookies = None
        if 'cookies' in args and args["cookies"]:
            self.cookies = dict(
                cookie.split('=', 1)
                for cookie in re.split(r'\s*;\s*', args['cookies'])
                if '=' in cookie)
        if self.phantom:
            self.ph_timeout = int(
                args.get('phantom_timeout', PHANTOM['TIMEOUT']))
            self.ph_idle_timeout = int(
                args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
            self.ph_ajax_timeout = int(
                args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
        self.errors = 0

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(PagesCrawler, cls).from_crawler(crawler, *args,
                                                       **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=spider_closed)
        crawler.signals.connect(spider.spider_crashed, signal=spider_error)
        return spider

    def start_requests(self):
        self.log(
            "Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'],
            logging.INFO)
        self.log("ARGUMENTS : " + str(self.args), logging.INFO)
        if self.phantom:
            self.init_phantom()
        for url in self.start_urls:
            yield self._request(url)

    def init_phantom(self):
        self.prefixfiles = os.path.join(scrapyd_config().get('logs_dir'),
                                        HYPHE_PROJECT, self.name,
                                        self.crawler.settings['JOBID'])
        self.log("Using path %s for PhantomJS crawl" % self.prefixfiles,
                 logging.INFO)
        phantom_args = []
        if PROXY and not PROXY.startswith(':'):
            phantom_args.append('--proxy=%s' % PROXY)
        phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' %
                            self.prefixfiles)
        phantom_args.append('--ignore-ssl-errors=true')
        phantom_args.append('--load-images=false')
        self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
        self.capabilities[
            'phantomjs.page.settings.userAgent'] = self.user_agent
        self.capabilities['takesScreenshot'] = False
        self.capabilities[
            'phantomjs.page.settings.javascriptCanCloseWindows'] = False
        self.capabilities[
            'phantomjs.page.settings.javascriptCanOpenWindows'] = False
        self.phantom = PhantomJS(executable_path=PHANTOM['PATH'],
                                 service_args=phantom_args,
                                 desired_capabilities=self.capabilities,
                                 service_log_path="%s-phantomjs.log" %
                                 self.prefixfiles)
        self.phantom.implicitly_wait(10)
        self.phantom.set_page_load_timeout(60)
        self.phantom.set_script_timeout(self.ph_timeout + 15)

    def spider_crashed(self, spider):
        self.errors += 1
        self.spider_closed(spider, reason="CRASH")

    def spider_closed(self, spider, reason=""):
        if self.errors:
            self.log(
                "%s error%s encountered during the crawl (%s)." %
                (self.errors, 's' if self.errors > 1 else '', reason),
                logging.ERROR)
        if self.phantom:
            self.phantom.quit()
            if not self.errors:
                for f in ["phantomjs-cookie.txt", "phantomjs.log"]:
                    fi = "%s-%s" % (self.prefixfiles, f)
                    if os.path.exists(fi) and not self.errors:
                        os.remove(fi)

    def handle_response(self, response):
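        # When PhantomJS is enabled, re-fetch the page in the browser, capture the
        # DOM including iframes, scroll/unfold it with the bundled JS helpers, and
        # swap the rendered body into the response before normal link extraction.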
        lru = url_to_lru_clean(response.url, TLDS_TREE)

        if self.phantom:
            self.phantom.get(response.url)

            # Collect whole DOM of the webpage including embedded iframes
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "get_iframes_content.js")) as js:
                get_bod_w_iframes = js.read()
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

            # Try to scroll and unfold page
            self.log("Start PhantomJS scrolling and unfolding", logging.INFO)
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "scrolldown_and_unfold.js")) as js:
                try:
                    signal.signal(signal.SIGALRM, timeout_alarm)
                    signal.alarm(self.ph_timeout + 30)
                    timedout = self.phantom.execute_async_script(
                        js.read(), self.ph_timeout, self.ph_idle_timeout,
                        self.ph_ajax_timeout)
                    signal.alarm(0)
                    if timedout:
                        raise SeleniumTimeout
                    self.log("Scrolling/Unfolding finished", logging.INFO)
                except SeleniumTimeout:
                    self.log(
                        "Scrolling/Unfolding timed-out (%ss)" %
                        self.ph_timeout, logging.WARNING)
                    self.errors += 1
                except WebDriverException as e:
                    err = json.loads(e.msg)['errorMessage']
                    self.log("Scrolling/Unfolding crashed: %s" % err,
                             logging.ERROR)
                    self.errors += 1
                except Exception as e:
                    self.log(
                        "Scrolling/Unfolding crashed: %s %s" % (type(e), e),
                        logging.ERROR)
                    self.errors += 1
                    return self._make_raw_page(response, lru)
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

        # Clean up pages with embedded base64 images that make scrapy consider them non-HtmlResponses
        if response.status == 200 and not isinstance(response, HtmlResponse):
            try:
                flags = response.flags
                if "partial" in flags:
                    flags.remove("partial")
                flags.append("cleaned")
                response = HtmlResponse(response.url,
                                        headers=response.headers,
                                        body=cleanupbase64images(
                                            response.body),
                                        flags=flags,
                                        request=response.request)
                self.log(
                    "WARNING: page with base64 embedded images was cleaned-up for links extraction"
                )
            except:
                pass

        if 300 < response.status < 400 or isinstance(response, HtmlResponse):
            return self.parse_html(response, lru)
        else:
            return self._make_raw_page(response, lru)

    def handle_error(self, failure, response=None):
        if response:
            p = self._make_raw_page(response, failure.request.url)
            p['error'] = error_name(failure.value)
            return p
        elif not "://www" in failure.request.url:
            return self._request(failure.request.url.replace('://', '://www.'))
        error = failure.getErrorMessage()
        self.log("ERROR : %s" % error, logging.ERROR)
        if PROXY and not PROXY.startswith(
                ':') and "OpenSSL.SSL.Error" in error:
            return self._request(failure.request.url, noproxy=True)
        self.errors += 1
        return

    def parse_html(self, response, lru):
        lrulinks = []
        # handle redirects
        realdepth = response.meta['depth']
        if 300 < response.status < 400:
            redir_url = response.headers['Location']
            if redir_url.startswith('/'):
                redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'),
                                      redir_url)
            elif redir_url.startswith('../'):
                lrustart = lru[:lru.rfind('|p:')]
                while redir_url.startswith('../'):
                    lrustart = lrustart[:lrustart.rfind('|p:')]
                    redir_url = redir_url[3:]
                redir_url = "%s/%s" % (lru_to_url(lrustart + '|'), redir_url)
            elif redir_url.startswith(
                    './') or not redir_url.startswith('http'):
                redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'),
                                      redir_url[1:])
            links = [{'url': redir_url}]
            response.meta['depth'] -= 1
        else:
            try:
                links = self.link_extractor.extract_links(response)
            except Exception as e:
                self.log(
                    "ERROR: links extractor crashed on %s: %s %s" %
                    (response, type(e), e), logging.ERROR)
                links = []
                self.errors += 1
        for link in links:
            try:
                url = link.url
            except AttributeError:
                url = link['url']
            try:
                lrulink = url_to_lru_clean(url, TLDS_TREE)
            except (ValueError, IndexError) as e:
                self.log("Error converting URL %s to LRU: %s" % (url, e),
                         logging.ERROR)
                continue
            lrulinks.append((url, lrulink))
            if self._should_follow(response.meta['depth'], lrulink) and \
                    not url_has_any_extension(url, self.ignored_exts):
                yield self._request(url)
        response.meta['depth'] = realdepth
        yield self._make_html_page(response, lru, lrulinks)

    def _make_html_page(self, response, lru, lrulinks):
        p = self._make_raw_page(response, lru)
        if STORE_HTML:
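            # The 'zip' codec zlib-compresses the HTML body before it is
            # wrapped in Binary for storage.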
            p['body'] = Binary(response.body.encode('zip'))
        p['lrulinks'] = lrulinks
        return p

    def _make_raw_page(self, response, lru):
        p = self._new_page(response.url, lru)
        p['status'] = response.status
        p['size'] = len(response.body)
        if isinstance(response, HtmlResponse):
            p['encoding'] = response.encoding
        if response.meta.get('depth'):
            p['depth'] = response.meta['depth']
        if response.headers.get('content-type'):
            p['content_type'] = response.headers.get('content-type').partition(
                ';')[0]
        p['error'] = None
        return p

    def _new_page(self, url, lru=None):
        if lru is None:
            lru = url_to_lru_clean(url, TLDS_TREE)
        p = Page()
        p['url'] = url
        p['lru'] = lru
        p['depth'] = 0
        p['timestamp'] = int(time.time() * 1000)
        return p

    def _should_follow(self, depth, tolru):
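        # Follow a link only while under the maximum crawl depth and when the
        # target LRU matches the crawl's prefix trie.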
        c1 = depth < self.maxdepth
        c2 = self.prefixes_trie.match_lru(tolru)
        return c1 and c2

    def _request(self, url, noproxy=False, **kw):
        kw['meta'] = {'handle_httpstatus_all': True, 'noproxy': noproxy}
        kw['callback'] = self.handle_response
        kw['errback'] = self.handle_error
        if self.cookies:
            kw['cookies'] = self.cookies
        if self.phantom:
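            # With PhantomJS enabled, only a HEAD request is issued here; the
            # full body is fetched by the browser in handle_response().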
            kw['method'] = 'HEAD'
        return Request(url, **kw)
Example #15
0
File: pages.py Project: noscripter/hyphe
class PagesCrawler(BaseSpider):

    name = 'pages'
    link_extractor = RegexpLinkExtractor(canonicalize=False,
                                         deny_extensions=[])
    ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS])

    def __init__(self, **kw):
        args = DEFAULT_INPUT.copy()
        args.update(kw)
        self.args = args
        self.start_urls = to_list(args['start_urls'])
        self.maxdepth = int(args['maxdepth'])
        self.follow_prefixes = to_list(args['follow_prefixes'])
        self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
        self.discover_prefixes = [
            url_to_lru_clean(
                "http%s://%s" %
                (https, u.replace('http://', '').replace('https://', '')))
            for u in to_list(args['discover_prefixes']) for https in ['', 's']
        ]
        self.resolved_links = {}
        self.user_agent = args['user_agent']
        self.phantom = 'phantom' in args and args[
            'phantom'] and args['phantom'].lower() != "false"
        if self.phantom:
            self.ph_timeout = int(
                args.get('phantom_timeout', PHANTOM['TIMEOUT']))
            self.ph_idle_timeout = int(
                args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
            self.ph_ajax_timeout = int(
                args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
        self.errors = 0
        dispatcher.connect(self.closed, spider_closed)
        dispatcher.connect(self.crashed, spider_error)

    def start_requests(self):
        self.log(
            "Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'],
            log.INFO)
        self.log("ARGUMENTS : " + str(self.args), log.INFO)
        if self.phantom:
            self.init_phantom()
        for url in self.start_urls:
            yield self._request(url)

    def init_phantom(self):
        self.prefixfiles = os.path.join(scrapyd_config().get('logs_dir'),
                                        HYPHE_PROJECT, self.name,
                                        self.crawler.settings['JOBID'])
        self.log("Using path %s for PhantomJS crawl" % self.prefixfiles,
                 log.INFO)
        phantom_args = []
        if PROXY and not PROXY.startswith(':'):
            phantom_args.append('--proxy=%s' % PROXY)
        phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' %
                            self.prefixfiles)
        phantom_args.append('--ignore-ssl-errors=true')
        phantom_args.append('--load-images=false')
        self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
        self.capabilities[
            'phantomjs.page.settings.userAgent'] = self.user_agent
        self.capabilities['takesScreenshot'] = False
        self.capabilities[
            'phantomjs.page.settings.javascriptCanCloseWindows'] = False
        self.capabilities[
            'phantomjs.page.settings.javascriptCanOpenWindows'] = False
        self.phantom = PhantomJS(executable_path=PHANTOM['PATH'],
                                 service_args=phantom_args,
                                 desired_capabilities=self.capabilities,
                                 service_log_path="%s-phantomjs.log" %
                                 self.prefixfiles)
        self.phantom.implicitly_wait(10)
        self.phantom.set_page_load_timeout(60)
        self.phantom.set_script_timeout(self.ph_timeout + 15)

    def crashed(self, spider):
        self.errors += 1
        self.closed("CRASH")

    def closed(self, reason):
        if self.errors:
            self.log(
                "%s error%s encountered during the crawl." %
                (self.errors, 's' if self.errors > 1 else ''), log.ERROR)
        if self.phantom:
            self.phantom.quit()
            if not self.errors:
                for f in ["phantomjs-cookie.txt", "phantomjs.log"]:
                    fi = "%s-%s" % (self.prefixfiles, f)
                    if os.path.exists(fi) and not self.errors:
                        os.remove(fi)

    def handle_response(self, response):
        lru = url_to_lru_clean(response.url)

        if self.phantom:
            self.phantom.get(response.url)

            # Collect whole DOM of the webpage including embedded iframes
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "get_iframes_content.js")) as js:
                get_bod_w_iframes = js.read()
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

            # Try to scroll and unfold page
            self.log("Start PhantomJS scrolling and unfolding", log.INFO)
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "scrolldown_and_unfold.js")) as js:
                try:
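                    # Hard-limit the scrolling script with SIGALRM in case
                    # PhantomJS hangs past its own timeouts.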
                    signal.signal(signal.SIGALRM, timeout_alarm)
                    signal.alarm(self.ph_timeout + 30)
                    timedout = self.phantom.execute_async_script(
                        js.read(), self.ph_timeout, self.ph_idle_timeout,
                        self.ph_ajax_timeout)
                    signal.alarm(0)
                    if timedout:
                        raise SeleniumTimeout
                    self.log("Scrolling/Unfolding finished", log.INFO)
                except SeleniumTimeout:
                    self.log(
                        "Scrolling/Unfolding timed-out (%ss)" %
                        self.ph_timeout, log.WARNING)
                    self.errors += 1
                except WebDriverException as e:
                    err = json.loads(e.msg)['errorMessage']
                    self.log("Scrolling/Unfolding crashed: %s" % err,
                             log.ERROR)
                    self.errors += 1
                except Exception as e:
                    self.log(
                        "Scrolling/Unfolding crashed: %s %s" % (type(e), e),
                        log.ERROR)
                    self.errors += 1
                    return self._make_raw_page(response, lru)
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

        # Clean up pages with embedded base64 images that make Scrapy treat the response as non-HTML
        if response.status == 200 and not isinstance(response, HtmlResponse):
            try:
                flags = response.flags
                if "partial" in flags:
                    flags.remove("partial")
                flags.append("cleaned")
                response = HtmlResponse(response.url,
                                        headers=response.headers,
                                        body=cleanupbase64images(
                                            response.body),
                                        flags=flags,
                                        request=response.request)
                self.log(
                    "WARNING: page with base64 embedded images was cleaned-up for links extraction"
                )
            except:
                pass

        if 300 < response.status < 400 or isinstance(response, HtmlResponse):
            return self.parse_html(response, lru)
        else:
            return self._make_raw_page(response, lru)

    def handle_error(self, failure, response=None):
        if response:
            p = self._make_raw_page(response, failure.request.url)
            p['error'] = error_name(failure.value)
            return p
        elif not "://www" in failure.request.url:
            return self._request(failure.request.url.replace('://', '://www.'))
        error = failure.getErrorMessage()
        self.log("ERROR : %s" % error, log.ERROR)
        if PROXY and not PROXY.startswith(
                ':') and "OpenSSL.SSL.Error" in error:
            return self._request(failure.request.url, noproxy=True)
        self.errors += 1
        return

    def parse_html(self, response, lru):
        lrulinks = []
        # handle redirects
        realdepth = response.meta['depth']
        if 300 < response.status < 400:
            redir_url = response.headers['Location']
            if redir_url.startswith('/'):
                redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'),
                                      redir_url)
            elif redir_url.startswith(
                    './') or not redir_url.startswith('http'):
                redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'),
                                      redir_url[1:])
            links = [{'url': redir_url}]
            response.meta['depth'] -= 1
        else:
            try:
                links = self.link_extractor.extract_links(response)
            except Exception as e:
                self.log(
                    "ERROR: links extractor crashed on %s: %s %s" %
                    (response, type(e), e), log.ERROR)
                links = []
                self.errors += 1
        for link in links:
            try:
                url = link.url
            except AttributeError:
                url = link['url']
            try:
                lrulink = url_to_lru_clean(url)
            except ValueError as e:
                self.log("Error converting URL %s to LRU: %s" % (url, e),
                         log.ERROR)
                continue
            lrulinks.append((url, lrulink))
            if self._should_follow(response.meta['depth'], lru, lrulink) and \
                    not url_has_any_extension(url, self.ignored_exts):
                yield self._request(url)
        response.meta['depth'] = realdepth
        yield self._make_html_page(response, lru, lrulinks)
Example #16
0
class PagesCrawler(BaseSpider):

    name = 'pages'
    link_extractor = RegexpLinkExtractor(canonicalize=False, deny_extensions=[])
    ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS])

    def __init__(self, **kw):
        args = DEFAULT_INPUT.copy()
        args.update(kw)
        self.args = args
        self.start_urls = to_list(args['start_urls'])
        self.maxdepth = int(args['maxdepth'])
        self.follow_prefixes = to_list(args['follow_prefixes'])
        self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
        self.discover_prefixes = [url_to_lru_clean("http%s://%s" % (https, u.replace('http://', '').replace('https://', ''))) for u in to_list(args['discover_prefixes']) for https in ['', 's']]
        self.resolved_links = {}
        self.user_agent = args['user_agent']
        self.phantom = 'phantom' in args and args['phantom'] and args['phantom'].lower() != "false"
        if self.phantom:
            self.ph_timeout = int(args.get('phantom_timeout', PHANTOM['TIMEOUT']))
            self.ph_idle_timeout = int(args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
            self.ph_ajax_timeout = int(args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
        self.errors = 0
        dispatcher.connect(self.closed, spider_closed)
        dispatcher.connect(self.crashed, spider_error)

    def start_requests(self):
        self.log("Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'], log.INFO)
        self.log("ARGUMENTS : "+str(self.args), log.INFO)
        if self.phantom:
            self.init_phantom()
        for url in self.start_urls:
            yield self._request(url)

    def init_phantom(self):
        self.prefixfiles = os.path.join(
            scrapyd_config().get('logs_dir'),
            HYPHE_PROJECT,
            self.name,
            self.crawler.settings['JOBID']
        )
        self.log("Using path %s for PhantomJS crawl" % self.prefixfiles, log.INFO)
        phantom_args = []
        if PROXY and not PROXY.startswith(':'):
            phantom_args.append('--proxy=%s' % PROXY)
        phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' % self.prefixfiles)
        phantom_args.append('--ignore-ssl-errors=true')
        phantom_args.append('--load-images=false')
        self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
        self.capabilities['phantomjs.page.settings.userAgent'] = self.user_agent
        self.capabilities['takesScreenshot'] = False
        self.capabilities['phantomjs.page.settings.javascriptCanCloseWindows'] = False
        self.capabilities['phantomjs.page.settings.javascriptCanOpenWindows'] = False
        self.phantom = PhantomJS(
            executable_path=PHANTOM['PATH'],
            service_args=phantom_args,
            desired_capabilities=self.capabilities,
            service_log_path="%s-phantomjs.log" % self.prefixfiles
        )
        self.phantom.implicitly_wait(10)
        self.phantom.set_page_load_timeout(60)
        self.phantom.set_script_timeout(self.ph_timeout + 15)

    def crashed(self, spider):
        self.errors += 1
        self.closed("CRASH")

    def closed(self, reason):
        if self.errors:
            self.log("%s error%s encountered during the crawl." %
                (self.errors, 's' if self.errors > 1 else ''), log.ERROR)
        if self.phantom:
            self.phantom.quit()
            if not self.errors:
                for f in ["phantomjs-cookie.txt", "phantomjs.log"]:
                    fi = "%s-%s" % (self.prefixfiles, f)
                    if os.path.exists(fi) and not self.errors:
                        os.remove(fi)

    def handle_response(self, response):
        lru = url_to_lru_clean(response.url)

        if self.phantom:
            self.phantom.get(response.url)

            # Collect whole DOM of the webpage including embedded iframes
            with open(os.path.join(PHANTOM["JS_PATH"], "get_iframes_content.js")) as js:
                get_bod_w_iframes = js.read()
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

            # Try to scroll and unfold page
            self.log("Start PhantomJS scrolling and unfolding", log.INFO)
            with open(os.path.join(PHANTOM["JS_PATH"], "scrolldown_and_unfold.js")) as js:
                try:
                    signal.signal(signal.SIGALRM, timeout_alarm)
                    signal.alarm(self.ph_timeout + 30)
                    timedout = self.phantom.execute_async_script(
                        js.read(), self.ph_timeout,
                        self.ph_idle_timeout, self.ph_ajax_timeout)
                    signal.alarm(0)
                    if timedout:
                        raise SeleniumTimeout
                    self.log("Scrolling/Unfolding finished", log.INFO)
                except SeleniumTimeout:
                    self.log("Scrolling/Unfolding timed-out (%ss)" % self.ph_timeout, log.WARNING)
                    self.errors += 1
                except WebDriverException as e:
                    err = json.loads(e.msg)['errorMessage']
                    self.log("Scrolling/Unfolding crashed: %s" % err, log.ERROR)
                    self.errors += 1
                except Exception as e:
                    self.log("Scrolling/Unfolding crashed: %s %s" % (type(e), e), log.ERROR)
                    self.errors += 1
                    return self._make_raw_page(response, lru)
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

        # Clean up pages with embedded base64 images that make Scrapy treat the response as non-HTML
        if response.status == 200 and not isinstance(response, HtmlResponse):
            try:
                flags = response.flags
                if "partial" in flags:
                    flags.remove("partial")
                flags.append("cleaned")
                response = HtmlResponse(response.url, headers=response.headers, body=cleanupbase64images(response.body), flags=flags, request=response.request)
                self.log("WARNING: page with base64 embedded images was cleaned-up for links extraction")
            except:
                pass

        if 300 < response.status < 400 or isinstance(response, HtmlResponse):
            return self.parse_html(response, lru)
        else:
            return self._make_raw_page(response, lru)

    def handle_error(self, failure, response=None):
        if response:
            p = self._make_raw_page(response, failure.request.url)
            p['error'] = error_name(failure.value)
            return p
        elif not "://www" in failure.request.url:
            return self._request(failure.request.url.replace('://', '://www.'))
        self.log("ERROR : %s" % failure.getErrorMessage(), log.ERROR)
        self.errors += 1
        return

    def parse_html(self, response, lru):
        lrulinks = []
        # handle redirects
        realdepth = response.meta['depth']
        if 300 < response.status < 400:
            redir_url = response.headers['Location']
            if redir_url.startswith('/'):
                redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'), redir_url)
            elif redir_url.startswith('./') or not redir_url.startswith('http'):
                redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'), redir_url[1:])
            links = [{'url': redir_url}]
            response.meta['depth'] -= 1
        else:
            try:
                links = self.link_extractor.extract_links(response)
            except Exception as e:
                self.log("ERROR: links extractor crashed on %s: %s %s" % (response, type(e), e), log.ERROR)
                links = []
                self.errors += 1
        for link in links:
            try:
                url = link.url
            except AttributeError:
                url = link['url']
            try:
                lrulink = url_to_lru_clean(url)
            except ValueError as e:
                self.log("Error converting URL %s to LRU: %s" % (url, e), log.ERROR)
                continue
            lrulinks.append((url, lrulink))
            if self._should_follow(response.meta['depth'], lru, lrulink) and \
                    not url_has_any_extension(url, self.ignored_exts):
                yield self._request(url)
        response.meta['depth'] = realdepth
        yield self._make_html_page(response, lru, lrulinks)

class CamaraCGCrawler(object):
    """ Camara CG Ementa Crawler """
    def __init__(self, starting_year):
        self.base_url = "http://187.115.174.90:8080/ScanLexWeb"
        self.starting_year = starting_year
        self.browser = None

    @staticmethod
    def get_ementa_id(published_date, ementa_type, ementa_doc_number,
                      ementa_situation):
        """ Return the Ementa Unique Id """
        return "%s#%s#%s#%s" % (datetime.strftime(
            published_date,
            "%Y-%m-%d"), ementa_type, ementa_doc_number, ementa_situation)

    def get_all_ementas_summary(self):
        """ Yield the next ementa information row """

        browser_table = self.browser.find_element_by_id(
            "frmMenu:tabEmentas_data")
        bs_ementa_table = BeautifulSoup(
            browser_table.get_attribute("innerHTML"))

        for row in bs_ementa_table.find_all("tr"):
            cols = row.find_all("td")
            if len(cols) == 6:
                published_date = datetime.strptime(
                    cols[0].span.text.encode("utf-8"), "%d/%m/%Y")
                doc_number = int(cols[1].span.text.encode("utf-8"))
                title = cols[2].span.text.encode("utf-8")
                ementa_type = cols[3].span.text.encode("utf-8")
                ementa_situation = cols[4].span.text.encode("utf-8")
                details_js = cols[5].a['onclick'].encode("utf-8")

                if published_date > datetime.now():
                    continue

                yield published_date, doc_number, title, ementa_type, ementa_situation, details_js

    def get_ementa_details(self, ementa_details_js):
        """ Crawl the second ementa page """

        # Waiting...
        _ = WebDriverWait(self.browser, 30).until(
            EC.visibility_of_element_located(
                (By.ID, "frmfuncao:j_idt13_content")))
        _ = WebDriverWait(self.browser, 30).until(
            EC.visibility_of_element_located(
                (By.ID, "frmfuncao:tabProponentes")))

        # Get Ementail Details
        bs_ementa_details = BeautifulSoup(self.browser \
            .find_element_by_id("frmfuncao:j_idt13_content").get_attribute("innerHTML"))

        rows = bs_ementa_details.find_all("tr")

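        # The detail fields sit at fixed row positions of the details table.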
        source = rows[3].td.text
        main_theme = rows[7].td.text
        sys_enter_date = datetime.strptime(rows[9].td.text, "%d/%m/%Y")
        approval_date = datetime.strptime(rows[11].td.text, "%d/%m/%Y")
        process_number = int(rows[15].td.text or "-1")
        autograph_number = int(rows[19].td.text or "-1")
        process_year = int(rows[21].td.text or "-1")
        has_image = rows[23].td.text == "Sim"

        # Get Proponent names
        bs_proponent = BeautifulSoup(
            self.browser.find_element_by_id(
                "frmfuncao:tabProponentes").get_attribute("innerHTML"))

        proponents = ",".join(
            [col.text for col in bs_proponent.find_all("td")])

        return source, proponents, main_theme, sys_enter_date, approval_date, process_number, \
            autograph_number, process_year, has_image

    def next_ementa(self, select_curs):
        """ Iterate in the years onwards and collect all the ementas """

        try:
            LOGGER.info("Opening Browser")
            self.browser = PhantomJS()

            LOGGER.info("GET [%s]", self.base_url)
            self.browser.maximize_window()

            cur_year = int(datetime.now().year)

            # Define the initial collection year
            select_curs.execute(
                "SELECT EXTRACT (YEAR FROM MAX(published_date)) FROM ementas;")
            last_exec_year = select_curs.fetchone()
            if last_exec_year:
                collection_year = max(self.starting_year, last_exec_year[0])
            else:
                collection_year = self.starting_year

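            # Proponent names used to query the search form one at a time
            # ("Todos" means all).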
            all_proponents = [
                "ANDERSON MAIA", "Afonso Alexandre Régis",
                "Alcides Cavalcante", "Alcindor Villarim", "Aldo Cabral",
                "Alexandre do Sindicato", "Antonio Pereira",
                "Antônio Alves Pimentel Filho", "Aragão Júnior",
                "Bruno Cunha Lima Branco", "Bruno Gaudêncio", "Buchada",
                "Cassiano Pascoal", "Cozete Babosa",
                "Cássio Murilo Galdino de Araujo", "Daniella Ribeiro",
                "Dr. Nunes", "Executivo", "Fabrinni Brito",
                "Fernando carvalho", "Francisco Dantas Lira",
                "Galego do Leite", "Inacio Falcao", "Ivan Batista",
                "Ivonete Ludgerio", "Joao Dantas", "Josimar Henrique da Silva",
                "José Marcos Raia ", "José Ribamar", "João Dantas",
                "Jóia Germano", "Laelson Patricio", "Lafite",
                "Lindaci Medeiros Nápolis", "Lourdes Costa", "Lula Cabral",
                "Marcos Marinho", "Maria Lopes Barbosa", "Marinaldo Cardoso",
                "Metuselá Agra", "Miguel Rodrigues da Silva",
                "Miguel da Construção", "Napoleão Maracajá",
                "Nelson Gomes Filho", "Olimpio Oliveira", "Orlandino Farias",
                "Paulo Muniz", "Paulo de Tarso", "Peron Ribeiro Japiassú",
                "Renato Feliciano", "Rodolfo Rodrigues",
                "Rodrigo Ramos Victor", "Romero Rodrigues", "Rostand Paraíba",
                "Rômulo Gouveia", "Saulo Germano", "Saulo Noronha", "Tia Mila",
                "Tovar Correia Lima", "Vaninho Aragão",
                "Veneziano Vital do rego", "Walter Brito Neto", "Todos"
            ]

            while collection_year <= cur_year:

                for i_prop in range(len(all_proponents)):

                    ementa_prop = all_proponents[i_prop].decode("utf-8")

                    self.browser.get(self.base_url)

                    # Waiting...
                    WebDriverWait(self.browser, 30).until(
                        EC.element_to_be_clickable((By.ID, "frmMenu:button1")))

                    LOGGER.info("Collecting Ementas from [%d][%s - %d/%d]",
                                collection_year, ementa_prop, i_prop + 1,
                                len(all_proponents))

                    # Set Year
                    year_field = self.browser.find_element_by_id("frmMenu:ano")
                    year_field.send_keys(collection_year)

                    # Set Proponent
                    proponent_field = self.browser.find_element_by_id(
                        "frmMenu:autoridade")
                    proponent_field.send_keys(ementa_prop)

                    # Submit the form
                    self.browser.find_element_by_id("frmMenu:button1").click()

                    # Waiting...
                    # _ = WebDriverWait(self.browser, 60).until(EC.visibility_of_element_located((By.ID, "frmMenu:tabEmentas_data")))
                    time.sleep(3)

                    for published_date, document_number, title, ementa_type, ementa_situation, ementa_details_js in self.get_all_ementas_summary(
                    ):
                        ementa_id = self.get_ementa_id(published_date,
                                                       ementa_type,
                                                       document_number,
                                                       ementa_situation)

                        select_curs.execute("""
                            SELECT ementa_id
                            FROM ementas
                            WHERE ementa_id = '%s';
                            """ % ementa_id)

                        if not select_curs.fetchone():
                            # Run the details script
                            self.browser.execute_script(ementa_details_js)
                            ementa_source, proponents, main_theme, sys_enter_date, approval_date, \
                                process_number, autograph_number, process_year, has_image = self.get_ementa_details(ementa_details_js)

                            # Come back to the table page
                            self.browser.back()

                            # Waiting...
                            _ = WebDriverWait(self.browser, 60).until(
                                EC.visibility_of_element_located(
                                    (By.ID, "frmMenu:tabEmentas_data")))

                            yield ementa_id, published_date, ementa_type, document_number, title, \
                                ementa_source, proponents, ementa_situation, main_theme, sys_enter_date, \
                                approval_date, process_number, autograph_number, process_year, has_image

                LOGGER.info("DONE [%d]", collection_year)

                self.browser.back()

                collection_year += 1

        finally:
            if self.browser:
                self.browser.quit()