class Client:
    def __init__(self, ig_id):
        self.b = PhantomJS()
        self.ig_id = ig_id
        self.b.get('https://instagram.com/%s' % ig_id)

    def close(self):
        self.b.close()

    def get_media(self) -> list:
        js = self.b.execute_script('return window._sharedData;')
        ed = js['entry_data']
        pp = ed['PostPage'][0]
        g = pp['graphql']
        sc = g['shortcode_media']
        if sc['__typename'] == 'GraphSidecar':
            edges = sc['edge_sidecar_to_children']['edges']
            medias = list(
                map(
                    lambda x: {
                        'id': x['node']['id'],
                        'url': x['node']['display_url'],
                        'caption': x['node']['accessibility_caption']
                    }, edges))
        elif sc['__typename'] == 'GraphImage':
            medias = [{
                'id': sc['id'],
                'url': sc['display_url'],
                'caption': sc['accessibility_caption']
            }]
        return list(
            filter(
                lambda x: 'person' in x['caption'] or 'people' in x['caption'],
                medias))

    def get_user(self) -> dict:
        js = self.b.execute_script('return window._sharedData;')
        ed = js['entry_data']
        pp = ed['ProfilePage'][0]
        g = pp['graphql']
        return g['user']

    def get_posts(self) -> set:
        ps = self.b.find_elements_by_css_selector('a[href^="/p/"]')
        return set(map(lambda x: x.get_attribute('href'), ps))

    def scroll(self):
        self.b.execute_script('window.scroll(0, document.body.scrollHeight);')
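# A minimal usage sketch for the Client class above (not part of the original
# snippet). It assumes `from selenium.webdriver import PhantomJS` is in scope
# and that the Instagram profile page still exposes window._sharedData.
client = Client('instagram')    # opens the profile page in PhantomJS
client.scroll()                 # trigger lazy loading of more posts
post_urls = client.get_posts()  # set of /p/<shortcode>/ links on the page
user_info = client.get_user()   # raw 'user' dict from the ProfilePage data
client.close()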
def generate_image(structure):
    image_path = os.path.join(mkdtemp(), 'okc.png')
    html_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)),
        'okc.html',
    )
    url = 'file://{}'.format(html_path)

    driver = PhantomJS(service_log_path=mkstemp()[1])
    driver.set_window_size(2000, 500)
    driver.get(url)
    driver.execute_script('setText({});'.format(json.dumps(structure)))
    if random() > 0.4:
        driver.execute_script('hideForm();')
    elif random() > 0.5:
        driver.execute_script('uncheckForm();')
    driver.set_window_size(*driver.execute_script('return getSize();'))
    driver.save_screenshot(image_path)

    # twitter's gonna make our beautiful screenshot a jpeg unless we make it
    # think that we're using transparency for a reason, so,,
    img = Image.open(image_path)
    origin = img.getpixel((0, 0))
    new_origin = origin[:3] + (254,)
    img.putpixel((0, 0), new_origin)
    img.save(image_path)

    subprocess.check_call(['optipng', '-quiet', image_path])

    return image_path
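# Hypothetical call to generate_image (not from the original code). The shape
# of `structure` is defined by the setText() helper inside okc.html, which is
# not shown here, so the value below is only a placeholder.
screenshot_path = generate_image({'text': 'hello, world'})
print(screenshot_path)  # path to the optimised PNG in a temporary directory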
class gmail(Thread):
    def __init__(self, account):
        name = account['name']
        super().__init__(name=name)  # Thread __init__
        lg.warning('{0[name]}, proxy: {0[Proxy]}'.format(account))
        self.account = account
        self.solved = 0
        if 0:  # Getting cookies snippet
            print(self.driver.get_cookies())
            cookies = {
                _['name']: _['value']
                for _ in self.driver.get_cookies()
            }
            with open('cookies.json', 'w') as f:
                dump(cookies, f, indent=4)

    def verify(self, el):
        '''Verifies the account. May be untrivial:('''
        text = el.text  # get_attribute('value')
        lg.info('Text: {}'.format(text))
        if text == "Verify it's you":
            lg.debug('Verify')
            #el=self.driver.find_element_by_id('identifierNext')
            el = self.driver.find_element_by_xpath(
                '//div[.="Confirm your recovery email"]')
            print(el)
            el.click()
            el = WebDriverWait(self.driver, 3).until(
                EC.visibility_of_element_located(
                    (By.NAME, 'knowledgePreregisteredEmailResponse')))
            el.send_keys(account[2])  # recovery email

    def login(self):
        if 0:  # to test
            #'https://www.whoishostingthis.com/tools/user-agent/'
            self.driver.get('about:about')
            sleep(1000)
        #self.driver.get('https://mail.google.com')
        self.driver.get(
            'https://accounts.google.com/signin/v2/identifier?continue=https%3A%2F%2Fmail.google.com%2Fmail%2F&service=mail&sacu=1&rip=1&flowName=GlifWebSignIn&flowEntry=ServiceLogin'
        )
        prefilled = False
        lg.debug('Logging in with {}'.format(self.account))
        try:
            el = WebDriverWait(self.driver, 2).until(
                EC.visibility_of_element_located((By.ID, 'identifierId')))
        except TimeoutException:
            prefilled = True
        if prefilled:
            lg.info('Username prefilled already')
        else:
            lg.debug('Entering username')
            el.send_keys(self.account['name'])  # username
            nxt = self.driver.find_element_by_id('identifierNext')
            nxt.click()

        logged_in = False
        try:
            el = WebDriverWait(self.driver, 20).until(
                EC.visibility_of_element_located((By.NAME, 'password')))
        except TimeoutException:
            # We're logged in?
            # TODO: Check for something visible after being logged in
            # Because we may genuinely be in timeout
            logged_in = True
        if logged_in:
            lg.info('Logged in already')
        else:
            lg.debug('Entering password')
            el.send_keys(self.account['Second Password'])
            nxt = WebDriverWait(self.driver, 5).until(
                EC.element_to_be_clickable((By.ID, 'passwordNext')))
            nxt.click()

        # WebDriverWait(self.driver, 60).until(
        #     EC.frame_to_be_available_and_switch_to_it((By.ID, 'tab1_1'))
        # )
        try:
            el = WebDriverWait(self.driver, 3).until(
                EC.visibility_of_element_located((By.ID, 'headingText')))
            #open('1.html','w').write(self.driver.page_source)
            self.verify(el)
        except TimeoutException:
            # We're in
            pass

    def screenshot(self, name):
        self.driver.save_screenshot('{}/{}-{}.png'.format(
            getcwd(), self.account['name'], name))

    def solve(self):
        '''Solve the captcha one time'''
        WebDriverWait(self.driver, 30).until(
            EC.frame_to_be_available_and_switch_to_it(
                (By.XPATH, '//iframe[@title="recaptcha widget"]')))
        el = WebDriverWait(self.driver, 20).until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, 'div.recaptcha-checkbox-checkmark')))
        #lg.info(el
        el.click()
        lg.debug('Clicked solve box')

        def check_style(driver, el):
            '''Now need to see what happened there.
            Check an attribute to see if we're successful.'''
            attr = el.get_attribute('aria-checked')
            lg.debug(attr)
            return attr == 'true'

        lg.debug('Before check_style')
        timeout = False
        try:
            WebDriverWait(self.driver, 20).until(lambda driver: check_style(
                driver, self.driver.find_element_by_id('recaptcha-anchor')))
        except TimeoutException:
            timeout = True
        # Next (very soon) we'll see what happened
        lg.debug('Final: ' + self.driver.find_element_by_id(
            'recaptcha-anchor').get_attribute('aria-checked'))
        self.driver.switch_to.default_content()

        if timeout:
            lg.warning('Timeout')
            self.screenshot('timeout')
            el = self.driver.find_element_by_xpath(
                '//iframe[@title="recaptcha challenge"]')
            #set_trace()
            self.driver.switch_to.frame(el)
            l = len(self.driver.page_source)
            lg.debug(l)
            with open('recaptcha_main.html', 'w') as f:
                f.write(self.driver.page_source)
            if l > 10000:
                lg.warning('Captcha')
                self.screenshot('captcha')
                return True  # Need to quit
            self.driver.switch_to.default_content()
            self.driver.refresh()
        else:
            el = self.driver.find_element_by_id('submit')
            el.click()  # Submit button
            lg.info('Clicked submit')
            lg.debug('Before staleness')
            WebDriverWait(self.driver, 10,
                          poll_frequency=0.1).until(EC.staleness_of(el))
            lg.debug('After staleness')

    def create_driver(self):
        if 1:
            caps = DesiredCapabilities().FIREFOX.copy()
            profile_path = path.expanduser(
                '~') + '/.mozilla/firefox/' + self.account['name']
            # caps['proxy'] = {
            caps['moz:firefoxOptions'] = {
                "args": ["-profile", profile_path],  # geckodriver 0.18+
            }
            profile = FirefoxProfile(profile_path)
            #profile.set_preference("general.useragent.override", 'Mozilla/5.0 (X11; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0')
            self.driver = Firefox(profile, capabilities=caps)
            #self.driver = Firefox(profile)
        else:
            # PhantomJS
            # https://github.com/detro/ghostdriver
            caps = DesiredCapabilities().PHANTOMJS
            caps["phantomjs.page.settings.userAgent"] = \
                'Mozilla/5.0 (X11; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0'
            service_args = [
                '--proxy={}'.format(':'.join(
                    self.account['Proxy'].split(':')[:2])),
                '--proxy-type=http',
            ]
            print(service_args)
            self.driver = PhantomJS(service_args=service_args,
                                    capabilities=caps)
        self.driver.set_window_size(1120, 550)
        #profile.set_preference("general.useragent.override","Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16")
        #profile.set_preference("general.useragent.override","Mozilla/5.0 (X11; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0")
        # profile.set_preference("browser.startup.homepage_override.mstone", "ignore");
        # profile.set_preference("startup.homepage_welcome_url.additional", "about:blank");
        # profile.set_preference("xpinstall.signatures.required", "false");
        # profile.set_preference("toolkit.telemetry.reportingpolicy.firstRun", "false");

    def run(self):
        '''Login and run in cycle'''
        self.create_driver()
        try:
            self.login()
            tosleep = datetime.combine(
                date.today(),
                dt_time(drophour, 00, 5, tzinfo=timezone.utc)) - \
                datetime.now(timezone.utc)
            tosleep = tosleep.seconds
            lg.info('Sleeping for {}'.format(tosleep))
            if '/pooh/' in path.expanduser('~'):
                tosleep = 0  # don't sleep on developer's host
            if not debug:
                sleep(tosleep)
            # Creating new window to work in (otherwise sometimes the page will ask whether we're ok to leave it)
            self.driver.execute_script(
                '''window.open('{}',"_blank");'''.format(solve_url))
            self.driver.switch_to.window(self.driver.window_handles[-1])
            lg.debug('Created new window')

            # Cycle here getting tokens until there are no more nocaptcha
            start_time = end_time = time()  # In case we have exception
            while True:  #for i in range(1):
                if self.solve():
                    break
                self.solved += 1
                end_time = time()
        except:
            lg.exception('In run')
            self.screenshot('exception')
        finally:
            lg.warning('Closing driver')
            with suppress(WebDriverException):
                self.driver.quit()
            rate = (end_time - start_time) / self.solved if self.solved else 0
            lg.warning('Solved: {} ({:.2f})'.format(self.solved, rate))
class CNStock(SentimentCrawler):
    def __init__(self):
        super().__init__(init=False)
        self.driver = PhantomJS()
        self.driver.maximize_window()
        self.wait = WebDriverWait(self.driver, 15)
        self.url = 'http://www.cnstock.com/'
        self.name = '中国证券网'

    def crawl_main_page(self, keyword):
        self.driver.set_page_load_timeout(10)
        try:
            self.driver.get(self.url)
        except TimeoutException:
            self.driver.execute_script('window.stop();')
        try:
            self.wait.until(
                ec.presence_of_element_located((By.ID, 'nav_keywords')))
        except:
            CustomLogging.log_to_file('中国证券网打开失败', LogType.ERROR)

        self.driver.find_element_by_id('nav_keywords').clear()
        self.driver.find_element_by_id('nav_keywords').send_keys(keyword +
                                                                  Keys.ENTER)

        return self.crawl_search_results()

    def crawl_search_results(self):
        search_results = []
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.driver.maximize_window()

        exit_flag = 0
        while True:
            try:
                self.wait.until(
                    ec.presence_of_element_located(
                        (By.CLASS_NAME, 'result-cont')))
            except TimeoutException:
                CustomLogging.log_to_file('中国证券网搜索结果页错误', LogType.ERROR)
                break

            try:
                result_articles = self.driver.find_elements_by_class_name(
                    'result-article')

                for each_article in result_articles:
                    item = Entity()
                    publish_date = each_article.find_element_by_class_name(
                        'g').text
                    item.publish_date = re.search(
                        re.compile(
                            '[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d'
                        ), publish_date).group()

                    if not in_date_range(
                            conv_pub_date(item.publish_date, 'cnstock'),
                            self.year_range):
                        exit_flag = 1
                        # break out of the for loop
                        break
                    item.short_description = each_article.find_element_by_class_name(
                        'des').text
                    item.title = each_article.find_element_by_tag_name(
                        'a').text
                    if self.keyword not in item.short_description and self.keyword not in item.title:
                        continue

                    if item.title in self.titles:
                        continue
                    else:
                        self.titles.append(item.title)

                    item.url = each_article.find_element_by_tag_name(
                        'a').get_attribute('href')

                    threading.Thread(target=super().download_and_save_item,
                                     args=(item, )).start()

                if exit_flag == 1:
                    break
            except NoSuchElementException:
                CustomLogging.log_to_file('没有搜索结果', LogType.INFO)
                break

            try:
                next_page = self.driver.find_element_by_xpath(
                    '//div[@class="pagination pagination-centered"]//a[contains(text(), "下一页")]'
                )
                self.driver.get(next_page.get_attribute('href'))
                # next_page.click()
            except NoSuchElementException:
                break

        return search_results

    def parse_html(self, url, html):
        bs = BeautifulSoup(html, 'lxml')
        try:
            full_content = bs.find('div', attrs={'id': 'qmt_content_div'}).text
            return full_content
        except Exception:
            CustomLogging.log_to_file('页面解析错误: {0}|{1}'.format(self.name, url),
                                      LogType.ERROR)
            pass
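# Hypothetical driver code for the CNStock crawler above (not part of the
# original snippet). SentimentCrawler, CustomLogging, Entity and the date
# helpers are assumed to come from the surrounding project.
crawler = CNStock()
crawler.crawl_main_page('改革')  # searches cnstock.com for the keyword;
                                 # matching articles are downloaded on
                                 # background threads by download_and_save_item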
from selenium.webdriver import PhantomJS

driver = PhantomJS(
    executable_path=
    r'E:\Documents\Apps\phantomjs-2.1.1-windows\bin\phantomjs.exe')
url = 'http://cxwh.kexing100.com:82/?app_act=detail&id=328&from=groupmessage'
driver.get(url)

while True:
    driver.refresh()
    print(driver.find_element_by_xpath("//div[@class='xinfo']").text)
    # driver.execute_script("return localStorage.setItem('toupiao','0')")
    driver.execute_script("return localStorage.removeItem('toupiao')")
    driver.delete_all_cookies()
    driver.refresh()
    vote = driver.find_element_by_xpath("//span/input[@class='btn1']").click()
    # break
def get_applications_in_page(self, scroll_script):
    applications = []
    driver = None
    try:
        desired_capabilities = dict(DesiredCapabilities.PHANTOMJS)
        desired_capabilities["phantomjs.page.settings.userAgent"] = useragent.get_random_agent(
            google_prop.user_agent_list_url)
        service_args = [
            '--load-images=no',
            '--proxy=%s' % (proxy.get_random_proxy(google_prop.proxy_list_url))
        ]
        driver = PhantomJS(desired_capabilities=desired_capabilities,
                           service_args=service_args)
        # driver = Firefox(firefox_profile=self.fp, proxy=self.proxy)

        if self.proxy_test:
            driver.get('http://curlmyip.com/')
            ip = driver.find_element_by_xpath('//body//pre').text
            print('ip : [ ' + ip + ' ]')
        else:
            driver.get(self.url)
            driver.execute_script(scroll_script)

            acknowledge = 0
            done = False
            while not done:
                scroll_finished = driver.execute_script("return scraperLoadCompleted")
                if scroll_finished:
                    if acknowledge == self.acknowledgements:
                        done = driver.execute_script("return scraperLoadCompleted")
                    else:
                        acknowledge += 1
                else:
                    acknowledge = 0
                time.sleep(5)  # Wait before retry

            product_matrix = driver.find_elements_by_class_name("card")
            for application in product_matrix:
                extracted_application = self.extract_application_data(application)
                # if extracted_application['app_price'] != -1:
                applications.append(extracted_application)

        driver.quit()

    except Exception as e:
        if driver is not None:
            driver.quit()

        if self.attempt < self.retries:
            self.attempt += 1
            time.sleep(10)
            print('retry : url [ ' + self.url + ' ] + | attempt [ ' +
                  str(self.attempt) + ' ] | error [ ' + str(e) + ' ]')
            applications = self.get_applications_in_page(scroll_script)
        else:
            print('fail : url [ ' + self.url + ' ] | error [ ' + str(e) + ' ]')

    return applications
def get_url_files(retail, invoice_doc_type, invoice_id, invoice_date, invoice_amount):
    retail_invoice_url = RETAIL_INVOICE_URL[retail]

    driver = PhantomJS()
    driver.get(retail_invoice_url)

    # 1 Set doc_type 'select'
    try:
        select_doc_type = Select(driver.find_element_by_name('txtTipoDte'))
        value = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['value']
        select_doc_type.select_by_value(value)
        # name = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['name']
        # select_doc_type.select_by_visible_text(name)
    except Exception:
        print('ERROR: set doc_type select as Boleta')
        driver.save_screenshot('screen.png')
        return '', ''

    time.sleep(5)

    # 2 Get recaptcha img url
    try:
        recaptcha_img = driver.find_element_by_id('recaptcha_challenge_image')
        recaptcha_img_url = recaptcha_img.get_attribute('src')
    except Exception:
        print('ERROR: get recaptcha image url')
        driver.save_screenshot('screen.png')
        return '', ''

    # 3 Solve recaptcha
    v = VisionApi()
    recaptcha_value = v.detect_text_from_url(recaptcha_img_url)

    if recaptcha_value is None:
        print('ERROR: solving recaptcha image')
        driver.save_screenshot('screen.png')
        return '', ''

    # 4 Fill form
    script = u"""
        document.getElementsByName('txtFolio')[0].value = '{invoice_id}';
        document.getElementsByName('txtFechaEmision')[0].value = '{invoice_date}';
        document.getElementsByName('txtMontoTotal')[0].value = '{invoice_amount}';
        document.getElementsByName('recaptcha_response_field')[0].value = '{recaptcha_value}';
    """.format(
        invoice_id=invoice_id,
        invoice_date=invoice_date,
        invoice_amount=invoice_amount,
        recaptcha_value=recaptcha_value,
    )
    driver.execute_script(script)

    # 5 Submit form
    try:
        driver.find_element_by_name('frmDatos').submit()
    except Exception:
        print('ERROR: submitting form')
        driver.save_screenshot('screen.png')
        return '', ''

    # 6 Get url files
    try:
        xml_a_tag = driver.find_element_by_xpath(
            '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[2]')
        pdf_a_tag = driver.find_element_by_xpath(
            '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[1]')

        xml_url = xml_a_tag.get_attribute('href')
        pdf_url = pdf_a_tag.get_attribute('href')
    except Exception:
        print('ERROR: getting url files')
        driver.save_screenshot('screen.png')
        return '', ''

    # 8 Delete driver session
    driver.close()
    driver.quit()

    return xml_url, pdf_url
class plugin:
    def __init__(self):
        APP_ROOT = os.path.dirname(os.path.abspath(__file__))
        print(APP_ROOT)
        self.req = 0
        self.driver = PhantomJS(APP_ROOT + "/phantomjs",
                                service_log_path=os.path.devnull)
        self.driver.implicitly_wait(3)

    def restart(self):
        self.__init__()

    def frame_search(self, path):
        framedict = {}
        for child_frame in self.driver.find_elements_by_tag_name('frame'):
            child_frame_name = child_frame.get_attribute('name')
            framedict[child_frame_name] = {'framepath': path, 'children': {}}
            xpath = '//frame[@name="{}"]'.format(child_frame_name)
            self.driver.switch_to.frame(
                self.driver.find_element_by_xpath(xpath))
            framedict[child_frame_name]['children'] = self.frame_search(
                framedict[child_frame_name]['framepath'] + [child_frame_name])

            self.driver.switch_to.default_content()
            if len(framedict[child_frame_name]['framepath']) > 0:
                for parent in framedict[child_frame_name]['framepath']:
                    parent_xpath = '//frame[@name="{}"]'.format(parent)
                    self.driver.switch_to.frame(
                        self.driver.find_element_by_xpath(parent_xpath))
        return framedict

    def tmon(self):
        self.driver.get(
            "https://login.ticketmonster.co.kr/user/loginform?return_url=")
        self.driver.find_element_by_name('userid').send_keys(
            config['ACCOUNT']['tmon_id'])
        self.driver.find_element_by_name('password').send_keys(
            config['ACCOUNT']['tmon_pw'])
        self.driver.find_element_by_xpath('//*[@id="loginFrm"]/a[2]').click()
        self.driver.get(
            'http://m.benefit.ticketmonster.co.kr/promotions/page/attendance?view_mode=app'
        )
        self.driver.find_element_by_xpath(
            '//*[@id="attn_wrap"]/div/div/div[3]/div[2]/div[1]/button').click()
        print(self.driver.find_element_by_class_name('content').text)
        self.tmon_ret = self.driver.find_element_by_class_name('content').text

    def ondisk(self):
        try:
            self.driver.get("http://ondisk.co.kr/index.php")
            self.driver.implicitly_wait(3)
            self.driver.find_element_by_xpath('//*[@id="mb_id"]').send_keys(
                config['ACCOUNT']['ondisk_id'])
            self.driver.find_element_by_xpath(
                '//*[@id="page-login"]/form/div[2]/p[2]/input').send_keys(
                    config['ACCOUNT']['ondisk_pw'])
            self.driver.find_element_by_xpath(
                '//*[@id="page-login"]/form/div[2]/p[3]/input').click()
            self.driver.get(
                "http://ondisk.co.kr/index.php?mode=eventMarge&sm=event&action=view&idx=746&event_page=1"
            )
            self.driver.switch_to_frame(1)
            self.driver.execute_script(
                "window.alert = function(msg){ window.msg = msg; };")
            self.driver.find_element_by_class_name('button').click()
            alert_text = self.driver.execute_script("return window.msg;")
            print(alert_text)
        except:
            print("ERR")
            print(self.driver.page_source)
        self.ondisk_ret = alert_text

    def ok_cash_bag(self):
        today = datetime.datetime.now().strftime("%Y%m%d")
        sess = requests.session()
        getdata = sess.get(
            "https://www.facebook.com/login.php?login_attempt=1&next=https%3A%2F%2Fwww.facebook.com%2Fv2.6%2Fdialog%2Foauth%3Fredirect_uri%3Dhttps%253A%252F%252Fmember.okcashbag.com%252Focb%252FsocialId%252FfacebookProcessor%26scope%3Dpublic_profile%252Cuser_birthday%252Cemail%26client_id%3D645711852239977%26ret%3Dlogin%26logger_id%3D91698e1d-fe1e-b325-4c13-b62636843a9e&lwv=101"
        )
        param = {
            "lsd": "AVpmy4vJ",
            "api_key": "645711852239977",
            "cancel_url": "https://member.okcashbag.com/ocb/socialId/facebookProcessor?error=access_denied&error_code=200&error_description=Permissions+error&error_reason=user_denied#_=_",
            "display": "page",
            "enable_profile_selector": "",
            "isprivate": "",
            "legacy_return": "0",
            "profile_selector_ids": "",
            "return_session": "",
            "skip_api_login": "******",
            "signed_next": "1",
            "trynum": "1",
            "timezone": "-540",
            "lgndim": "eyJ3IjoxOTIwLCJoIjoxMDgwLCJhdyI6MTkyMCwiYWgiOjEwNDAsImMiOjI0fQ==",
            "lgnrnd": "173648_UqkK",
            "lgnjs": "1528418208",
            "email": config['ACCOUNT']['fb_id'],
            "pass": config['ACCOUNT']['fb_pw'],
            "prefill_contact_point": config['ACCOUNT']['fb_id'],
            "prefill_source": "last_login",
            "prefill_type": "contact_point",
            "first_prefill_source": "last_login",
            "first_prefill_type": "contact_point",
            "had_cp_prefilled": "true",
            "had_password_prefilled": "false"
        }
        postdata = sess.post(
            "https://www.facebook.com/login.php?login_attempt=1&next=https%3A%2F%2Fwww.facebook.com%2Fv2.6%2Fdialog%2Foauth%3Fredirect_uri%3Dhttps%253A%252F%252Fmember.okcashbag.com%252Focb%252FsocialId%252FfacebookProcessor%26scope%3Dpublic_profile%252Cuser_birthday%252Cemail%26client_id%3D645711852239977%26ret%3Dlogin%26logger_id%3D91698e1d-fe1e-b325-4c13-b62636843a9e&lwv=101",
            data=param)
        # print(postdata.text)
        postdata = sess.post(
            "https://member.okcashbag.com//ocb/socialId/socialIdLoginProcess/42100/687474703A2F2F7777772e6f6b636173686261672e636f6d2F696e6465782e646f3F6c6f67696e3D59"
        )
        samlResponse = postdata.text.split("samlResponse.value = \"")[1].split(
            "\"")[0]
        # print(samlResponse)
        param = {"samlResponse": samlResponse, "sst_cd": "", "return_url": ""}
        postdata = sess.post("http://www.okcashbag.com/index.do?login=Y",
                             data=param)
        print(
            postdata.text.split('<span id="profileNickname" class="name">')
            [1].split("</span>")[0] + "님 로그인")
        print(
            postdata.text.split('<span id="spanUsablePoint">')[1].split(
                '</span>')[0] + "포인트")
        getdata = sess.get(
            "http://www.okcashbag.com/life/event/attend/attendMain.do")
        param = {"method": "", "myUrl": "", "recommUser": "", "today": today}
        postdata = sess.post(
            "http://www.okcashbag.com/life/event/attend/attend.do", data=param)
        print(postdata.text)
        if len(postdata.text.split('<i class="win-point">')) > 1:
            print(postdata.text.split('<i class="win-point">')[1] + "포인트 적립")
        elif len(postdata.text.split("success")) > 1:
            print("출석체크 완료 ")
            self.ok_ret = "출석체크 완료"
        else:
            print('이미 출석체크 완료')
            self.ok_ret = "이미 출석체크 완료"
class HeadlessBrowser(object):
    def __init__(self):
        self.backend = ['chrome', 'phantomjs']
        self.driver = None
        atexit.register(self.cleanup)

    def __getattribute__(self, item):
        attr = object.__getattribute__(self, item)
        if hasattr(attr, '__call__'):
            func_name = attr.__name__

            if func_name in self.backend:
                def wrap_func(*args, **kwargs):
                    if self.driver is not None:
                        self.cleanup()
                    result = attr(*args, **kwargs)
                    return result
            else:
                def wrap_func(*args, **kwargs):
                    if self.driver is None:
                        logger.warning('Driver is NOT initialized, skip %s' % func_name)
                        return
                    result = attr(*args, **kwargs)
                    return result

            return wrap_func
        else:
            return attr

    def cleanup(self):
        if self.driver is not None:
            logger.info('CLEAN driver: %s' % self.driver)
            self.driver.quit()
            self.driver = None

    def chrome(self, chromedriver_path=None, disable_log=True, strip_ua4headless=True):
        """
        Better to place chromedriver and chrome/chromium binaries in the PATH,
        in this case, parameter chromedriver_path could be omitted and set as None

        Otherwise place them under the same directory and set parameter chromedriver_path
        ---------------------------------------------------------------------------------
        If chromedriver and chrome/chromium are in different path,
        beyond chromedriver_path setting, chrome/chromium path should be set as:
        options.binary_location = '/path'
        """
        options = ChromeOptions()
        options.add_argument('headless')
        options.add_argument('no-sandbox')
        if disable_log:
            options.add_argument('log-level=3')
            options.add_experimental_option('excludeSwitches', ['enable-logging'])

        try:
            if chromedriver_path:
                self.driver = Chrome(options=options, executable_path=chromedriver_path)
            else:
                self.driver = Chrome(options=options)
        except WebDriverException as e:
            logger.error(e.msg)
            self.driver = None
            return

        # self.driver.set_page_load_timeout(20)

        if strip_ua4headless:
            import re
            ua = re.sub('(?i)headless', '', self.ua())
            self.driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": ua})

    def phantomjs(self, exe_path=None, disable_log=True, log_path='logs/ghostdriver.log'):
        service_args = []
        if disable_log:
            service_args.append('--webdriver-loglevel=NONE')

        # I know phantomjs is deprecated, but I DO NOT LIKE the warnings...
        import warnings
        backup = warnings.warn
        warnings.warn = str
        try:
            if exe_path:
                self.driver = PhantomJS(executable_path=exe_path,
                                        service_args=service_args,
                                        service_log_path=log_path)
            else:
                self.driver = PhantomJS(service_args=service_args,
                                        service_log_path=log_path)
        except WebDriverException as e:
            logger.error(e.msg)
            self.driver = None
            return
        finally:
            warnings.warn = backup

    def get(self, url, report_html=False):
        if not urlparse(url).scheme:
            url = 'http://%s' % url
        self.driver.get(url)
        return self.driver.page_source if report_html else None

    def ua(self):
        return str(self.driver.execute_script("return navigator.userAgent"))

    def zoom(self, level=1):
        if isinstance(level, (int, float)):
            self.driver.execute_script("document.body.style.zoom = '%s'" % level)

    def capture(self, url, png_name=None, zoom_level=1):
        self.get(url)
        self.zoom(zoom_level)
        if png_name is None or not str(png_name).endswith('.png'):
            result = urlparse(url)
            if not result.scheme:
                result = urlparse('http://%s' % url)
            png_name = '%s.png' % result.netloc

        width = self.driver.execute_script(
            "return Math.max(document.body.scrollWidth, \
                document.body.offsetWidth, \
                document.documentElement.clientWidth, \
                document.documentElement.scrollWidth, \
                document.documentElement.offsetWidth);")
        height = self.driver.execute_script(
            "return Math.max(document.body.scrollHeight, \
                document.body.offsetHeight, \
                document.documentElement.clientHeight, \
                document.documentElement.scrollHeight, \
                document.documentElement.offsetHeight);")
        # resize
        self.driver.set_window_size(width, height)
        self.driver.save_screenshot(png_name)
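# A minimal usage sketch for HeadlessBrowser (not from the original source).
# It assumes chromedriver is on the PATH and that `logger` is configured in
# the surrounding module.
browser = HeadlessBrowser()
browser.chrome()                  # or: browser.phantomjs(exe_path='/path/to/phantomjs')
html = browser.get('example.com', report_html=True)
browser.capture('example.com', png_name='example.png')  # full-page screenshot
browser.cleanup()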
class LegacySensCritique(object):
    CHANGEPAGE_TIMEOUT = 20
    '''
    Interact with SensCritique website
    '''

    def __init__(self, login, password, userAgent=LINUX_USER_AGENT):
        '''
        Constructor
        :param login:
        :param password:
        '''
        self.login = login
        self.password = password
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        dcap["phantomjs.page.settings.userAgent"] = (userAgent)
        self.driver = PhantomJS(desired_capabilities=dcap)
        self.driver.set_window_size(1366, 768)

    def sign_in(self):
        '''
        Sign-in to SensCritique using the given login details
        :rtype: bool
        :Return: true if login succeeded, false otherwise
        '''
        self.to(HomePage())
        self.page.alreadySuscribed().click()
        self.page.loginField().send_keys(self.login)
        self.page.passwordField().send_keys(self.password)
        self.page.submitLoginButton().click()
        #TODO changing page so wait or something
        currentUser = self.page.username(self.CHANGEPAGE_TIMEOUT)
        if currentUser is not None:
            self._currentUsername = currentUser.value()
            logging.warn("Logged in with user " + self._currentUsername)
            return True
        else:
            if self.page.loginError() is not None:
                # Part of the snippet is redacted in the source ("******"): the
                # rest of sign_in() and the helper that strips the list id from
                # its URL are missing; the surviving fragment is kept as-is.
                logging.error("Couldn't login : "******"/") + 1:]
                return l

    def deleteList(self, l: sclist):
        self.to(ListCollectionPage(self._currentUsername))
        for module in self.page.lists():
            if l.id() in module.url():
                # Alert box will be auto-accepted. Needed as Phantomjs cannot handle them
                self.driver.execute_script(
                    "window.confirm = function(msg) { return true; };")
                delete_button = module.delete_button()
                delete_action = ActionChains(self.driver)
                delete_action.move_to_element(module.title_node())
                delete_action.move_to_element(delete_button)
                delete_action.click(delete_button)
                delete_action.perform()

    def addMovie(self, movie: Movie, l: SCList):
        self.to(ListPage(l))
        self.page.query_input().send_keys(movie.title())
        add_button = self.page.add_movie_button(0)
        if add_button is None:
            return False  # Movie already in list
        if movie.description():
            self.page.movie_description_field(0).send_keys(movie.description())
        add_button.click()
        return True

    def deleteMovies(self, movies_to_delete, l: SCList):
        self.to(ListPage(l))
        for movie in self.page.movies():
            try:
                movies_to_delete.remove(movie.title())
                delete = movie.delete_button()
                delete.click()
                movie.confirm_delete_button().click()
                self.page.wait_loading_finished()
            except Exception as e:
                logging.error("Fail to delete movie " + movie.title() + ". " +
                              format(e))
        return movies_to_delete

    def to(self, page):
        page.to(self.driver)
        self.page = page

    def createSCListFromListModule(self, module: ListModule):
        list = sclist.SCList(module.id())
        list.setTitle(module.title())
        list.setDescription(module.description())
        list.setType(None)  # TODO: parse the type
        return list
... the site, however, refuses to show its page source directly; closer inspection shows that
prepending view-source: to the original URL does reveal the source, i.e.
view-source:http://ac.qq.com/ComicView/index/id/521825/cid/1
"""
from selenium.webdriver import PhantomJS, DesiredCapabilities
import time
import re

header = DesiredCapabilities.CHROME.copy()  # DesiredCapabilities lets PhantomJS masquerade as Chrome
web = PhantomJS(desired_capabilities=header,
                executable_path='F:/phantomjs-2.1.1-windows/bin/phantomjs'
                )  # the PhantomJS path must be set, otherwise it will not run
web.maximize_window()  # maximize the browser window
web.get('http://ac.qq.com/ComicView/index/id/521825/cid/1')  # load the page
web.get_screenshot_as_file(
    './abc.png')  # take a screenshot of the page and save it as abc.png

for page in range(1, 30):
    # window.scrollTo(0, {}) scrolls down one screen per iteration:
    # 1080*1 is the first screen, 1080*2 the second, and so on
    web.execute_script('window.scrollTo(0,{})'.format(1080 * page))
    time.sleep(1)

web.get_screenshot_as_file('./abc.png')  # screenshot of the last page

pat = 'https://manhua.qpic.cn/vertical/0/(.*?)"'  # regex capturing the image URLs
ls = re.compile(pat, re.S).findall(web.page_source)  # web.page_source is the page source

import urllib.request as r

for i in range(len(ls)):
    # The original snippet fetched http://www.baidu.com into a single file here;
    # downloading each captured comic image is presumably what was intended.
    r.urlretrieve("https://manhua.qpic.cn/vertical/0/" + ls[i],
                  filename="F:/pa/{}.jpg".format(i))
class PagesCrawler(Spider):

    name = 'pages'
    link_extractor = RegexpLinkExtractor(canonicalize=False,
                                         deny_extensions=[])
    ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS])

    def __init__(self, **kwargs):
        mongo = MongoClient(MONGO_HOST, MONGO_PORT)[MONGO_DB][MONGO_JOBS_COL]
        job = mongo.find_one({"_id": kwargs["job_id"]})
        args = job["crawl_arguments"]
        self.args = args
        self.start_urls = to_list(args['start_urls'])
        self.maxdepth = int(args['max_depth'])
        self.follow_prefixes = to_list(args['follow_prefixes'])
        self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
        self.prefixes_trie = LRUTrie()
        for p in self.follow_prefixes:
            self.prefixes_trie.set_lru(p, True)
        for p in self.nofollow_prefixes:
            self.prefixes_trie.set_lru(p, False)
        self.discover_prefixes = [
            url_to_lru_clean(
                "http%s://%s" %
                (https, u.replace('http://', '').replace('https://', '')),
                TLDS_TREE) for u in to_list(args['discover_prefixes'])
            for https in ['', 's']
        ]
        self.resolved_links = {}
        self.user_agent = args['user_agent']
        self.phantom = 'phantom' in args and args[
            'phantom'] and args['phantom'].lower() != "false"
        self.cookies = None
        if 'cookies' in args and args["cookies"]:
            self.cookies = dict(
                cookie.split('=', 1)
                for cookie in re.split(r'\s*;\s*', args['cookies'])
                if '=' in cookie)
        if self.phantom:
            self.ph_timeout = int(
                args.get('phantom_timeout', PHANTOM['TIMEOUT']))
            self.ph_idle_timeout = int(
                args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
            self.ph_ajax_timeout = int(
                args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
        self.errors = 0

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(PagesCrawler, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=spider_closed)
        crawler.signals.connect(spider.spider_crashed, signal=spider_error)
        return spider

    def start_requests(self):
        self.log(
            "Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'],
            logging.INFO)
        self.log("ARGUMENTS : " + str(self.args), logging.INFO)
        if self.phantom:
            self.init_phantom()
        for url in self.start_urls:
            yield self._request(url)

    def init_phantom(self):
        self.prefixfiles = os.path.join(scrapyd_config().get('logs_dir'),
                                        HYPHE_PROJECT, self.name,
                                        self.crawler.settings['JOBID'])
        self.log("Using path %s for PhantomJS crawl" % self.prefixfiles,
                 logging.INFO)
        phantom_args = []
        if PROXY and not PROXY.startswith(':'):
            phantom_args.append('--proxy=%s' % PROXY)
        phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' % self.prefixfiles)
        phantom_args.append('--ignore-ssl-errors=true')
        phantom_args.append('--load-images=false')
        self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
        self.capabilities['phantomjs.page.settings.userAgent'] = self.user_agent
        self.capabilities['takesScreenshot'] = False
        self.capabilities['phantomjs.page.settings.javascriptCanCloseWindows'] = False
        self.capabilities['phantomjs.page.settings.javascriptCanOpenWindows'] = False
        self.phantom = PhantomJS(executable_path=PHANTOM['PATH'],
                                 service_args=phantom_args,
                                 desired_capabilities=self.capabilities,
                                 service_log_path="%s-phantomjs.log" % self.prefixfiles)
        self.phantom.implicitly_wait(10)
        self.phantom.set_page_load_timeout(60)
        self.phantom.set_script_timeout(self.ph_timeout + 15)

    def spider_crashed(self, spider):
        self.errors += 1
        self.spider_closed(spider, reason="CRASH")

    def spider_closed(self, spider, reason=""):
        if self.errors:
            self.log(
                "%s error%s encountered during the crawl (%s)." %
                (self.errors, 's' if self.errors > 1 else '', reason),
                logging.ERROR)
        if self.phantom:
            self.phantom.quit()
            if not self.errors:
                for f in ["phantomjs-cookie.txt", "phantomjs.log"]:
                    fi = "%s-%s" % (self.prefixfiles, f)
                    if os.path.exists(fi) and not self.errors:
                        os.remove(fi)

    def handle_response(self, response):
        lru = url_to_lru_clean(response.url, TLDS_TREE)
        if self.phantom:
            self.phantom.get(response.url)

            # Collect whole DOM of the webpage including embedded iframes
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "get_iframes_content.js")) as js:
                get_bod_w_iframes = js.read()
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

            # Try to scroll and unfold page
            self.log("Start PhantomJS scrolling and unfolding", logging.INFO)
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "scrolldown_and_unfold.js")) as js:
                try:
                    signal.signal(signal.SIGALRM, timeout_alarm)
                    signal.alarm(self.ph_timeout + 30)
                    timedout = self.phantom.execute_async_script(
                        js.read(), self.ph_timeout, self.ph_idle_timeout,
                        self.ph_ajax_timeout)
                    signal.alarm(0)
                    if timedout:
                        raise SeleniumTimeout
                    self.log("Scrolling/Unfolding finished", logging.INFO)
                except SeleniumTimeout:
                    self.log(
                        "Scrolling/Unfolding timed-out (%ss)" % self.ph_timeout,
                        logging.WARNING)
                    self.errors += 1
                except WebDriverException as e:
                    err = json.loads(e.msg)['errorMessage']
                    self.log("Scrolling/Unfolding crashed: %s" % err,
                             logging.ERROR)
                    self.errors += 1
                except Exception as e:
                    self.log(
                        "Scrolling/Unfolding crashed: %s %s" % (type(e), e),
                        logging.ERROR)
                    self.errors += 1
                    return self._make_raw_page(response, lru)
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

        # Cleanup pages with base64 images embedded that make scrapy consider them not htmlresponses
        if response.status == 200 and not isinstance(response, HtmlResponse):
            try:
                flags = response.flags
                if "partial" in flags:
                    flags.remove("partial")
                flags.append("cleaned")
                response = HtmlResponse(response.url,
                                        headers=response.headers,
                                        body=cleanupbase64images(response.body),
                                        flags=flags,
                                        request=response.request)
                self.log(
                    "WARNING: page with base64 embedded images was cleaned-up for links extraction"
                )
            except:
                pass

        if 300 < response.status < 400 or isinstance(response, HtmlResponse):
            return self.parse_html(response, lru)
        else:
            return self._make_raw_page(response, lru)

    def handle_error(self, failure, response=None):
        if response:
            p = self._make_raw_page(response, failure.request.url)
            p['error'] = error_name(failure.value)
            return p
        elif not "://www" in failure.request.url:
            return self._request(failure.request.url.replace('://', '://www.'))
        error = failure.getErrorMessage()
        self.log("ERROR : %s" % error, logging.ERROR)
        if PROXY and not PROXY.startswith(':') and "OpenSSL.SSL.Error" in error:
            return self._request(failure.request.url, noproxy=True)
        self.errors += 1
        return

    def parse_html(self, response, lru):
        lrulinks = []
        # handle redirects
        realdepth = response.meta['depth']
        if 300 < response.status < 400:
            redir_url = response.headers['Location']
            if redir_url.startswith('/'):
                redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'),
                                      redir_url)
            elif redir_url.startswith('../'):
                lrustart = lru[:lru.rfind('|p:')]
                while redir_url.startswith('../'):
                    lrustart = lrustart[:lrustart.rfind('|p:')]
                    redir_url = redir_url[3:]
                redir_url = "%s/%s" % (lru_to_url(lrustart + '|'), redir_url)
            elif redir_url.startswith('./') or not redir_url.startswith('http'):
                redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'),
                                      redir_url[1:])
            links = [{'url': redir_url}]
            response.meta['depth'] -= 1
        else:
            try:
                links = self.link_extractor.extract_links(response)
            except Exception as e:
                self.log(
                    "ERROR: links extractor crashed on %s: %s %s" %
                    (response, type(e), e), logging.ERROR)
                links = []
                self.errors += 1
        for link in links:
            try:
                url = link.url
            except AttributeError:
                url = link['url']
            try:
                lrulink = url_to_lru_clean(url, TLDS_TREE)
            except (ValueError, IndexError) as e:
                self.log("Error converting URL %s to LRU: %s" % (url, e),
                         logging.ERROR)
                continue
            lrulinks.append((url, lrulink))
            if self._should_follow(response.meta['depth'], lrulink) and \
                    not url_has_any_extension(url, self.ignored_exts):
                yield self._request(url)
        response.meta['depth'] = realdepth
        yield self._make_html_page(response, lru, lrulinks)

    def _make_html_page(self, response, lru, lrulinks):
        p = self._make_raw_page(response, lru)
        if STORE_HTML:
            p['body'] = Binary(response.body.encode('zip'))
        p['lrulinks'] = lrulinks
        return p

    def _make_raw_page(self, response, lru):
        p = self._new_page(response.url, lru)
        p['status'] = response.status
        p['size'] = len(response.body)
        if isinstance(response, HtmlResponse):
            p['encoding'] = response.encoding
        if response.meta.get('depth'):
            p['depth'] = response.meta['depth']
        if response.headers.get('content-type'):
            p['content_type'] = response.headers.get('content-type').partition(
                ';')[0]
        p['error'] = None
        return p

    def _new_page(self, url, lru=None):
        if lru is None:
            lru = url_to_lru_clean(url, TLDS_TREE)
        p = Page()
        p['url'] = url
        p['lru'] = lru
        p['depth'] = 0
        p['timestamp'] = int(time.time() * 1000)
        return p

    def _should_follow(self, depth, tolru):
        c1 = depth < self.maxdepth
        c2 = self.prefixes_trie.match_lru(tolru)
        return c1 and c2

    def _request(self, url, noproxy=False, **kw):
        kw['meta'] = {'handle_httpstatus_all': True, 'noproxy': noproxy}
        kw['callback'] = self.handle_response
        kw['errback'] = self.handle_error
        if self.cookies:
            kw['cookies'] = self.cookies
        if self.phantom:
            kw['method'] = 'HEAD'
        return Request(url, **kw)
class PagesCrawler(BaseSpider):

    name = 'pages'
    link_extractor = RegexpLinkExtractor(canonicalize=False,
                                         deny_extensions=[])
    ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS])

    def __init__(self, **kw):
        args = DEFAULT_INPUT.copy()
        args.update(kw)
        self.args = args
        self.start_urls = to_list(args['start_urls'])
        self.maxdepth = int(args['maxdepth'])
        self.follow_prefixes = to_list(args['follow_prefixes'])
        self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
        self.discover_prefixes = [
            url_to_lru_clean(
                "http%s://%s" %
                (https, u.replace('http://', '').replace('https://', '')))
            for u in to_list(args['discover_prefixes']) for https in ['', 's']
        ]
        self.resolved_links = {}
        self.user_agent = args['user_agent']
        self.phantom = 'phantom' in args and args[
            'phantom'] and args['phantom'].lower() != "false"
        if self.phantom:
            self.ph_timeout = int(
                args.get('phantom_timeout', PHANTOM['TIMEOUT']))
            self.ph_idle_timeout = int(
                args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
            self.ph_ajax_timeout = int(
                args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
        self.errors = 0
        dispatcher.connect(self.closed, spider_closed)
        dispatcher.connect(self.crashed, spider_error)

    def start_requests(self):
        self.log(
            "Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'],
            log.INFO)
        self.log("ARGUMENTS : " + str(self.args), log.INFO)
        if self.phantom:
            self.init_phantom()
        for url in self.start_urls:
            yield self._request(url)

    def init_phantom(self):
        self.prefixfiles = os.path.join(scrapyd_config().get('logs_dir'),
                                        HYPHE_PROJECT, self.name,
                                        self.crawler.settings['JOBID'])
        self.log("Using path %s for PhantomJS crawl" % self.prefixfiles,
                 log.INFO)
        phantom_args = []
        if PROXY and not PROXY.startswith(':'):
            phantom_args.append('--proxy=%s' % PROXY)
        phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' % self.prefixfiles)
        phantom_args.append('--ignore-ssl-errors=true')
        phantom_args.append('--load-images=false')
        self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
        self.capabilities['phantomjs.page.settings.userAgent'] = self.user_agent
        self.capabilities['takesScreenshot'] = False
        self.capabilities['phantomjs.page.settings.javascriptCanCloseWindows'] = False
        self.capabilities['phantomjs.page.settings.javascriptCanOpenWindows'] = False
        self.phantom = PhantomJS(executable_path=PHANTOM['PATH'],
                                 service_args=phantom_args,
                                 desired_capabilities=self.capabilities,
                                 service_log_path="%s-phantomjs.log" % self.prefixfiles)
        self.phantom.implicitly_wait(10)
        self.phantom.set_page_load_timeout(60)
        self.phantom.set_script_timeout(self.ph_timeout + 15)

    def crashed(self, spider):
        self.errors += 1
        self.closed("CRASH")

    def closed(self, reason):
        if self.errors:
            self.log(
                "%s error%s encountered during the crawl." %
                (self.errors, 's' if self.errors > 1 else ''), log.ERROR)
        if self.phantom:
            self.phantom.quit()
            if not self.errors:
                for f in ["phantomjs-cookie.txt", "phantomjs.log"]:
                    fi = "%s-%s" % (self.prefixfiles, f)
                    if os.path.exists(fi) and not self.errors:
                        os.remove(fi)

    def handle_response(self, response):
        lru = url_to_lru_clean(response.url)
        if self.phantom:
            self.phantom.get(response.url)

            # Collect whole DOM of the webpage including embedded iframes
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "get_iframes_content.js")) as js:
                get_bod_w_iframes = js.read()
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

            # Try to scroll and unfold page
            self.log("Start PhantomJS scrolling and unfolding", log.INFO)
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "scrolldown_and_unfold.js")) as js:
                try:
                    signal.signal(signal.SIGALRM, timeout_alarm)
                    signal.alarm(self.ph_timeout + 30)
                    timedout = self.phantom.execute_async_script(
                        js.read(), self.ph_timeout, self.ph_idle_timeout,
                        self.ph_ajax_timeout)
                    signal.alarm(0)
                    if timedout:
                        raise SeleniumTimeout
                    self.log("Scrolling/Unfolding finished", log.INFO)
                except SeleniumTimeout:
                    self.log(
                        "Scrolling/Unfolding timed-out (%ss)" % self.ph_timeout,
                        log.WARNING)
                    self.errors += 1
                except WebDriverException as e:
                    err = json.loads(e.msg)['errorMessage']
                    self.log("Scrolling/Unfolding crashed: %s" % err, log.ERROR)
                    self.errors += 1
                except Exception as e:
                    self.log(
                        "Scrolling/Unfolding crashed: %s %s" % (type(e), e),
                        log.ERROR)
                    self.errors += 1
                    return self._make_raw_page(response, lru)
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

        # Cleanup pages with base64 images embedded that make scrapy consider them not htmlresponses
        if response.status == 200 and not isinstance(response, HtmlResponse):
            try:
                flags = response.flags
                if "partial" in flags:
                    flags.remove("partial")
                flags.append("cleaned")
                response = HtmlResponse(response.url,
                                        headers=response.headers,
                                        body=cleanupbase64images(response.body),
                                        flags=flags,
                                        request=response.request)
                self.log(
                    "WARNING: page with base64 embedded images was cleaned-up for links extraction"
                )
            except:
                pass

        if 300 < response.status < 400 or isinstance(response, HtmlResponse):
            return self.parse_html(response, lru)
        else:
            return self._make_raw_page(response, lru)

    def handle_error(self, failure, response=None):
        if response:
            p = self._make_raw_page(response, failure.request.url)
            p['error'] = error_name(failure.value)
            return p
        elif not "://www" in failure.request.url:
            return self._request(failure.request.url.replace('://', '://www.'))
        error = failure.getErrorMessage()
        self.log("ERROR : %s" % error, log.ERROR)
        if PROXY and not PROXY.startswith(':') and "OpenSSL.SSL.Error" in error:
            return self._request(failure.request.url, noproxy=True)
        self.errors += 1
        return

    def parse_html(self, response, lru):
        lrulinks = []
        # handle redirects
        realdepth = response.meta['depth']
        if 300 < response.status < 400:
            redir_url = response.headers['Location']
            if redir_url.startswith('/'):
                redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'),
                                      redir_url)
            elif redir_url.startswith('./') or not redir_url.startswith('http'):
                redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'),
                                      redir_url[1:])
            links = [{'url': redir_url}]
            response.meta['depth'] -= 1
        else:
            try:
                links = self.link_extractor.extract_links(response)
            except Exception as e:
                self.log(
                    "ERROR: links extractor crashed on %s: %s %s" %
                    (response, type(e), e), log.ERROR)
                links = []
                self.errors += 1
        for link in links:
            try:
                url = link.url
            except AttributeError:
                url = link['url']
            try:
                lrulink = url_to_lru_clean(url)
            except ValueError as e:
                self.log("Error converting URL %s to LRU: %s" % (url, e),
                         log.ERROR)
                continue
            lrulinks.append((url, lrulink))
            if self._should_follow(response.meta['depth'], lru, lrulink) and \
                    not url_has_any_extension(url, self.ignored_exts):
                yield self._request(url)
        response.meta['depth'] = realdepth
        yield self._make_html_page(response, lru, lrulinks)
class PagesCrawler(BaseSpider):

    name = 'pages'
    link_extractor = RegexpLinkExtractor(canonicalize=False,
                                         deny_extensions=[])
    ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS])

    def __init__(self, **kw):
        args = DEFAULT_INPUT.copy()
        args.update(kw)
        self.args = args
        self.start_urls = to_list(args['start_urls'])
        self.maxdepth = int(args['maxdepth'])
        self.follow_prefixes = to_list(args['follow_prefixes'])
        self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
        self.discover_prefixes = [
            url_to_lru_clean(
                "http%s://%s" %
                (https, u.replace('http://', '').replace('https://', '')))
            for u in to_list(args['discover_prefixes']) for https in ['', 's']
        ]
        self.resolved_links = {}
        self.user_agent = args['user_agent']
        self.phantom = 'phantom' in args and args[
            'phantom'] and args['phantom'].lower() != "false"
        if self.phantom:
            self.ph_timeout = int(
                args.get('phantom_timeout', PHANTOM['TIMEOUT']))
            self.ph_idle_timeout = int(
                args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
            self.ph_ajax_timeout = int(
                args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
        self.errors = 0
        dispatcher.connect(self.closed, spider_closed)
        dispatcher.connect(self.crashed, spider_error)

    def start_requests(self):
        self.log(
            "Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'],
            log.INFO)
        self.log("ARGUMENTS : " + str(self.args), log.INFO)
        if self.phantom:
            self.init_phantom()
        for url in self.start_urls:
            yield self._request(url)

    def init_phantom(self):
        self.prefixfiles = os.path.join(scrapyd_config().get('logs_dir'),
                                        HYPHE_PROJECT, self.name,
                                        self.crawler.settings['JOBID'])
        self.log("Using path %s for PhantomJS crawl" % self.prefixfiles,
                 log.INFO)
        phantom_args = []
        if PROXY and not PROXY.startswith(':'):
            phantom_args.append('--proxy=%s' % PROXY)
        phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' % self.prefixfiles)
        phantom_args.append('--ignore-ssl-errors=true')
        phantom_args.append('--load-images=false')
        self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
        self.capabilities['phantomjs.page.settings.userAgent'] = self.user_agent
        self.capabilities['takesScreenshot'] = False
        self.capabilities['phantomjs.page.settings.javascriptCanCloseWindows'] = False
        self.capabilities['phantomjs.page.settings.javascriptCanOpenWindows'] = False
        self.phantom = PhantomJS(executable_path=PHANTOM['PATH'],
                                 service_args=phantom_args,
                                 desired_capabilities=self.capabilities,
                                 service_log_path="%s-phantomjs.log" % self.prefixfiles)
        self.phantom.implicitly_wait(10)
        self.phantom.set_page_load_timeout(60)
        self.phantom.set_script_timeout(self.ph_timeout + 15)

    def crashed(self, spider):
        self.errors += 1
        self.closed("CRASH")

    def closed(self, reason):
        if self.errors:
            self.log(
                "%s error%s encountered during the crawl." %
                (self.errors, 's' if self.errors > 1 else ''), log.ERROR)
        if self.phantom:
            self.phantom.quit()
            if not self.errors:
                for f in ["phantomjs-cookie.txt", "phantomjs.log"]:
                    fi = "%s-%s" % (self.prefixfiles, f)
                    if os.path.exists(fi) and not self.errors:
                        os.remove(fi)

    def handle_response(self, response):
        lru = url_to_lru_clean(response.url)
        if self.phantom:
            self.phantom.get(response.url)

            # Collect whole DOM of the webpage including embedded iframes
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "get_iframes_content.js")) as js:
                get_bod_w_iframes = js.read()
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

            # Try to scroll and unfold page
            self.log("Start PhantomJS scrolling and unfolding", log.INFO)
            with open(
                    os.path.join(PHANTOM["JS_PATH"],
                                 "scrolldown_and_unfold.js")) as js:
                try:
                    signal.signal(signal.SIGALRM, timeout_alarm)
                    signal.alarm(self.ph_timeout + 30)
                    timedout = self.phantom.execute_async_script(
                        js.read(), self.ph_timeout, self.ph_idle_timeout,
                        self.ph_ajax_timeout)
                    signal.alarm(0)
                    if timedout:
                        raise SeleniumTimeout
                    self.log("Scrolling/Unfolding finished", log.INFO)
                except SeleniumTimeout:
                    self.log(
                        "Scrolling/Unfolding timed-out (%ss)" % self.ph_timeout,
                        log.WARNING)
                    self.errors += 1
                except WebDriverException as e:
                    err = json.loads(e.msg)['errorMessage']
                    self.log("Scrolling/Unfolding crashed: %s" % err, log.ERROR)
                    self.errors += 1
                except Exception as e:
                    self.log(
                        "Scrolling/Unfolding crashed: %s %s" % (type(e), e),
                        log.ERROR)
                    self.errors += 1
                    return self._make_raw_page(response, lru)
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

        # Cleanup pages with base64 images embedded that make scrapy consider them not htmlresponses
        if response.status == 200 and not isinstance(response, HtmlResponse):
            try:
                flags = response.flags
                if "partial" in flags:
                    flags.remove("partial")
                flags.append("cleaned")
                response = HtmlResponse(response.url,
                                        headers=response.headers,
                                        body=cleanupbase64images(response.body),
                                        flags=flags,
                                        request=response.request)
                self.log(
                    "WARNING: page with base64 embedded images was cleaned-up for links extraction"
                )
            except:
                pass

        if 300 < response.status < 400 or isinstance(response, HtmlResponse):
            return self.parse_html(response, lru)
        else:
            return self._make_raw_page(response, lru)

    def handle_error(self, failure, response=None):
        if response:
            p = self._make_raw_page(response, failure.request.url)
            p['error'] = error_name(failure.value)
            return p
        elif not "://www" in failure.request.url:
            return self._request(failure.request.url.replace('://', '://www.'))
        self.log("ERROR : %s" % failure.getErrorMessage(), log.ERROR)
        self.errors += 1
        return

    def parse_html(self, response, lru):
        lrulinks = []
        # handle redirects
        realdepth = response.meta['depth']
        if 300 < response.status < 400:
            redir_url = response.headers['Location']
            if redir_url.startswith('/'):
                redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'),
                                      redir_url)
            elif redir_url.startswith('./') or not redir_url.startswith('http'):
                redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'),
                                      redir_url[1:])
            links = [{'url': redir_url}]
            response.meta['depth'] -= 1
        else:
            try:
                links = self.link_extractor.extract_links(response)
            except Exception as e:
                self.log(
                    "ERROR: links extractor crashed on %s: %s %s" %
                    (response, type(e), e), log.ERROR)
                links = []
                self.errors += 1
        for link in links:
            try:
                url = link.url
            except AttributeError:
                url = link['url']
            try:
                lrulink = url_to_lru_clean(url)
            except ValueError as e:
                self.log("Error converting URL %s to LRU: %s" % (url, e),
                         log.ERROR)
                continue
            lrulinks.append((url, lrulink))
            if self._should_follow(response.meta['depth'], lru, lrulink) and \
                    not url_has_any_extension(url, self.ignored_exts):
                yield self._request(url)
        response.meta['depth'] = realdepth
        yield self._make_html_page(response, lru, lrulinks)
class CamaraCGCrawler(object):
    """ Camara CG Ementa Crawler """

    def __init__(self, starting_year):
        self.base_url = "http://187.115.174.90:8080/ScanLexWeb"
        self.starting_year = starting_year
        self.browser = None

    @staticmethod
    def get_ementa_id(published_date, ementa_type, ementa_doc_number, ementa_situation):
        """ Return the Ementa Unique Id """
        return "%s#%s#%s#%s" % (datetime.strftime(published_date, "%Y-%m-%d"),
                                ementa_type, ementa_doc_number, ementa_situation)

    def get_all_ementas_summary(self):
        """ Yield the next ementa information row """
        browser_table = self.browser.find_element_by_id("frmMenu:tabEmentas_data")
        bs_ementa_table = BeautifulSoup(browser_table.get_attribute("innerHTML"))

        for row in bs_ementa_table.find_all("tr"):
            cols = row.find_all("td")
            if len(cols) == 6:
                published_date = datetime.strptime(
                    cols[0].span.text.encode("utf-8"), "%d/%m/%Y")
                doc_number = int(cols[1].span.text.encode("utf-8"))
                title = cols[2].span.text.encode("utf-8")
                ementa_type = cols[3].span.text.encode("utf-8")
                ementa_situation = cols[4].span.text.encode("utf-8")
                details_js = cols[5].a['onclick'].encode("utf-8")

                if published_date > datetime.now():
                    continue

                yield published_date, doc_number, title, ementa_type, ementa_situation, details_js

    def get_ementa_details(self, ementa_details_js):
        """ Crawl the second ementa page """
        # Waiting...
        _ = WebDriverWait(self.browser, 30).until(
            EC.visibility_of_element_located((By.ID, "frmfuncao:j_idt13_content")))
        _ = WebDriverWait(self.browser, 30).until(
            EC.visibility_of_element_located((By.ID, "frmfuncao:tabProponentes")))

        # Get Ementail Details
        bs_ementa_details = BeautifulSoup(self.browser \
            .find_element_by_id("frmfuncao:j_idt13_content").get_attribute("innerHTML"))
        rows = bs_ementa_details.find_all("tr")
        source = rows[3].td.text
        main_theme = rows[7].td.text
        sys_enter_date = datetime.strptime(rows[9].td.text, "%d/%m/%Y")
        approval_date = datetime.strptime(rows[11].td.text, "%d/%m/%Y")
        process_number = int(rows[15].td.text or "-1")
        autograph_number = int(rows[19].td.text or "-1")
        process_year = int(rows[21].td.text or "-1")
        has_image = rows[23].td.text == "Sim"

        # Get Proponent names
        bs_proponent = BeautifulSoup(
            self.browser.find_element_by_id(
                "frmfuncao:tabProponentes").get_attribute("innerHTML"))
        proponents = ",".join([col.text for col in bs_proponent.find_all("td")])

        return source, proponents, main_theme, sys_enter_date, approval_date, process_number, \
            autograph_number, process_year, has_image

    def next_ementa(self, select_curs):
        """ Iterate in the years onwards and collect all the ementas """
        try:
            LOGGER.info("Opening Browser")
            self.browser = PhantomJS()

            LOGGER.info("GET [%s]", self.base_url)
            self.browser.maximize_window()

            cur_year = int(datetime.now().year)

            # Define the initial collection year
            select_curs.execute(
                "SELECT EXTRACT (YEAR FROM MAX(published_date)) FROM ementas;")
            last_exec_year = select_curs.fetchone()
            if last_exec_year:
                collection_year = max(self.starting_year, last_exec_year[0])
            else:
                collection_year = self.starting_year

            all_proponents = [
                "ANDERSON MAIA", "Afonso Alexandre Régis", "Alcides Cavalcante",
                "Alcindor Villarim", "Aldo Cabral", "Alexandre do Sindicato",
                "Antonio Pereira", "Antônio Alves Pimentel Filho", "Aragão Júnior",
                "Bruno Cunha Lima Branco", "Bruno Gaudêncio", "Buchada",
                "Cassiano Pascoal", "Cozete Babosa", "Cássio Murilo Galdino de Araujo",
                "Daniella Ribeiro", "Dr. Nunes", "Executivo", "Fabrinni Brito",
                "Fernando carvalho", "Francisco Dantas Lira", "Galego do Leite",
                "Inacio Falcao", "Ivan Batista", "Ivonete Ludgerio", "Joao Dantas",
                "Josimar Henrique da Silva", "José Marcos Raia ", "José Ribamar",
                "João Dantas", "Jóia Germano", "Laelson Patricio", "Lafite",
                "Lindaci Medeiros Nápolis", "Lourdes Costa", "Lula Cabral",
                "Marcos Marinho", "Maria Lopes Barbosa", "Marinaldo Cardoso",
                "Metuselá Agra", "Miguel Rodrigues da Silva", "Miguel da Construção",
                "Napoleão Maracajá", "Nelson Gomes Filho", "Olimpio Oliveira",
                "Orlandino Farias", "Paulo Muniz", "Paulo de Tarso",
                "Peron Ribeiro Japiassú", "Renato Feliciano", "Rodolfo Rodrigues",
                "Rodrigo Ramos Victor", "Romero Rodrigues", "Rostand Paraíba",
                "Rômulo Gouveia", "Saulo Germano", "Saulo Noronha", "Tia Mila",
                "Tovar Correia Lima", "Vaninho Aragão", "Veneziano Vital do rego",
                "Walter Brito Neto", "Todos"
            ]

            while collection_year <= cur_year:
                for i_prop in range(len(all_proponents)):
                    ementa_prop = all_proponents[i_prop].decode("utf-8")

                    self.browser.get(self.base_url)

                    # Waiting...
                    WebDriverWait(self.browser, 30).until(
                        EC.element_to_be_clickable((By.ID, "frmMenu:button1")))

                    LOGGER.info("Collecting Ementas from [%d][%s - %d/%d]",
                                collection_year, ementa_prop, i_prop + 1,
                                len(all_proponents))

                    # Set Year
                    year_field = self.browser.find_element_by_id("frmMenu:ano")
                    year_field.send_keys(collection_year)

                    # Set Proponent
                    proponent_field = self.browser.find_element_by_id(
                        "frmMenu:autoridade")
                    proponent_field.send_keys(ementa_prop)

                    # Submit the form
                    self.browser.find_element_by_id("frmMenu:button1").click()

                    # Waiting...
                    # _ = WebDriverWait(self.browser, 60).until(EC.visibility_of_element_located((By.ID, "frmMenu:tabEmentas_data")))
                    time.sleep(3)

                    for published_date, document_number, title, ementa_type, ementa_situation, ementa_details_js in self.get_all_ementas_summary():
                        ementa_id = self.get_ementa_id(published_date, ementa_type,
                                                       document_number,
                                                       ementa_situation)

                        select_curs.execute("""
                            SELECT ementa_id FROM ementas WHERE ementa_id = '%s';
                        """ % ementa_id)

                        if not select_curs.fetchone():
                            # Run the details script
                            self.browser.execute_script(ementa_details_js)

                            ementa_source, proponents, main_theme, sys_enter_date, approval_date, \
                                process_number, autograph_number, process_year, has_image = self.get_ementa_details(ementa_details_js)

                            # Come back to the table page
                            self.browser.back()

                            # Waiting...
                            _ = WebDriverWait(self.browser, 60).until(
                                EC.visibility_of_element_located(
                                    (By.ID, "frmMenu:tabEmentas_data")))

                            yield ementa_id, published_date, ementa_type, document_number, title, \
                                ementa_source, proponents, ementa_situation, main_theme, sys_enter_date, \
                                approval_date, process_number, autograph_number, process_year, has_image

                LOGGER.info("DONE [%d]", collection_year)

                self.browser.back()
                collection_year += 1
        finally:
            if self.browser:
                self.browser.quit()