def _init_robot(self, id):
    """Spawn a browser, log in to Facebook, and resolve names for queued ids.

    Pops user ids from the shared ``self.remain_ids`` list (guarded by
    ``self.lock``) until it is empty, recording successes in
    ``self.done_ids`` and failures in ``self.error_ids``.

    :param id: key under which this driver is registered in ``self.robots``.
    """
    robot = WDriver()
    logging.debug("initialize")
    self.robots.update({str(id): robot})
    logging.debug("get facebook.com")
    robot.get('http://fb.com')
    logging.debug("login")
    # NOTE(review): credentials are hard-coded; move them to configuration.
    robot.find_element_by_name('email').send_keys('*****@*****.**')
    robot.find_element_by_name('pass').send_keys('2855930022040')
    robot.find_element_by_name('pass').send_keys(Keys.RETURN)
    # Drain the shared queue with a `while` loop instead of iterating
    # range(len(...)): other workers pop from the same list concurrently,
    # so a pre-computed count could pop from an empty list.  The lock is
    # released via try/finally so an error cannot leave it held.
    while True:
        self.lock.acquire()
        try:
            if not self.remain_ids:
                break
            user_id = self.remain_ids.pop()
        finally:
            self.lock.release()
        try:
            self.get_name_for_id(robot, user_id)
        except Exception:  # was a bare `except:`; don't swallow SystemExit
            logging.debug("error while updating record with id=%s" % str(user_id))
            self.error_ids.add(user_id)
        else:
            self.done_ids.add(user_id)
    robot.close()
    return
def main(argv=None):
    """Open the page given by ``--url`` and print the text of every <li>.

    :param argv: command-line arguments; defaults to ``sys.argv[1:]``
        evaluated at call time (the original default was captured once,
        at import time).
    """
    if argv is None:
        argv = sys.argv[1:]
    parser = argparse.ArgumentParser()
    parser.add_argument('--url',
                        default='http://127.0.0.1:8000/static/index.html')
    args = parser.parse_args(argv)
    browser = WebDriver()
    try:
        browser.get(args.url)
        for tag in browser.find_elements_by_css_selector('li'):
            print(tag.text)
    finally:
        # Always release the browser session, even if scraping fails.
        browser.close()
class Client:
    """Thin Selenium wrapper around an Instagram profile/post page."""

    def __init__(self, ig_id):
        # Navigate straight to the profile so later calls can read the
        # window._sharedData blob the page populates.
        self.b = PhantomJS()
        self.ig_id = ig_id
        self.b.get('https://instagram.com/%s' % ig_id)

    def close(self):
        """Close the underlying browser window."""
        self.b.close()

    def get_media(self) -> list:
        """Return media dicts from the current post page mentioning people.

        Supports single images (GraphImage) and carousels (GraphSidecar).
        Any other post type yields an empty list — the original raised
        UnboundLocalError in that case.  Entries whose accessibility
        caption is missing/None are treated as "no people" instead of
        crashing the `in` test.
        """
        js = self.b.execute_script('return window._sharedData;')
        sc = js['entry_data']['PostPage'][0]['graphql']['shortcode_media']
        medias = []  # default for unknown __typename values
        if sc['__typename'] == 'GraphSidecar':
            edges = sc['edge_sidecar_to_children']['edges']
            medias = [{
                'id': e['node']['id'],
                'url': e['node']['display_url'],
                'caption': e['node']['accessibility_caption'],
            } for e in edges]
        elif sc['__typename'] == 'GraphImage':
            medias = [{
                'id': sc['id'],
                'url': sc['display_url'],
                'caption': sc['accessibility_caption'],
            }]
        return [m for m in medias
                if m['caption'] and ('person' in m['caption']
                                     or 'people' in m['caption'])]

    def get_user(self) -> dict:
        """Return the 'user' graphql node from the current profile page."""
        js = self.b.execute_script('return window._sharedData;')
        return js['entry_data']['ProfilePage'][0]['graphql']['user']

    def get_posts(self) -> set:
        """Return the set of post URLs currently visible on the page."""
        anchors = self.b.find_elements_by_css_selector('a[href^="/p/"]')
        return set(a.get_attribute('href') for a in anchors)

    def scroll(self):
        """Scroll to the bottom of the page to trigger lazy loading."""
        self.b.execute_script('window.scroll(0, document.body.scrollHeight);')
def post(self):
    """Snapshot the page referenced by the 'page' request parameter.

    Renders the page's baseurl in PhantomJS, stores the HTML and a PNG
    screenshot as a new Snap pushed onto the page, and returns the snap id
    as JSON.  On a rendering failure the error text is stored as the HTML.
    """
    page_id = request.values['page']  # renamed from `id` (shadowed builtin)
    page = Page.objects.get_or_404(id=page_id)
    screenshot = None
    try:
        phantom = PhantomJS(
            desired_capabilities={'acceptSslCerts': True},
            service_args=['--web-security=false',
                          '--ssl-protocol=any',
                          '--ignore-ssl-errors=true'],
            port=8888)
        phantom.set_window_size(1024, 768)
        phantom.get(page.baseurl)
        html = phantom.page_source
        screenshot = phantom.get_screenshot_as_png()
        phantom.close()
    except Exception as ex:
        # Best-effort: store the error text instead of the page HTML.
        html = "error when i snap your page ... %s" % ex
    # The original saved the snap and pushed it onto the page TWICE,
    # creating a duplicate document on every request; do it once.
    snap = Snap(html, datetime.datetime.now(), screenshot).save()
    page.update(push__snaps=snap)
    return jsonify({'id': "%s" % snap.id})
def get_video_src(self):
    """Resolve the <video> element's src for a live-room URL via PhantomJS.

    Returns the stream URL on success, a "host is gone" message when no
    <video> tag is found, or a "not supported" message when the room URL's
    host has no matching rule in ``self.rules``.
    """
    # Dispatch on the room URL's host name; bail out early when unknown.
    o = urlparse(self.room_url)
    rule_key = o.netloc
    if rule_key not in self.rules:
        return "不支持的网站(not support url)"
    cap = webdriver.DesiredCapabilities.PHANTOMJS
    cap["phantomjs.page.settings.resourceTimeout"] = 1000
    cap["phantomjs.page.settings.loadImages"] = True
    cap["phantomjs.page.settings.disk-cache"] = True
    # Mobile Safari UA so the site serves the lightweight mobile player.
    cap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0(iPhone;CPU iPhone OS 9_1 like Mac OSX) AppleWebKit / 601.1" ".46(KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1"
    driver = PhantomJS(self.driverPath["PhantomJS"], desired_capabilities=cap)
    driver.implicitly_wait(10)
    my_rule = self.rules[rule_key]
    driver.get("%s%s" % (my_rule["url_prefix"], o.path))
    try:
        # was a bare `except:`; keep the same best-effort behavior but
        # stop swallowing SystemExit/KeyboardInterrupt, and close the
        # driver on every path via finally.
        return driver.find_element_by_tag_name('video').get_attribute('src')
    except Exception:
        return "主播不在了"
    finally:
        driver.close()
# NOTE(review): interior of a per-row scraping loop — the enclosing loop
# header (defining `subject`, `r_list`, `extra_value`, `ind`, `df`, `wd`,
# `t1` and the output_* variables) is outside this view.
df.loc[ind, 'subject'] = subject
df.loc[ind, 'category'] = r_list[0]
# When the row carries an extra '% of students' column, scores shift right.
df.loc[ind, 'school_score'] = r_list[
    1] if not extra_value else r_list[2]
df.loc[ind, 'state_average'] = r_list[
    2] if not extra_value else r_list[3]
ind += 1
# Strip non-digit characters (e.g. '%') and coerce the scores to int.
df['school_score'] = df['school_score'].str.replace(r'[^0-9]', '').astype('int')
df['state_average'] = df['state_average'].str.replace(r'[^0-9]', '').astype('int')
# Positive diff means the school beats the state average.
df['test_diff'] = df['school_score'] - df['state_average']
subject_diffs = df.groupby('subject')['test_diff'].mean()
output_df.loc[output_ind, 'school'] = school_name
output_df.loc[output_ind, 'url'] = url
output_df.loc[output_ind, 'students_per_grade'] = students_per_grade
output_df.loc[output_ind, 'teachers_to_student'] = t_to_s_school
output_df.loc[output_ind, 'counselors_to_student'] = c_to_s_school
# Only record diffs for subjects this school actually reported.
if 'Reading' in subject_diffs.index:
    output_df.loc[output_ind, 'reading'] = subject_diffs['Reading']
if 'Math' in subject_diffs.index:
    output_df.loc[output_ind, 'math'] = subject_diffs['Math']
if 'Science' in subject_diffs.index:
    output_df.loc[output_ind, 'science'] = subject_diffs['Science']
output_ind += 1
# Python 2 print statement — elapsed scrape time for this school.
print time() - t1
wd.close()
def get_url_files(retail, invoice_doc_type, invoice_id, invoice_date, invoice_amount): retail_invoice_url = RETAIL_INVOICE_URL[retail] driver = PhantomJS() driver.get(retail_invoice_url) # 1 Set doc_type 'select' try: select_doc_type = Select(driver.find_element_by_name('txtTipoDte')) value = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['value'] select_doc_type.select_by_value(value) # name = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['name'] # select_doc_type.select_by_visible_text(name) except Exception: print 'ERROR: set doc_type select as Boleta' driver.save_screenshot('screen.png') return '', '' time.sleep(5) # 2 Get recaptcha img url try: recaptcha_img = driver.find_element_by_id('recaptcha_challenge_image') recaptcha_img_url = recaptcha_img.get_attribute('src') except Exception: print 'ERROR: get recaptcha image url' driver.save_screenshot('screen.png') return '', '' # 3 Solve recaptcha v = VisionApi() recaptcha_value = v.detect_text_from_url(recaptcha_img_url) if recaptcha_value is None: print 'ERROR: solving recaptcha image' driver.save_screenshot('screen.png') return '', '' # 4 Fill form script = u""" document.getElementsByName('txtFolio')[0].value = '{invoice_id}'; document.getElementsByName('txtFechaEmision')[0].value = '{invoice_date}'; document.getElementsByName('txtMontoTotal')[0].value = '{invoice_amount}'; document.getElementsByName('recaptcha_response_field')[0].value = '{recaptcha_value}'; """.format( invoice_id=invoice_id, invoice_date=invoice_date, invoice_amount=invoice_amount, recaptcha_value=recaptcha_value, ) driver.execute_script(script) # 5 Submit form try: driver.find_element_by_name('frmDatos').submit() except Exception: print 'ERROR: submitting form' driver.save_screenshot('screen.png') return '', '' # 6 Get url files try: xml_a_tag = driver.find_element_by_xpath( '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[2]') pdf_a_tag = driver.find_element_by_xpath( '//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[1]') xml_url = 
xml_a_tag.get_attribute('href') pdf_url = pdf_a_tag.get_attribute('href') except Exception: print 'ERROR: getting url files' driver.save_screenshot('screen.png') return '', '' # 8 Delete driver session driver.close() driver.quit() return xml_url, pdf_url
# NOTE(review): fragment of a launcher script — `parser`, `run`, `sys`,
# `os`, `copy` and the PhantomJS import are defined above this view.
# CLI arguments for the campus exit/entry report form (help text is
# user-facing Chinese and left untouched).
parser.add_argument('--reason', type=str, help='出校原因, eg. 吃饭', default='上课 返回宿舍')
parser.add_argument('--destination', type=str, help='出校目的地, eg. 北京', default='北京')
parser.add_argument('--track', type=str, help='出校轨迹, eg. 畅春园食堂', default='东南门-理教-勺园—东南门')
parser.add_argument('--habitation', type=str, help='入校前居住地, eg. 北京', default='北京')
parser.add_argument('--district', type=str, help='入校前居住所在区, eg. 海淀区', default='海淀区')
parser.add_argument('--street', type=str, help='入校前居住所在街道, eg. 燕园街道', default='燕园街道')
args = parser.parse_args()
# Mask the password before echoing the parsed arguments to stdout.
args_public = copy.deepcopy(args)
args_public.password = '******'
print('Arguments: {}'.format(args_public))
print('Driver Launching...')
# Alternative drivers kept for reference:
# driver = Firefox()
# driver = Chrome()
# Pick the bundled PhantomJS binary matching the current platform.
if sys.platform == 'darwin':  # macOS
    phantomjs_path = os.path.join('phantomjs', 'phantomjs-darwin')
elif sys.platform == 'linux':  # linux
    phantomjs_path = os.path.join('phantomjs', 'phantomjs-linux-x86_64')
else:  # windows
    phantomjs_path = os.path.join('phantomjs', 'phantomjs-windows.exe')
driver = PhantomJS(executable_path=phantomjs_path)
run(driver, args.username, args.password, args.campus, args.reason,
    args.destination, args.track, args.habitation, args.district,
    args.street)
driver.close()
class Client(object):
    """HTTP client for functional testing of Strass.

    Wraps the Selenium PhantomJS driver behind a fluent, Nightwatch.js-like
    interface, with a few Strass-specific conveniences.  Every action
    returns ``self`` so calls can be chained.
    """

    def __init__(self):
        self.driver = PhantomJS()
        self.driver.set_window_size(1120, 550)

    def __del__(self):
        self.driver.quit()

    def get(self, query=None):
        """Load *query* (default '/') relative to the test server."""
        base = os.environ.get('STRASS_TEST_SERVER', 'http://localhost:8000')
        self.driver.get(base + (query or '/'))
        return self

    def find(self, selector):
        """Return the first element matching the CSS *selector*."""
        return self.driver.find_element_by_css_selector(selector)

    def click(self, selector):
        """Click the element at *selector*."""
        self.find(selector).click()
        return self

    def fill(self, selector, value):
        """Type *value* into the control at *selector*.

        A ``datetime.date`` is split across the day/month/year sub-inputs.
        """
        if isinstance(value, datetime.date):
            for part, number in (('day', value.day),
                                 ('month', value.month),
                                 ('year', value.year)):
                self.fill('%s input.%s' % (selector, part), str(number))
        else:
            control = self.find(selector)
            try:
                control.clear()
            except selexc.InvalidElementStateException:
                # input[type=file] refuses clear(); just skip the reset.
                pass
            control.send_keys(value)
        return self

    def select(self, selector, value):
        """Choose *value* in the <select> element at *selector*."""
        Select(self.find(selector)).select_by_value(value)
        return self

    def submit(self, selector='#document button[type=submit]'):
        """Click the (default) submit button."""
        return self.click(selector)

    def close(self):
        """Close the current window and refocus the first remaining one."""
        self.driver.close()
        if self.driver.window_handles:
            self.driver.switch_to.window(self.driver.window_handles[0])
            self.driver.set_window_size(1120, 550)
        return self

    def screenshot(self, filename):
        """Save a screenshot to *filename* and report it on stderr."""
        self.driver.get_screenshot_as_file(filename)
        sys.stderr.write("Capture d'écran enregistrée dans %r\n" % (filename,))
        return self

    def save(self, filename):
        """Dump the current page source to *filename*."""
        with open(filename, 'w') as fo:
            fo.write(self.driver.page_source)
        sys.stderr.write("HTML enregistré dans %r\n" % (filename,))
        return self

    def __getattr__(self, name):
        # Fall through to the raw Selenium driver for anything not wrapped.
        return getattr(self.driver, name)
# NOTE(review): interior of a per-row loop — the `continue` below requires
# the enclosing loop header, which (with `r_list`, `indicator`, `ind`,
# `df`, `wd`, `t1` and the output_* variables) is outside this view.
extra_value = any(['% of students' in r for r in r_list])
# Skip aggregate/baseline categories; only record comparison subgroups.
if r_list[0] in ('White', 'All students', 'Not low-income'):
    continue
df.loc[ind, 'indicator'] = indicator.replace('Equity', '')
df.loc[ind, 'subject'] = subject
df.loc[ind, 'category'] = r_list[0]
# When the row carries an extra '% of students' column, scores shift right.
df.loc[ind, 'school_score'] = r_list[1] if not extra_value else r_list[2]
df.loc[ind, 'state_average'] = r_list[2] if not extra_value else r_list[3]
ind += 1
# Strip non-digit characters (e.g. '%') and coerce the scores to int.
df['school_score'] = df['school_score'].str.replace(r'[^0-9]', '').astype('int')
df['state_average'] = df['state_average'].str.replace(r'[^0-9]', '').astype('int')
# Positive diff means the school beats the state average.
df['test_diff'] = df['school_score'] - df['state_average']
subject_diffs = df.groupby('subject')['test_diff'].mean()
output_df.loc[output_ind, 'school'] = school_name
output_df.loc[output_ind, 'url'] = url
output_df.loc[output_ind, 'students_per_grade'] = students_per_grade
output_df.loc[output_ind, 'teachers_to_student'] = t_to_s_school
output_df.loc[output_ind, 'counselors_to_student'] = c_to_s_school
# Only record diffs for subjects this school actually reported.
if 'Reading' in subject_diffs.index:
    output_df.loc[output_ind, 'reading'] = subject_diffs['Reading']
if 'Math' in subject_diffs.index:
    output_df.loc[output_ind, 'math'] = subject_diffs['Math']
if 'Science' in subject_diffs.index:
    output_df.loc[output_ind, 'science'] = subject_diffs['Science']
output_ind += 1
# Python 2 print statement — elapsed scrape time for this school.
print time() - t1
wd.close()
class WeixinPhantomjs(Base):
    """Sogou-Weixin crawler driven by PhantomJS (threaded variant).

    Searches Sogou's Weixin index for query words, walks the result
    pages, and hands (url, uid) pairs to ``Article`` for extraction.
    """

    # De-dup set of every uid already stored in MongoDB; a CLASS attribute,
    # shared and mutated by all instances in extract_urls_uids.
    all_uids = {docs['uid'] for docs in in_collection.find({}, {'uid': 1}) if 'uid' in docs}

    def __init__(self):
        self.start_page = START_PAGE
        self.end_page = END_PAGE
        self.weixin_url = REFER_FIRST
        # self.driver = Firefox()
        # Prefer an explicitly configured PhantomJS binary when available.
        if hasattr(config, 'PHANTOMJS_PATH'):
            self.driver = PhantomJS(executable_path=getattr(config, 'PHANTOMJS_PATH'))
        else:
            self.driver = PhantomJS()

    def open_weixin_browser(self, word):
        """Open the search page, submit *word*, and extract page one.

        Returns True when anything went wrong (callers treat it as a
        break signal), False on success.
        """
        try:
            self.driver.get(self.weixin_url)
            self.driver.set_page_load_timeout(3)
            self.driver.find_element_by_id('upquery').send_keys(word)
            self.driver.find_element_by_class_name('swz').click()
            time.sleep(3)
            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
        except Exception as e:
            # Remember the word (page 0) so the crawl can be resumed later.
            storage_word.append([word, 0])
            self.logger.info('Open weixin error: type <{}>, mag <{}>'.format(e.__class__, e))
            self.close_browser()
            return True
        return False

    def get_total_pages_to_word(self):
        """Read the pager widget and return the page count (1 on failure).

        NOTE(review): ``pages[-1]`` raises IndexError when the first pager
        token is non-numeric and ``pages`` is still empty; the except
        clause below silently turns that into a return of 1 — confirm
        this is the intended fallback.
        """
        pages = []
        page_id_css = 'pagebar_container'
        try:
            e = self.driver.find_element_by_id(page_id_css)
            for _p in e.text.split():
                _p = _p.strip()
                if not _p.isdigit():
                    # Cap at DEFAULT_PAGES unless fewer pages exist.
                    return DEFAULT_PAGES if DEFAULT_PAGES <= pages[-1] else pages[-1]
                else:
                    pages.append(int(_p))
            return 1
        except (NoSuchElementException, NoSuchWindowException, TypeError, IndexError):
            pass
        return 1

    def extract_urls_uids(self, word):
        """Collect (url, uid) dicts for result entries not seen before.

        The uid is md5(timestamp + title + word); uids already in the
        shared ``all_uids`` set are skipped.
        """
        urls_uids = []
        timestamp = [_t.get_attribute('t') for _t in self.driver.find_elements_by_css_selector('div.s-p')]
        urls_tits = [(t.get_attribute('href'), self.trim(t.text)) for t in self.driver.find_elements_by_css_selector('h4 a')]
        # Bail out when timestamps and titles don't pair up one-to-one.
        if len(urls_tits) != len(timestamp):
            return urls_uids
        for index, url_tit in enumerate(urls_tits):
            try:
                uid = self.md5(timestamp[index] + url_tit[1] + word)
                if uid not in self.__class__.all_uids:
                    self.__class__.all_uids.add(uid)
                    urls_uids.append({'url': url_tit[0], 'uid': uid})
            except (TypeError, IndexError):
                pass
        return urls_uids

    @property
    def is_forbidden(self):
        """True when Sogou shows its captcha form (crawler was blocked)."""
        css_id = 'seccodeForm'
        try:
            if self.driver.find_element_by_id(css_id):
                return True
        except NoSuchElementException:
            pass
        return False

    def appear_element(self, by):
        """Wait up to 20s for element id *by* and click it; True on success."""
        try:
            # Have `click` function to specified element
            tag = WebDriverWait(self.driver, 20).until(lambda driver: driver.find_element_by_id(by))
            tag.click()
            return True
        except (TimeoutException, NoSuchWindowException, NoSuchElementException):
            pass
        return False

    def crawl_single(self, word=None, go=0):
        """Crawl all result pages for *word*, resuming from page *go*.

        NOTE(review): ``msg`` is only assigned in the two breaking
        branches below; if ``open_weixin_browser`` returned True while the
        first page's pager element is still clickable, ``self.logger.info(msg)``
        would raise NameError — confirm intended flow.
        """
        is_go = True
        go_page = int(go)
        next_page_css = 'sogou_page_%s'
        is_break = self.open_weixin_browser(word)
        pages = self.get_total_pages_to_word()
        for page in range(self.start_page + 1, (pages or self.end_page) + 1):
            # Fast-forward to the resume page on the first pass.
            if is_go and page < go_page:
                continue
            else:
                is_go = False
            if not self.appear_element(by=next_page_css % page):
                is_break = True
                msg = '\tNot appear next page element, will break'
            elif self.is_forbidden:
                is_break = True
                msg = '\tSpider was forbidden, crawling again after sleeping a moment!'
            if is_break:
                storage_word.append([word, page])
                self.logger.info(msg)
                break
            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
            # self.driver.find_element_by_id(next_page_css % page).click()
            # wt = randint(10, 40) if page % 5 == 0 else randint(5, 18)
            # Randomized politeness delay between result pages.
            wt = randint(1, 5)
            self.logger.info('Word <{}>, Page <{}> Done, sleeping {}s!'.format(word, page, wt))
            # self.driver.implicitly_wait(wt)
            time.sleep(wt)
        self.close_browser()

    @classmethod
    def crawl_with_threads(cls):
        """Fan the query words out over a 4-thread pool, one instance each."""
        pool = ThreadPool(4)
        total_words = QueryWords().get_query_words()
        for bulk_words in total_words:
            try:
                pool.map(lambda w: cls().crawl_single(w), bulk_words)
            except Exception as e:
                cls.logger.info('Threads crawl error: type <{}>, msg <{}>'.format(e.__class__, e))
        pool.close()
        pool.join()
        in_client.close()

    def close_browser(self):
        """Close the driver window, ignoring an already-closed window."""
        try:
            self.driver.close()
        except (NoSuchWindowException,):
            pass
class WeixinPhantomjs(Base):
    """Sogou-Weixin crawler driven by PhantomJS (word-chain variant).

    Unlike the sibling class of the same name, this version pulls its
    query words from MongoDB and keeps the uid de-dup set per instance.
    """

    def __init__(self):
        self.start_page = START_PAGE
        self.end_page = END_PAGE
        self.weixin_url = REFER_FIRST
        # self.driver = Firefox()
        # Prefer an explicitly configured PhantomJS binary when available.
        if hasattr(config, 'PHANTOMJS_PATH'):
            self.driver = PhantomJS(executable_path=getattr(config, 'PHANTOMJS_PATH'))
        else:
            self.driver = PhantomJS()
        self.client = MongoClient(HOST, PORT)
        self.collection = self.client[DB][COLLECTION]
        # Seed the per-instance de-dup set from uids already stored.
        self.all_uids = self.uids

    def open_weixin_browser(self, word):
        """Open the search page, submit *word*, and extract page one.

        Returns True when anything went wrong (callers treat it as a
        break signal), False on success.
        """
        try:
            self.driver.get(self.weixin_url)
            self.driver.set_page_load_timeout(3)
            self.driver.find_element_by_id('upquery').send_keys(word)
            self.driver.find_element_by_class_name('swz').click()
            time.sleep(3)
            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
        except Exception as e:
            # Remember the word (page 0) so the crawl can be resumed later.
            storage_word.append([word, 0])
            self.logger.info('Open weixin error: type <{}>, mag <{}>'.format(e.__class__, e))
            self.close_browser()
            return True
        return False

    def get_total_pages_to_word(self):
        """Read the pager widget and return the page count (1 on failure).

        NOTE(review): ``pages[-1]`` raises IndexError when the first pager
        token is non-numeric and ``pages`` is still empty; the except
        clause below silently turns that into a return of 1.
        """
        pages = []
        page_id_css = 'pagebar_container'
        try:
            e = self.driver.find_element_by_id(page_id_css)
            for _p in e.text.split():
                _p = _p.strip()
                if not _p.isdigit():
                    # Cap at DEFAULT_PAGES unless fewer pages exist.
                    return DEFAULT_PAGES if DEFAULT_PAGES <= pages[-1] else pages[-1]
                else:
                    pages.append(int(_p))
            return 1
        except (NoSuchElementException, NoSuchWindowException, TypeError, IndexError):
            pass
        return 1

    def get_query_words(self, word):
        """Build the ordered word list from MongoDB and slice it at *word*.

        Each document contributes its 'conp' value plus its 'rel' items,
        first occurrence wins.  Returns ``(words_to_crawl, start_index)``.
        """
        query_words = []
        for docs in self.collection.find({}, {'rel': 1, 'conp': 1}).sort([('_id', 1)]):
            w = docs['conp']
            if w not in query_words:
                query_words.append(w)
            for item in docs['rel']:
                if item not in query_words:
                    query_words.append(item)
        self.client.close()
        return self.query_index(query_words, word)

    @property
    def uids(self):
        """Set of every uid currently stored in the input collection."""
        return {docs['uid'] for docs in in_collection.find({}, {'uid': 1}) if 'uid' in docs}

    def extract_urls_uids(self, word):
        """Collect (url, uid) dicts for result entries not seen before.

        The uid is md5(timestamp + title + word); uids already in the
        per-instance ``all_uids`` set are skipped.
        """
        urls_uids = []
        timestamp = [_t.get_attribute('t') for _t in self.driver.find_elements_by_css_selector('div.s-p')]
        urls_tits = [(t.get_attribute('href'), self.trim(t.text)) for t in self.driver.find_elements_by_css_selector('h4 a')]
        # Bail out when timestamps and titles don't pair up one-to-one.
        if len(urls_tits) != len(timestamp):
            return urls_uids
        for index, url_tit in enumerate(urls_tits):
            try:
                uid = self.md5(timestamp[index] + url_tit[1] + word)
                if uid not in self.all_uids:
                    self.all_uids.add(uid)
                    urls_uids.append({'url': url_tit[0], 'uid': uid})
            except (TypeError, IndexError):
                pass
        return urls_uids

    @staticmethod
    def query_index(words, cut_word):
        """Slice *words* to [START_INDEX:END_INDEX] and locate *cut_word*.

        Returns ``(remaining_words, absolute_start_index)``; falls back to
        the whole slice when *cut_word* is absent.
        """
        temp_words = words[START_INDEX:END_INDEX]
        try:
            index = temp_words.index(cut_word)
            return temp_words[index:], index + START_INDEX
        except ValueError:
            pass
        return temp_words, START_INDEX

    @property
    def is_forbidden(self):
        """True when Sogou shows its captcha form (crawler was blocked)."""
        css_id = 'seccodeForm'
        try:
            if self.driver.find_element_by_id(css_id):
                return True
        except NoSuchElementException:
            pass
        return False

    def appear_element(self, by):
        """Wait up to 20s for element id *by* and click it; True on success."""
        try:
            # Have `click` function to specified element
            tag = WebDriverWait(self.driver, 20).until(lambda driver: driver.find_element_by_id(by))
            tag.click()
            return True
        except (TimeoutException, NoSuchWindowException, NoSuchElementException):
            pass
        return False

    def crawl(self, word=None, go=0):
        """Crawl every query word starting at *word*, resuming at page *go*.

        NOTE(review): ``msg`` is only assigned in the two breaking branches
        below; if ``open_weixin_browser`` returned True while the pager
        element is still clickable, ``self.logger.info(msg)`` would raise
        NameError — confirm intended flow.
        """
        is_go = True
        is_break = False
        go_page = int(go)
        next_page_css = 'sogou_page_%s'
        query_words, ind = self.get_query_words(word)
        for index, word in enumerate(query_words, 1):
            next_ind = ind + index
            is_break = self.open_weixin_browser(word)
            pages = self.get_total_pages_to_word()
            for page in range(self.start_page + 1, (pages or self.end_page) + 1):
                # Fast-forward to the resume page on the first pass.
                if is_go and page < go_page:
                    continue
                else:
                    is_go = False
                if not self.appear_element(by=next_page_css % page):
                    is_break = True
                    msg = '\tNot appear next page element, will break'
                elif self.is_forbidden:
                    is_break = True
                    msg = '\tSpider was forbidden, crawling again after sleeping a moment!'
                if is_break:
                    storage_word.append([word, page])
                    self.logger.info(msg)
                    break
                urls_uids = self.extract_urls_uids(word=word)
                Article(urls_uids=urls_uids, word=word).extract()
                # self.driver.find_element_by_id(next_page_css % page).click()
                # Randomized politeness delay between result pages.
                wt = randint(10, 40) if page % 3 == 0 else randint(5, 18)
                self.logger.info('Index <{}>, Word <{}>, Page <{}> Done, sleeping {}s!'.format(next_ind, word, page, wt))
                # self.driver.implicitly_wait(wt)
                time.sleep(wt)
            # A break on any page aborts the whole word chain.
            if is_break:
                break
        in_client.close()
        self.close_browser()

    def close_browser(self):
        """Close the driver window, ignoring an already-closed window."""
        try:
            self.driver.close()
        except (NoSuchWindowException,):
            pass
def get_url_files(retail, invoice_doc_type, invoice_id, invoice_date, invoice_amount): retail_invoice_url = RETAIL_INVOICE_URL[retail] driver = PhantomJS() driver.get(retail_invoice_url) # 1 Set doc_type 'select' try: select_doc_type = Select(driver.find_element_by_name('txtTipoDte')) value = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['value'] select_doc_type.select_by_value(value) # name = RETAIL_INVOICE_DOC_TYPES[retail][invoice_doc_type]['name'] # select_doc_type.select_by_visible_text(name) except Exception: print 'ERROR: set doc_type select as Boleta' driver.save_screenshot('screen.png') return '', '' time.sleep(5) # 2 Get recaptcha img url try: recaptcha_img = driver.find_element_by_id('recaptcha_challenge_image') recaptcha_img_url = recaptcha_img.get_attribute('src') except Exception: print 'ERROR: get recaptcha image url' driver.save_screenshot('screen.png') return '', '' # 3 Solve recaptcha v = VisionApi() recaptcha_value = v.detect_text_from_url(recaptcha_img_url) if recaptcha_value is None: print 'ERROR: solving recaptcha image' driver.save_screenshot('screen.png') return '', '' # 4 Fill form script = u""" document.getElementsByName('txtFolio')[0].value = '{invoice_id}'; document.getElementsByName('txtFechaEmision')[0].value = '{invoice_date}'; document.getElementsByName('txtMontoTotal')[0].value = '{invoice_amount}'; document.getElementsByName('recaptcha_response_field')[0].value = '{recaptcha_value}'; """.format( invoice_id=invoice_id, invoice_date=invoice_date, invoice_amount=invoice_amount, recaptcha_value=recaptcha_value, ) driver.execute_script(script) # 5 Submit form try: driver.find_element_by_name('frmDatos').submit() except Exception: print 'ERROR: submitting form' driver.save_screenshot('screen.png') return '', '' # 6 Get url files try: xml_a_tag = driver.find_element_by_xpath('//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[2]') pdf_a_tag = driver.find_element_by_xpath('//*[@id="Tabla_01"]/tbody/tr[1]/td[2]/p/a[1]') xml_url = 
xml_a_tag.get_attribute('href') pdf_url = pdf_a_tag.get_attribute('href') except Exception: print 'ERROR: getting url files' driver.save_screenshot('screen.png') return '', '' # 8 Delete driver session driver.close() driver.quit() return xml_url, pdf_url
import time
import json

# NOTE(review): credentials are hard-coded placeholders; fill in real
# values or read them from the environment before running.
dr = PhantomJS()
url = "https://www.v2ex.com/signin"
u = 'username'
p = 'password'
# Log in to v2ex.
dr.get(url)
account = dr.find_element_by_name('u')
account.clear()
account.send_keys(u)
password = dr.find_element_by_name('p')
password.clear()
password.send_keys(p)
# The second '.super.normal.button' on the sign-in page is the submit button.
login_btn = dr.find_elements_by_css_selector('.super.normal.button')[1]
login_btn.click()
# Claim the daily mission award.
mission_daily_url = 'https://www.v2ex.com/mission/daily'
dr.get(mission_daily_url)
get_daily_award = dr.find_element_by_css_selector('.super.normal.button')
get_daily_award.click()
# Record the resulting balance.
balance_url = 'https://www.v2ex.com/balance'
dr.get(balance_url)
intergal = dr.find_elements_by_css_selector('.positive')[0]
yue = dr.find_elements_by_css_selector('.balance_area')[0]
now = time.strftime("%y-%m-%d %H:%M:%S")
item = {"Time": now, "get": intergal.text, "all": yue.text}
with open('v2ex_sign.json', "a") as fp:
    # Append one JSON object per line (JSON Lines) so repeated runs stay
    # parseable — the original concatenated objects with no separator.
    fp.write(json.dumps(item) + "\n")
dr.close()
# close() only closes the window; quit() terminates the PhantomJS process.
dr.quit()