def onegoogolePR(self, url):
    """Return the Google PageRank of a single URL, queried via pr.chinaz.com.

    The PR value is only exposed as a digit embedded in the result
    image's ``src`` attribute, so it is pulled out with a regex.

    :param url: site address to look up.
    :return: the PR digit as a string, or the literal '暂无数据'
             ("no data") when the result cannot be found or parsed.
    """
    prUrl = 'http://pr.chinaz.com'  # Google PR lookup page
    driver = PhantomJS()
    try:
        driver.get(prUrl)
        driver.find_element_by_id('PRAddress').send_keys(url)
        driver.find_element_by_class_name('search-write-btn').click()
        try:
            imgsrc = driver.find_element_by_css_selector(
                'span#pr>img').get_attribute('src')
            pr = search(r'\d', imgsrc).group()
        except Exception:  # narrowed from bare except: element missing or regex miss
            pr = '暂无数据'
    finally:
        # Always release the PhantomJS process, even when navigation fails
        # (the original leaked the driver on any error before the try block).
        driver.quit()
    return pr
class plugin:
    """Daily check-in automation driving a shared PhantomJS browser.

    Each public method logs into one service (Ticketmonster, Ondisk,
    OK Cashbag) and performs its attendance/check-in action, storing a
    human-readable result string on the instance
    (``tmon_ret`` / ``ondisk_ret`` / ``ok_ret``).
    """

    def __init__(self):
        # The PhantomJS binary is expected to sit next to this file.
        APP_ROOT = os.path.dirname(os.path.abspath(__file__))
        print(APP_ROOT)
        self.req = 0
        self.driver = PhantomJS(APP_ROOT + "/phantomjs",
                                service_log_path=os.path.devnull)
        self.driver.implicitly_wait(3)

    def restart(self):
        """Re-run __init__ to obtain a fresh driver session."""
        self.__init__()

    def frame_search(self, path):
        """Recursively map the <frame> tree below the current frame context.

        :param path: list of ancestor frame names leading to the current
                     context (empty list for the top document).
        :return: dict keyed by frame name; each value holds the ancestor
                 'framepath' and a nested 'children' dict of the same shape.
        """
        framedict = {}
        for child_frame in self.driver.find_elements_by_tag_name('frame'):
            child_frame_name = child_frame.get_attribute('name')
            framedict[child_frame_name] = {'framepath': path, 'children': {}}
            xpath = '//frame[@name="{}"]'.format(child_frame_name)
            self.driver.switch_to.frame(
                self.driver.find_element_by_xpath(xpath))
            framedict[child_frame_name]['children'] = self.frame_search(
                framedict[child_frame_name]['framepath'] + [child_frame_name])
            # Frame switches are absolute from the top document, so walk
            # back down through the ancestors to restore the caller's context.
            self.driver.switch_to.default_content()
            if len(framedict[child_frame_name]['framepath']) > 0:
                for parent in framedict[child_frame_name]['framepath']:
                    parent_xpath = '//frame[@name="{}"]'.format(parent)
                    self.driver.switch_to.frame(
                        self.driver.find_element_by_xpath(parent_xpath))
        return framedict

    def tmon(self):
        """Log into Ticketmonster and click the attendance-check button.

        The resulting status text is printed and stored in ``tmon_ret``.
        """
        self.driver.get(
            "https://login.ticketmonster.co.kr/user/loginform?return_url=")
        self.driver.find_element_by_name('userid').send_keys(
            config['ACCOUNT']['tmon_id'])
        self.driver.find_element_by_name('password').send_keys(
            config['ACCOUNT']['tmon_pw'])
        self.driver.find_element_by_xpath('//*[@id="loginFrm"]/a[2]').click()
        self.driver.get(
            'http://m.benefit.ticketmonster.co.kr/promotions/page/attendance?view_mode=app'
        )
        self.driver.find_element_by_xpath(
            '//*[@id="attn_wrap"]/div/div/div[3]/div[2]/div[1]/button').click(
            )
        print(self.driver.find_element_by_class_name('content').text)
        self.tmon_ret = self.driver.find_element_by_class_name('content').text

    def ondisk(self):
        """Log into Ondisk and click the event button, capturing its alert.

        Stores the alert text in ``ondisk_ret`` (None when any step fails).
        """
        # BUGFIX: alert_text was unbound when an exception fired before the
        # assignment, so the line after the except raised UnboundLocalError.
        alert_text = None
        try:
            self.driver.get("http://ondisk.co.kr/index.php")
            self.driver.implicitly_wait(3)
            self.driver.find_element_by_xpath('//*[@id="mb_id"]').send_keys(
                config['ACCOUNT']['ondisk_id'])
            self.driver.find_element_by_xpath(
                '//*[@id="page-login"]/form/div[2]/p[2]/input').send_keys(
                    config['ACCOUNT']['ondisk_pw'])
            self.driver.find_element_by_xpath(
                '//*[@id="page-login"]/form/div[2]/p[3]/input').click()
            self.driver.get(
                "http://ondisk.co.kr/index.php?mode=eventMarge&sm=event&action=view&idx=746&event_page=1"
            )
            # switch_to_frame() is deprecated in selenium; switch_to.frame()
            # is the form already used by frame_search above.
            self.driver.switch_to.frame(1)
            # Replace window.alert so its message can be read back via JS.
            self.driver.execute_script(
                "window.alert = function(msg){ window.msg = msg; };")
            self.driver.find_element_by_class_name('button').click()
            alert_text = self.driver.execute_script("return window.msg;")
            print(alert_text)
        except Exception:  # narrowed from a bare except
            print("ERR")
            print(self.driver.page_source)
        self.ondisk_ret = alert_text

    def ok_cash_bag(self):
        """Attend the daily OK Cashbag event via a Facebook OAuth login.

        Performs the Facebook login POST, exchanges the SAML response with
        OK Cashbag, then POSTs the attendance form.  The outcome string is
        stored in ``ok_ret``.
        """
        today = datetime.datetime.now().strftime("%Y%m%d")
        sess = requests.session()
        getdata = sess.get(
            "https://www.facebook.com/login.php?login_attempt=1&next=https%3A%2F%2Fwww.facebook.com%2Fv2.6%2Fdialog%2Foauth%3Fredirect_uri%3Dhttps%253A%252F%252Fmember.okcashbag.com%252Focb%252FsocialId%252FfacebookProcessor%26scope%3Dpublic_profile%252Cuser_birthday%252Cemail%26client_id%3D645711852239977%26ret%3Dlogin%26logger_id%3D91698e1d-fe1e-b325-4c13-b62636843a9e&lwv=101"
        )
        # Form fields captured from a real Facebook login request; only the
        # credential fields are filled from config.
        param = {
            "lsd": "AVpmy4vJ",
            "api_key": "645711852239977",
            "cancel_url":
            "https://member.okcashbag.com/ocb/socialId/facebookProcessor?error=access_denied&error_code=200&error_description=Permissions+error&error_reason=user_denied#_=_",
            "display": "page",
            "enable_profile_selector": "",
            "isprivate": "",
            "legacy_return": "0",
            "profile_selector_ids": "",
            "return_session": "",
            "skip_api_login": "******",
            "signed_next": "1",
            "trynum": "1",
            "timezone": "-540",
            "lgndim":
            "eyJ3IjoxOTIwLCJoIjoxMDgwLCJhdyI6MTkyMCwiYWgiOjEwNDAsImMiOjI0fQ==",
            "lgnrnd": "173648_UqkK",
            "lgnjs": "1528418208",
            "email": config['ACCOUNT']['fb_id'],
            "pass": config['ACCOUNT']['fb_pw'],
            "prefill_contact_point": config['ACCOUNT']['fb_id'],
            "prefill_source": "last_login",
            "prefill_type": "contact_point",
            "first_prefill_source": "last_login",
            "first_prefill_type": "contact_point",
            "had_cp_prefilled": "true",
            "had_password_prefilled": "false"
        }
        postdata = sess.post(
            "https://www.facebook.com/login.php?login_attempt=1&next=https%3A%2F%2Fwww.facebook.com%2Fv2.6%2Fdialog%2Foauth%3Fredirect_uri%3Dhttps%253A%252F%252Fmember.okcashbag.com%252Focb%252FsocialId%252FfacebookProcessor%26scope%3Dpublic_profile%252Cuser_birthday%252Cemail%26client_id%3D645711852239977%26ret%3Dlogin%26logger_id%3D91698e1d-fe1e-b325-4c13-b62636843a9e&lwv=101",
            data=param)
        # print(postdata.text)
        postdata = sess.post(
            "https://member.okcashbag.com//ocb/socialId/socialIdLoginProcess/42100/687474703A2F2F7777772e6f6b636173686261672e636f6d2F696e6465782e646f3F6c6f67696e3D59"
        )
        # Pull the SAML assertion out of the returned HTML/JS.
        samlResponse = postdata.text.split("samlResponse.value = \"")[1].split(
            "\"")[0]
        # print(samlResponse)
        param = {"samlResponse": samlResponse, "sst_cd": "", "return_url": ""}
        postdata = sess.post("http://www.okcashbag.com/index.do?login=Y",
                             data=param)
        print(
            postdata.text.split('<span id="profileNickname" class="name">')
            [1].split("</span>")[0] + "님 로그인")
        print(
            postdata.text.split('<span id="spanUsablePoint">')[1].split(
                '</span>')[0] + "포인트")
        getdata = sess.get(
            "http://www.okcashbag.com/life/event/attend/attendMain.do")
        param = {"method": "", "myUrl": "", "recommUser": "", "today": today}
        postdata = sess.post(
            "http://www.okcashbag.com/life/event/attend/attend.do", data=param)
        print(postdata.text)
        if len(postdata.text.split('<i class="win-point">')) > 1:
            # NOTE(review): this branch does not set self.ok_ret, so reading
            # it after a win-point result raises AttributeError — confirm
            # whether that is intended before changing behavior.
            print(postdata.text.split('<i class="win-point">')[1] + "포인트 적립")
        elif len(postdata.text.split("success")) > 1:
            print("출석체크 완료 ")
            self.ok_ret = "출석체크 완료"
        else:
            print('이미 출석체크 완료')
            self.ok_ret = "이미 출석체크 완료"
class RouteStatistic(object): def __init__(self, url, phantomjs=None, resolution=None, ya_class=None, screen_path=None, screen_pattern=None, csv_path=None): self.url = url self.phantomjs = phantomjs or DEFAULT_PHANTOMJS assert os.path.isfile(self.phantomjs), "phantomjs не найден" resolution = resolution or FULLHD assert isinstance(resolution, (list, tuple)) assert len(resolution) == 2 self.ya_class = ya_class or DEFAULT_YA_CLASS self.screen_path = screen_path or PATH self.screen_pattern = screen_pattern or '%s.png' assert '%s' in self.screen_pattern self.csv_path = csv_path or os_join(PATH, 'statistic.csv') self.driver = PhantomJS(self.phantomjs) self.driver.set_window_size(*resolution) def track(self): self.driver.get(self.url) WebDriverWait(self.driver, 5).until(is_class_exist(self.ya_class)) time = self.driver.find_element_by_class_name(self.ya_class).text now = datetime.now() self._save_screenshot(now) self._update_file(now, *[t.strip() for t in time.split(',')]) def _save_screenshot(self, now): if '%s' in self.screen_pattern: file_name = self.screen_pattern % (now, ) else: file_name = self.screen_pattern file_name = os_join(self.screen_path, file_name) self.driver.save_screenshot(file_name) def _update_file(self, now, time, distance): with open(self.csv_path, 'a') as csvfile: writer = csv.writer(csvfile, delimiter=str('\t')) writer.writerow([ now, time, distance, ]) def __call__(self): return self.track() def __del__(self): if hasattr(self, 'driver') and self.driver: self.driver.service.process.send_signal(signal.SIGTERM) self.driver.quit()
driver = PhantomJS( './phantomjs') # in case of PhantomJS not available, we can use Firefox for line in tqdm(inputfile, total=numLines, desc='Crawling Instagram', leave=True): try: idtweet, url = line.replace('\n', '').split(',') if idtweet in setUrlDefined: continue except IndexError: print colorama.Fore.RED, 'Corrupted Line', colorama.Fore.RESET continue try: driver.get(url) placetag = driver.find_element_by_class_name('_kul9p') placeurl = placetag.get_attribute('href').encode('utf-8') placename = placetag.get_attribute('title').encode('utf-8') usernametag = driver.find_element_by_class_name('_4zhc5') username = usernametag.get_attribute('title').encode('utf-8') except selenium.common.exceptions.NoSuchElementException: try: error = driver.find_element_by_class_name('error-container') print colorama.Fore.RED, 'Sample Not Available Anymore', colorama.Fore.RESET outputfile.write(idtweet + ',' + url + ',404\n') continue except selenium.common.exceptions.NoSuchElementException: print colorama.Fore.RED, 'No Coords Available', colorama.Fore.RESET outputfile.write(idtweet + ',' + url + ',NoCoords\n')
outputfile = open(cityName + '-instagram-output.csv', 'a', 0) print colorama.Back.RED+colorama.Fore.YELLOW+str(len(setUrlDefined))+' URLs already defined! Lets Rock more now...'+colorama.Back.RESET+colorama.Fore.RESET driver = PhantomJS('./phantomjs') # in case of PhantomJS not available, we can use Firefox for line in tqdm(inputfile, total=numLines, desc='Crawling Instagram', leave=True): try: idtweet, url = line.replace('\n', '').split(',') if idtweet in setUrlDefined: continue except IndexError: print colorama.Fore.RED, 'Corrupted Line', colorama.Fore.RESET continue try: driver.get(url) placetag = driver.find_element_by_class_name('_kul9p') placeurl = placetag.get_attribute('href').encode('utf-8') placename = placetag.get_attribute('title').encode('utf-8') usernametag = driver.find_element_by_class_name('_4zhc5') username = usernametag.get_attribute('title').encode('utf-8') except selenium.common.exceptions.NoSuchElementException: try: error = driver.find_element_by_class_name('error-container') print colorama.Fore.RED, 'Sample Not Available Anymore', colorama.Fore.RESET outputfile.write(idtweet + ',' + url + ',404\n') continue except selenium.common.exceptions.NoSuchElementException: print colorama.Fore.RED, 'No Coords Available', colorama.Fore.RESET outputfile.write(idtweet + ',' + url + ',NoCoords\n')
class WeixinPhantomjs(Base):
    """Crawl Sogou's Weixin (WeChat) search with PhantomJS, one word at a time.

    Extracted article URLs are de-duplicated against ``all_uids``, a
    class-level set shared by every instance (and thus by the thread pool
    in ``crawl_with_threads``), seeded from the ``in_collection`` MongoDB
    collection at class-definition time.
    """

    # Seed the shared de-dup set once, when the class is defined.
    all_uids = {docs['uid'] for docs in in_collection.find({}, {'uid': 1}) if 'uid' in docs}

    def __init__(self):
        self.start_page = START_PAGE
        self.end_page = END_PAGE
        self.weixin_url = REFER_FIRST

        # self.driver = Firefox()
        if hasattr(config, 'PHANTOMJS_PATH'):
            self.driver = PhantomJS(executable_path=getattr(config, 'PHANTOMJS_PATH'))
        else:
            # Fall back to whatever `phantomjs` binary is on PATH.
            self.driver = PhantomJS()

    def open_weixin_browser(self, word):
        """Search `word` on the Weixin search page and extract page-1 articles.

        Returns True when anything went wrong (the word is queued for retry
        in `storage_word` and the browser is closed), False on success.
        """
        try:
            self.driver.get(self.weixin_url)
            self.driver.set_page_load_timeout(3)
            self.driver.find_element_by_id('upquery').send_keys(word)
            self.driver.find_element_by_class_name('swz').click()
            time.sleep(3)

            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
        except Exception as e:
            storage_word.append([word, 0])
            # NOTE(review): 'mag' looks like a typo for 'msg' in this log line.
            self.logger.info('Open weixin error: type <{}>, mag <{}>'.format(e.__class__, e))
            self.close_browser()
            return True
        return False

    def get_total_pages_to_word(self):
        """Read the pagination bar and return how many result pages exist.

        When the bar ends with a non-digit token (e.g. a "next" link) the
        last collected page number is returned, capped at DEFAULT_PAGES.
        Returns 1 when the bar is missing or unparsable.
        NOTE(review): the ``return 1`` after the loop also fires when every
        token is a digit — confirm that is the intended fallback.
        """
        pages = []
        page_id_css = 'pagebar_container'

        try:
            e = self.driver.find_element_by_id(page_id_css)
            for _p in e.text.split():
                _p = _p.strip()
                if not _p.isdigit():
                    # pages[-1] raises IndexError on an empty list, which is
                    # swallowed by the except clause below (-> return 1).
                    return DEFAULT_PAGES if DEFAULT_PAGES <= pages[-1] else pages[-1]
                else:
                    pages.append(int(_p))
            return 1
        except (NoSuchElementException, NoSuchWindowException, TypeError, IndexError):
            pass
        return 1

    def extract_urls_uids(self, word):
        """Collect {'url', 'uid'} dicts for result items not seen before.

        The uid is the md5 of timestamp + title + word; items whose uid is
        already in the class-level ``all_uids`` set are skipped.
        """
        urls_uids = []
        timestamp = [_t.get_attribute('t') for _t in self.driver.find_elements_by_css_selector('div.s-p')]
        urls_tits = [(t.get_attribute('href'), self.trim(t.text)) for t in self.driver.find_elements_by_css_selector('h4 a')]
        # Timestamps and titles are scraped independently; bail out when
        # they cannot be zipped one-to-one.
        if len(urls_tits) != len(timestamp):
            return urls_uids

        for index, url_tit in enumerate(urls_tits):
            try:
                uid = self.md5(timestamp[index] + url_tit[1] + word)
                if uid not in self.__class__.all_uids:
                    self.__class__.all_uids.add(uid)
                    urls_uids.append({'url': url_tit[0], 'uid': uid})
            except (TypeError, IndexError):
                pass
        return urls_uids

    @property
    def is_forbidden(self):
        """True when the CAPTCHA form is present, i.e. the crawler was blocked."""
        css_id = 'seccodeForm'
        try:
            if self.driver.find_element_by_id(css_id):
                return True
        except NoSuchElementException:
            pass
        return False

    def appear_element(self, by):
        """Wait up to 20s for element id `by` and click it; True on success."""
        try:
            # Have `click` function to specified element
            tag = WebDriverWait(self.driver, 20).until(lambda driver: driver.find_element_by_id(by))
            tag.click()
            return True
        except (TimeoutException, NoSuchWindowException, NoSuchElementException):
            pass
        return False

    def crawl_single(self, word=None, go=0):
        """Crawl all result pages for one word, resuming from page `go`."""
        is_go = True
        go_page = int(go)
        next_page_css = 'sogou_page_%s'

        is_break = self.open_weixin_browser(word)
        pages = self.get_total_pages_to_word()
        for page in range(self.start_page + 1, (pages or self.end_page) + 1):
            if is_go and page < go_page:
                continue  # skip pages crawled in a previous run
            else:
                is_go = False

            if not self.appear_element(by=next_page_css % page):
                is_break = True
                msg = '\tNot appear next page element, will break'
            elif self.is_forbidden:
                is_break = True
                msg = '\tSpider was forbidden, crawling again after sleeping a moment!'

            if is_break:
                # Remember where we stopped so the word can be re-crawled.
                storage_word.append([word, page])
                self.logger.info(msg)
                break

            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()

            # self.driver.find_element_by_id(next_page_css % page).click()
            # wt = randint(10, 40) if page % 5 == 0 else randint(5, 18)
            wt = randint(1, 5)  # polite random delay between result pages
            self.logger.info('Word <{}>, Page <{}> Done, sleeping {}s!'.format(word, page, wt))
            # self.driver.implicitly_wait(wt)
            time.sleep(wt)

        self.close_browser()

    @classmethod
    def crawl_with_threads(cls):
        """Crawl every query-word bulk with a 4-thread pool.

        Each worker call builds its own instance (own PhantomJS process);
        the shared Mongo client is closed when all bulks are done.
        """
        pool = ThreadPool(4)
        total_words = QueryWords().get_query_words()
        for bulk_words in total_words:
            try:
                pool.map(lambda w: cls().crawl_single(w), bulk_words)
            except Exception as e:
                cls.logger.info('Threads crawl error: type <{}>, msg <{}>'.format(e.__class__, e))

        pool.close()
        pool.join()
        in_client.close()

    def close_browser(self):
        """Close the driver window, ignoring already-closed windows."""
        try:
            self.driver.close()
        except (NoSuchWindowException,):
            pass
class WeixinPhantomjs(Base):
    """Crawl Sogou Weixin search for a sequence of query words read from Mongo.

    Unlike the thread-pooled variant, this class walks a whole ordered list
    of query words (starting from a given word) in a single browser
    session, de-duplicating article uids per instance.
    """

    def __init__(self):
        self.start_page = START_PAGE
        self.end_page = END_PAGE
        self.weixin_url = REFER_FIRST

        # self.driver = Firefox()
        if hasattr(config, 'PHANTOMJS_PATH'):
            self.driver = PhantomJS(executable_path=getattr(config, 'PHANTOMJS_PATH'))
        else:
            # Fall back to whatever `phantomjs` binary is on PATH.
            self.driver = PhantomJS()

        self.client = MongoClient(HOST, PORT)
        self.collection = self.client[DB][COLLECTION]
        # Per-instance de-dup set, seeded from Mongo (see the `uids` property).
        self.all_uids = self.uids

    def open_weixin_browser(self, word):
        """Search `word` on the Weixin search page and extract page-1 articles.

        Returns True when anything went wrong (the word is queued for retry
        in `storage_word` and the browser is closed), False on success.
        """
        try:
            self.driver.get(self.weixin_url)
            self.driver.set_page_load_timeout(3)
            self.driver.find_element_by_id('upquery').send_keys(word)
            self.driver.find_element_by_class_name('swz').click()
            time.sleep(3)

            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
        except Exception as e:
            storage_word.append([word, 0])
            # NOTE(review): 'mag' looks like a typo for 'msg' in this log line.
            self.logger.info('Open weixin error: type <{}>, mag <{}>'.format(e.__class__, e))
            self.close_browser()
            return True
        return False

    def get_total_pages_to_word(self):
        """Read the pagination bar and return how many result pages exist.

        When the bar ends with a non-digit token (e.g. a "next" link) the
        last collected page number is returned, capped at DEFAULT_PAGES.
        Returns 1 when the bar is missing or unparsable.
        NOTE(review): the ``return 1`` after the loop also fires when every
        token is a digit — confirm that is the intended fallback.
        """
        pages = []
        page_id_css = 'pagebar_container'

        try:
            e = self.driver.find_element_by_id(page_id_css)
            for _p in e.text.split():
                _p = _p.strip()
                if not _p.isdigit():
                    # pages[-1] raises IndexError on an empty list, which is
                    # swallowed by the except clause below (-> return 1).
                    return DEFAULT_PAGES if DEFAULT_PAGES <= pages[-1] else pages[-1]
                else:
                    pages.append(int(_p))
            return 1
        except (NoSuchElementException, NoSuchWindowException, TypeError, IndexError):
            pass
        return 1

    def get_query_words(self, word):
        """Build the ordered, de-duplicated word list and locate `word` in it.

        Words come from each document's 'conp' field followed by its 'rel'
        list, in _id order.  The Mongo client is closed afterwards.
        Returns ``query_index(...)``: (words from `word` onwards, start index).
        """
        query_words = []

        for docs in self.collection.find({}, {'rel': 1, 'conp': 1}).sort([('_id', 1)]):
            w = docs['conp']
            if w not in query_words:
                query_words.append(w)

            for item in docs['rel']:
                if item not in query_words:
                    query_words.append(item)

        self.client.close()
        return self.query_index(query_words, word)

    @property
    def uids(self):
        """All uids already stored in ``in_collection`` (for de-duplication)."""
        return {docs['uid'] for docs in in_collection.find({}, {'uid': 1}) if 'uid' in docs}

    def extract_urls_uids(self, word):
        """Collect {'url', 'uid'} dicts for result items not seen before.

        The uid is the md5 of timestamp + title + word; items whose uid is
        already in the per-instance ``all_uids`` set are skipped.
        """
        urls_uids = []
        timestamp = [_t.get_attribute('t') for _t in self.driver.find_elements_by_css_selector('div.s-p')]
        urls_tits = [(t.get_attribute('href'), self.trim(t.text)) for t in self.driver.find_elements_by_css_selector('h4 a')]
        # Timestamps and titles are scraped independently; bail out when
        # they cannot be zipped one-to-one.
        if len(urls_tits) != len(timestamp):
            return urls_uids

        for index, url_tit in enumerate(urls_tits):
            try:
                uid = self.md5(timestamp[index] + url_tit[1] + word)
                if uid not in self.all_uids:
                    self.all_uids.add(uid)
                    urls_uids.append({'url': url_tit[0], 'uid': uid})
            except (TypeError, IndexError):
                pass
        return urls_uids

    @staticmethod
    def query_index(words, cut_word):
        """Slice `words` to [START_INDEX:END_INDEX] and start from `cut_word`.

        Returns (remaining words, absolute start index); when `cut_word` is
        not in the slice, the whole slice is returned from START_INDEX.
        """
        temp_words = words[START_INDEX:END_INDEX]
        try:
            index = temp_words.index(cut_word)
            return temp_words[index:], index + START_INDEX
        except ValueError:
            pass
        return temp_words, START_INDEX

    @property
    def is_forbidden(self):
        """True when the CAPTCHA form is present, i.e. the crawler was blocked."""
        css_id = 'seccodeForm'
        try:
            if self.driver.find_element_by_id(css_id):
                return True
        except NoSuchElementException:
            pass
        return False

    def appear_element(self, by):
        """Wait up to 20s for element id `by` and click it; True on success."""
        try:
            # Have `click` function to specified element
            tag = WebDriverWait(self.driver, 20).until(lambda driver: driver.find_element_by_id(by))
            tag.click()
            return True
        except (TimeoutException, NoSuchWindowException, NoSuchElementException):
            pass
        return False

    def crawl(self, word=None, go=0):
        """Crawl every query word from `word` onwards, resuming at page `go`."""
        is_go = True
        is_break = False
        go_page = int(go)
        next_page_css = 'sogou_page_%s'

        query_words, ind = self.get_query_words(word)
        for index, word in enumerate(query_words, 1):
            next_ind = ind + index
            is_break = self.open_weixin_browser(word)
            pages = self.get_total_pages_to_word()

            for page in range(self.start_page + 1, (pages or self.end_page) + 1):
                if is_go and page < go_page:
                    continue  # skip pages crawled in a previous run
                else:
                    is_go = False

                if not self.appear_element(by=next_page_css % page):
                    is_break = True
                    msg = '\tNot appear next page element, will break'
                elif self.is_forbidden:
                    is_break = True
                    msg = '\tSpider was forbidden, crawling again after sleeping a moment!'

                if is_break:
                    # Remember where we stopped so the word can be re-crawled.
                    storage_word.append([word, page])
                    self.logger.info(msg)
                    break

                urls_uids = self.extract_urls_uids(word=word)
                Article(urls_uids=urls_uids, word=word).extract()

                # self.driver.find_element_by_id(next_page_css % page).click()
                # Longer pause every third page, shorter otherwise.
                wt = randint(10, 40) if page % 3 == 0 else randint(5, 18)
                self.logger.info('Index <{}>, Word <{}>, Page <{}> Done, sleeping {}s!'.format(next_ind, word, page, wt))
                # self.driver.implicitly_wait(wt)
                time.sleep(wt)

            if is_break:
                # Stop the whole run once one word breaks.
                break

        in_client.close()
        self.close_browser()

    def close_browser(self):
        """Close the driver window, ignoring already-closed windows."""
        try:
            self.driver.close()
        except (NoSuchWindowException,):
            pass
from selenium.webdriver import Firefox, PhantomJS

driver = PhantomJS()

# Google Finance stock screener (URL-encoded): every company on the given
# exchange ({} placeholder) whose last price is between 0.1 and 1500.
url = ('https://www.google.com/finance?start=0&num=5000&q=%5B(exchange%20%3D'
       '%3D%20"{}")%20%26%20(last_price%20>%200.1)%20%26%20(last_price%20<'
       '%201500)%5D&restype=company&noIL=1')

driver.get(url.format('NYSE'))
# Lazy generator over the ticker symbols of all matching NYSE companies;
# it is fully consumed by list(nyse) below, before the next navigation.
nyse = (elem.text for elem in driver.find_elements_by_class_name('symbol'))

# Open the quote page of the first NYSE symbol and print the text of the
# 'pr' element (presumably the last price — confirm against the page markup).
driver.get('https://www.google.com/finance?q=NYSE%3A{}'.format(list(nyse)[0]))
print driver.find_element_by_class_name('pr').text

# driver.get(url.format('NASDAQ'))
# nasdaq = (elem.text for elem in driver.find_elements_by_class_name('symbol'))
# print '\n'.join(list(nasdaq))