def load(self, path=None, force_load=None, auto_cast=None, load_empty=None, merge_how='right'):
    """Load configuration from file.

    Args:
        path: file to load; defaults to the instance's current path.
        force_load: reload default values when the file is missing or unreadable.
        auto_cast: cast values while reading; defaults to the instance setting.
        load_empty: whether an empty configuration may replace the current one.
        merge_how: merge strategy forwarded to `merge` (default 'right').
    """
    path = self._path if path is None else Path(path)
    auto_cast = self._auto_cast if auto_cast is None else auto_cast
    force_load = self._force_load if force_load is None else force_load
    load_empty = self._load_empty if load_empty is None else load_empty
    # BUG FIX: pathlib.Path has no `isfile` attribute — the original
    # `if not path.isfile` raised AttributeError; pathlib uses is_file().
    if not path.is_file():
        if force_load:
            self.reload_default()
        return None
    config_dict = self.read_config(path, auto_cast=auto_cast)
    if config_dict is None:
        if force_load:
            self.reload_default()
        return None
    elif path.is_file():  # fixed: was `path.isfile` (same AttributeError)
        # Remember the new location for subsequent save/load calls.
        self._path = path
        logger.debug("Path changed to '{}' with 'load' method.".format(self.path))
    elif config_dict.isempty:
        # NOTE(review): unreachable while the previous branch tests is_file(),
        # which was already guaranteed above — confirm the intended branch order.
        if not load_empty:
            logger.info("Configuration loaded is empty! It won't replace the existing one.")
            logger.debug("Use 'clear' method to empty the ConfigDict")
            return None
    else:
        raise UnknownError("bad case in 'load'")
    # config_dict.section = self.section
    # config_dict.default_section = self.default_section
    # config_dict._conversion_dict = self._conversion_dict
    self.merge(config_dict, how=merge_how, inplace=True)
    logger.info("Configuration loaded!")
def pause_rest_alert(self, option_text):
    """Toggle the rest-break reminder; record the start time when enabling."""
    enabled = not self.is_rest_alert
    self.is_rest_alert = enabled
    if not enabled:
        logger.info("[休息提醒] 关闭休息提醒")
        return
    self.rest_start_time = time()
    logger.info("[休息提醒] 开启休息提醒")
def start_crawl(self):
    """Crawl proxy sources rule by rule, stopping once enough proxies validate.

    Iterates self.rules; each rule describes one proxy source (urls, headers,
    anti-crawl handling). Sets settings.SPIDER_RUNNING to False when done.
    """
    for rule in self.rules:
        urls = rule['resources']
        gfw = rule['GFW']
        name = rule['name']
        page_type = rule['type']
        referer = rule['referer']
        host = rule['host']
        anti_crawl = rule['AntiCrawl']
        cookies = None
        if anti_crawl:
            # SECURITY/IDIOM FIX: the original resolved and invoked the
            # cracker through nested eval() on format-built strings;
            # getattr() does the same lookup without string evaluation.
            cookies = getattr(self.crack_anti_crawl, 'crack_{}'.format(name))()
        ua = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:67.0) Gecko/20100101 Firefox/67.0'
        headers = {'User-Agent': ua, 'Referer': referer, 'Host': host}
        # loop = asyncio.get_event_loop()
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        # NOTE: `proxy_downlaod` is the (misspelled) name of the sibling
        # coroutine; kept as-is so the call still resolves.
        tasks = [
            self.proxy_downlaod(url, gfw, page_type, name, headers, cookies)
            for url in urls
        ]
        loop.run_until_complete(asyncio.wait(tasks))
        # 检测有效proxy数量,达到指定数量,停止爬取
        validated_proxy_num = self.redis.count(VALIDATED_SCORE, VALIDATED_SCORE)
        if validated_proxy_num >= VALIDATED_PROXY_NUM:
            break
    settings.SPIDER_RUNNING = False
    logger.info('scrawl finished')
def activate_assets(self, idlist_file, item, asset):
    """Read asset ids from `idlist_file`, then run the trigger and check activation passes."""
    with open(idlist_file) as handle:
        id_list = [line.strip() for line in handle.read().splitlines()]
    logger.info('%d available %s images' % (len(id_list), item))
    for step in (self.trigger_activation, self.check_activation):
        self.process_activation(step, id_list, item, asset)
def get_user_date(title=None, message=None, default_date=None, dateformat="%Y-%m-%d", bypass_dialog=False, behavior_on_error='ignore'):
    """Ask the user to pick a date via a Tk dialog (deprecated).

    Returns the picked date; when `bypass_dialog` is set, returns
    `default_date` (or now) without showing any UI.
    """
    import warnings
    warnings.warn(
        "get_user_date is deprecated. Use simpledialog.askdate instead",
        DeprecationWarning)
    # Headless / scripted mode: skip the dialog entirely.
    if bypass_dialog:
        return default_date or datetime.datetime.now()
    dialog_title = "Select a date" if title is None else title
    dialog_message = ("Enter the date (ISO format YYYY-MM-DD):"
                      if message is None else message)
    picker = DatePickerTk(title=dialog_title, message=dialog_message,
                          default_date=default_date, dateformat=dateformat)
    picker.mainloop()
    date_selected = picker.date_picker.current_date
    logger.info('Date picked: {}'.format(date_selected))
    if not date_selected:
        msg = 'Wrong date selected. Please try again.'
        raise_anomaly(flag=behavior_on_error, error=ValueError,
                      title=msg, message=msg)
    return date_selected
def pause_slack_alert(self, option_text):
    """Toggle the slacking-off reminder; record the start time when enabling."""
    toggled = not self.is_slack_alert
    self.is_slack_alert = toggled
    if toggled:
        self.slack_start_time = time()
        logger.info("[休息提醒] 开启摸鱼提醒")
        return
    logger.info("[休息提醒] 关闭摸鱼提醒")
def _get_func_time(*args, **kwargs):
    """Invoke the wrapped `func`, logging how many seconds the call took."""
    started = datetime.now()
    result = func(*args, **kwargs)
    elapsed = (datetime.now() - started).total_seconds()
    logger.info("func is {0}, time spending is {1}".format(
        func.__name__, elapsed))
    return result
def test_002(self, drivers):
    """Search suggestions should NOT all contain the query string."""
    page = SearchPage(drivers)
    page.input_search("selenium")
    logger.info(list(page.imagine))
    assert not all(["selenium" in hint for hint in page.imagine])
async def get_page(self, page_num):
    """Drive the order-list pagination input to `page_num` and wait for completion.

    Resets self.completed, types the page number into the pagination box and
    presses Enter; solves slider captchas when they block the page. Returns
    self.completed (updated elsewhere by the response handler), or the slider
    result when captcha handling short-circuits the flow.
    """
    await self.page.bringToFront()
    logger.info("订单列表页爬虫,第 " + str(page_num) + " 页开始")
    # Reset the completion flag; it is set elsewhere once the page's data arrives.
    self.completed = 0
    try:
        await self.page.waitForSelector(".pagination-options-go")
        await self.page.focus(".pagination-options input")
        # Clear any digits already present in the page-number input.
        for _ in range(3):
            await self.page.keyboard.press("Delete")
            await self.page.keyboard.press("Backspace")
        await self.listening(self.page)
        await self.page.type(".pagination-options input", str(page_num))
        await self.page.keyboard.press("Enter")
        # await self.page.waitForResponse(self.url)
        # while self.captcha:
        #     t = await self.login.slider(self.page)
        #     if t:
        #         return t
    except Exception as e:
        # A missing pagination selector usually means a slider captcha is
        # blocking the page; try to solve it, otherwise log the error.
        if re.search('\"\.pagination-options-go\"', str(e)):
            t = await self.login.slider(self.page)
            if t:
                return t
        else:
            logger.error(str(e))
    # Keep solving the slider until the listener marks this page complete.
    while not self.completed:
        await self.login.slider(self.page)
        await asyncio.sleep(2)
    logger.info("订单列表页爬虫,第 " + str(page_num) + " 页完成")
    await my_async_sleep(15, True)
    return self.completed
def get_session(name, password):
    """Log in to weibo and return an authenticated session, or None on failure.

    Args:
        name: account user name.
        password: account password.
    Returns:
        The requests session when login succeeds and the account is usable,
        otherwise None.
    """
    url, cid, session = do_login(name, password)
    if url != '':
        # NOTE(review): this aliases and mutates the shared module-level
        # `headers` dict; `_headers` itself is never used afterwards.
        _headers = headers
        _headers['Host'] = 'passport.weibo.com'
        rs_cont = session.get(url, verify=False, allow_redirects=False)
        if rs_cont.status_code == 302:
            # 访问微博官方账号看是否正常
            check_url = 'http://weibo.com/2671109275/about'
            resp = session.get(check_url)
            # BUG FIX: the frozen-account check must inspect the check-url
            # response (`resp`), not `rs_cont`, which is already known to be
            # 302 inside this branch and therefore could never equal 403.
            # 通过实验,目前发现未经过手机验证的账号是救不回来了...
            if resp.status_code == 403:
                logger.error(u'账号{}已被冻结'.format(name))
                # freeze_account(name, 0)
                return None
            logger.info(u'本次登陆账号为:{}'.format(name))
            return session
    return None
def route(self, data):
    """Dispatch an incoming request payload to the matching task handler."""
    logger.info("get a data:" + str(data))
    task_ID = parse.paresID(data)
    request_kind = data[parse.MAIN_KEY.request]
    if request_kind == parse.REQUEST.startTask:
        return self.startTask(task_ID)
    if request_kind == parse.REQUEST.stopTask:
        return self.stopTask(task_ID)
async def init_page_to_listening(self):
    """Launch a headless browser (optionally behind a proxy), open the crawl
    target page and idle while the request/response listeners run."""
    # Fetch an IP proxy stored under tools/data.
    if PROXY_SWITCH:
        proxy = read('item_proxy')
        if not proxy:
            proxy = self._set_proxy()
        logger.info("当前代理IP:" + proxy)
        # Launch a browser that routes its traffic through the proxy.
        self._browser = await launch(headless=True,
                                     executablePath=CHROME_PATH,
                                     args=[f'--proxy-server={proxy}'])
    else:
        self._browser = await launch(headless=True, executablePath=CHROME_PATH)
    # self._browser = await launch(autoClose=False, headless=False, args=[f'--proxy-server={proxy}'])
    # Open a fresh page in the browser.
    self._page = await self._browser.newPage()
    # Set the page's User-Agent request header.
    await self._page.setUserAgent(get_user_agent())
    # Register request/response listeners (intercept_request / intercept_response callbacks).
    await self._page 若 if False else None  # (removed) — placeholder must not appear
def _get_user_info(self):
    """Extract the user name, weibo count, following and follower counts
    from self.html into self.user_info; each stage logs and continues on failure."""
    # getting user name
    try:
        selector = etree.HTML(self.html)
        self.user_info['userName'] = selector.xpath(
            '//table//div[@class="ut"]/span/text()')[0]
        logger.info('user name is %s' % self.user_info['userName'])
    except Exception as e:
        logger.error('getting user name failed for:{}'.format(str(e)))
    # getting user other info
    try:
        selector = etree.HTML(self.html)
        # Matches integers or decimals inside the counter texts.
        pattern = r"\d+\.?\d*"
        str_wb = selector.xpath('//span[@class="tc"]/text()')[0]
        guid = re.findall(pattern, str_wb, re.S | re.M)
        # Only the first number in the counter line is the weibo count.
        for value in guid:
            num_wb = int(value)
            break
        self.user_info['weiboNum'] = num_wb
        # First tip2 link holds the following count, second the followers count.
        str_gz = selector.xpath("//div[@class='tip2']/a/text()")[0]
        guid = re.findall(pattern, str_gz, re.M)
        self.user_info['following'] = int(guid[0])
        str_fs = selector.xpath("//div[@class='tip2']/a/text()")[1]
        guid = re.findall(pattern, str_fs, re.M)
        self.user_info['followers'] = int(guid[0])
        logger.info(
            'current user all weibo num {}, following {}, followers {}'.
            format(self.user_info['weiboNum'], self.user_info['following'],
                   self.user_info['followers']))
    except Exception as e:
        logger.error('getting user info failed for:{}'.format(str(e)))
def build(args):
    """Build the grammar and/or the caramel compiler according to CLI flags."""
    if not (args.grammar or args.caramel or args.all):
        logger.warn('Nothing to build.')
        return
    logger.info('Build started...')
    started_at = time()
    # --release only makes sense when the caramel compiler itself is built.
    if args.debug and not args.caramel:
        logger.warn('--release ignored because -c / --caramel is absent.')
        args.debug = False
    if args.all:
        args.grammar = args.caramel = True
    if args.grammar:
        build_grammar(args)
    if args.caramel:
        build_caramel(args)
    elapsed = time() - started_at
    logger.info('Build finished. Total time: {}.'.format(
        colored(seconds_to_string(elapsed), color='yellow')))
def read_book(url):
    """Scrape one Douban book-list page and export the rows to an Excel file.

    Args:
        url: URL of a Douban book-list page ('subject-item' entries).
    """
    f = requests.get(url)  # fetch the page HTML
    soup = BeautifulSoup(f.content, "lxml")  # parse with the lxml backend
    n1 = datetime.now()
    title = ['书名', '作者', '出版社', '出版日期', '价格', '评分', '参与参分人', '简述', '详情链接地址']
    rows = [title]
    book_info = soup.find_all('li', class_='subject-item')
    for k in book_info:
        book_name = k.find('h2').a['title']
        book_detail_url = k.find('h2').a['href']
        # The pub line looks like "author / publisher / date / price".
        auth_info = k.find('div', class_='pub').string.rsplit("/", 3)
        auth_info = [part.strip() for part in auth_info]
        book_point = k.find("span", class_="rating_nums").string
        person = k.find("span", class_="pl").string
        person = person.strip()[:-4][1:]  # strip the "(... 人评价)" wrapper
        resume = k.p.string
        rows.append([book_name] + auth_info
                    + [book_point, person, resume, book_detail_url])
    # PERF FIX: build a plain list and convert once — the original called
    # np.append(..., axis=0) per row, re-allocating the whole array (O(n^2)),
    # and printed the growing array on every iteration.
    result = np.array(rows, dtype=str)
    print(result)
    generate_excel("豆瓣读书", "小说", "./", result)
    n2 = datetime.now()
    logger.info((n2 - n1).total_seconds())
def mail_pic(*args):
    """Send the login QR code (./qrcode.png) as an inline e-mail image.

    Returns True when the mail is sent, False on SMTP failure.
    """
    msg = MIMEMultipart()
    msg['From'] = Header(MAIL_SENDER)  # sender address
    msg['To'] = Header(",".join(*args))  # receivers, comma-joined
    msg['Subject'] = "登陆二维码"  # message subject
    boby = """
    <br><img src="cid:image1"></br>
    """
    msg.attach(MIMEText(boby, _subtype='html', _charset='utf-8'))
    # Attach the QR code image referenced by cid:image1 in the HTML body.
    with open("./qrcode.png", 'rb') as fp:
        images = MIMEImage(fp.read())
    images.add_header('Content-ID', '<image1>')
    msg.attach(images)
    try:
        server = smtplib.SMTP_SSL("smtp.qq.com", 465)
        server.login(MAIL_SENDER, MAIL_PASS)
        server.sendmail(MAIL_SENDER, *args, msg.as_string())
        server.quit()
        logger.info("邮件发送成功")
    except smtplib.SMTPException as e:
        logger.info("Error: 无法发送邮件")
        return False
    else:
        return True
def request_sys(req_url, request_data, method, reqheaders):
    """Send a GET/POST request and return the decoded JSON body.

    Args:
        req_url: target URL.
        request_data: query params (GET) or body payload (POST).
        method: 'GET' or 'POST'.
        reqheaders: headers dict; mutated for POST (Accept forced to JSON).
    Returns:
        The parsed JSON response, or None on error / unsupported method.
    """
    logger.info("request_data is {0}".format(request_data))
    logger.info("headers is {0}".format(reqheaders))
    try:
        if 'GET' == method:
            result = requests.get(url=req_url, params=request_data,
                                  headers=reqheaders)
            logger.info("res content is {0}".format(result))
            # BUG FIX: json.loads needs the response body; the original passed
            # the Response object itself, which always raised TypeError.
            return json.loads(result.content)
        elif 'POST' == method:
            reqheaders['Accept'] = 'application/json'
            if reqheaders.get('Content-Type') == 'application/json':
                request_data = json.dumps(request_data, cls=APIEncoder)
            result = requests.post(url=req_url, data=request_data,
                                   headers=reqheaders).content
            logger.info("res content is {0}".format(result))
            return json.loads(result)
        else:
            logger.info("method error, current method is {0}".format(method))
    except Exception:
        # BUG FIX: traceback.format_exc() takes no exception argument — the
        # original passed `e` as the `limit` parameter, raising its own error.
        logger.error('request_order_sys access error:%s' %
                     (traceback.format_exc(), ))
    return None
def test_backend(args):
    """Discover and run the back-end test suite (optionally building first)."""
    # TODO: Refactor => factorize semantic and back-end?
    logger.info('Running back-end tests...')
    if args.build:
        from tools.build import build_grammar, build_caramel
        build_grammar(args)
        build_caramel(args)
    # Interactive mode ignores any explicitly listed test files.
    if args.interactive and len(args.test_files) > 0:
        logger.warn('Running in interactive mode, ignoring test files.')
        args.test_files = []
    if args.all:
        args.test_files = None
    suite = BackendTests()
    if args.interactive:
        suite.add_test('interactive test', '', False)
    else:
        suite.discover(PATHS['backend-test-dir'], only=args.test_files)
    suite.run_all(open_gui=args.gui,
                  open_gui_on_failure=args.gui_on_failure,
                  show_stdout=args.stdout,
                  show_stderr=args.stderr)
    print_failed_tests()
def test_all(args):
    """Run the grammar, semantic, back-end and program test suites in sequence.

    Each suite receives its own copy of `args` with all tests selected and
    the GUI options disabled.
    """
    logger.info('Running all tests...')

    def _run_suite(runner):
        # One args copy per suite so suites cannot affect each other's flags.
        suite_args = copy(args)
        suite_args.all = True
        suite_args.gui = False
        suite_args.gui_on_failure = False
        runner(suite_args)

    # DECOMPOSITION: the original repeated the same 5-line block four times.
    _run_suite(test_grammar)
    _run_suite(test_semantic)
    _run_suite(test_backend)
    _run_suite(test_programs)
    print_failed_tests()
def test_semantic(args):
    """Discover and run the semantic test suite (optionally building first)."""
    logger.info('Running semantic tests...')
    if args.build:
        from tools.build import build_grammar, build_caramel
        build_grammar(args)
        build_caramel(args)
    # Interactive mode ignores any explicitly listed test files.
    if args.interactive and len(args.test_files) > 0:
        logger.warn('Running in interactive mode, ignoring test files.')
        args.test_files = []
    if args.all:
        args.test_files = None
    suite = SemanticTests()
    if args.interactive:
        suite.add_test('interactive test', '', False)
    else:
        suite.discover(PATHS['semantic-test-dir'], only=args.test_files)
    suite.run_all(open_gui=args.gui,
                  open_gui_on_failure=args.gui_on_failure,
                  show_stdout=args.stdout,
                  show_stderr=args.stderr)
    print_failed_tests()
def test_grammar(args):
    """Discover and run the grammar test suite (optionally rebuilding the grammar)."""
    logger.info('Running grammar tests...')
    if args.build:
        from tools.build import build_grammar
        build_grammar(args)
    # Interactive mode ignores any explicitly listed test files.
    if args.interactive and len(args.test_files) > 0:
        logger.warn('Running in interactive mode, ignoring test files.')
        args.test_files = []
    if args.all:
        args.test_files = None
    runner = GrammarTests()
    runner.check_build()
    if args.interactive:
        runner.add_test('interactive test', '', False)
    else:
        runner.discover(PATHS['grammar-test-dir'], only=args.test_files)
    runner.run_all(open_gui=args.gui,
                   open_gui_on_failure=args.gui_on_failure,
                   show_stdout=args.stdout,
                   show_stderr=args.stderr)
    print_failed_tests()
def _get_item():
    """Block until an unprocessed tb_master row is available, then return it.

    Polls the database, sleeping between attempts; price fields are coerced
    to float and a 'typeabbrev' code is derived from the shop id.
    """
    column_name = [
        "shop_id",
        "link_id",
        "description",
        "price_tb",
        "promotionprice",
        "sales",
        "rates",
    ]
    while True:
        results = MySql.cls_get_dict(db_setting=TEST_SERVER_DB_TEST,
                                     t="tb_master",
                                     c={
                                         "isUsed": 0,
                                         "isMut": 1,
                                         "flag!": "XiaJia"
                                     },
                                     cn=column_name,
                                     l=["0", "1"])
        if not results:
            logger.info('没有数据需要爬取!')
            my_sleep()
            continue
        item = results[0]
        item['price_tb'] = float(item['price_tb'])
        item['promotionprice'] = float(item['promotionprice'])
        item['typeabbrev'] = store_trans(item['shop_id'], 'id_2_code')
        return item
async def parse(self, main_orders, page_num):
    """Parse and persist the main orders of one listing page.

    Sets self.completed to 2 when an order older than the cutoff is reached
    (crawl round finished), otherwise to 1 after the whole page is handled.
    """
    # print(main_orders)
    ms = MySql()
    t = time_zone(["08:00", "23:00", "23:59"])
    a = datetime.datetime.now()
    # Pick how many days back orders may date, depending on time of day.
    if a < t[0]:
        eoc = EARLIEST_ORDER_CREATE_TIME
    elif t[0] < a < t[1]:
        eoc = 2
    else:
        eoc = 60
    for i in range(len(main_orders)):
        continue_code = 0
        # Some order items are refunded before payment; such rows get dropped outright.
        # Parse the order and save it to the database.
        sub_orders, tb_order_item = await self.parse_order_item(
            i, main_orders, ms)
        if not sub_orders:
            return
        # Parse and save the order's detailed item rows.
        await self.parse_order_detail_item(continue_code, i, main_orders,
                                           sub_orders, tb_order_item, ms)
        date = datetime.date.today()
        date_limit = (
            date - datetime.timedelta(eoc)).strftime("%Y-%m-%d %H:%M:%S")
        # Orders older than the cutoff end the crawl round.
        if tb_order_item.createTime < date_limit:
            logger.info("完成本轮爬取,共翻 " + str(page_num) + " 页。")
            self.completed = 2
            del ms
            return
    self.completed = 1
    del ms
def taskerToComplete(ID):
    """Mark the task identified by ID as complete and stamp its finish time."""
    msg = "taskerToComplete"
    logger.info(msg)
    task_info = taskDB.getTaskInfo(ID)
    task_info.task_finish_time = datetime.datetime.now()
    taskDB.changeTaskInfo(task_info)
    changeTaskerStatus(ID, msg, parse.TASK_STATUS.complete)
async def input_verify_code(self, frame, fromStore, type):
    """Handle the phone-verification-code flow for a store login.

    Notifies operators by mail, polls the phone_verify table until a code is
    entered there, then types it into the form variant selected by `type`
    (0 or non-zero, matching the two sets of selectors).
    """
    logger.info("需要要手机验证码")
    ms = MySql(db_setting=TEST_SERVER_DB_TEST)
    # Reset any stale verification row for this store, then create a fresh one.
    ms.delete(t='phone_verify', c={'fromStore': fromStore})
    ms.insert(t="phone_verify", d={"fromStore": fromStore})
    mail(fromStore + "手机验证码", fromStore + "登陆需要手机验证码", MAIL_RECEIVERS)
    verify_code = "0"
    while 1:
        # Request that the code be sent; `type` selects the form on screen.
        if type == 0:
            await frame.click(PHONE_GET_CODE[0])
        else:
            await frame.click(PHONE_GET_CODE[1])
        # Poll the database for up to 120 * 5s for the operator-entered code.
        for i in range(120):
            await asyncio.sleep(5)
            verify_code = ms.get_one(t='phone_verify',
                                     cn=['verify_code'],
                                     c={"fromStore": fromStore})
            if verify_code != "0":
                ms.delete(t='phone_verify', c={'fromStore': fromStore})
                del ms
                break
        if verify_code != "0":
            break
        await asyncio.sleep(10)
    # Type the received code and submit the matching form variant.
    if type == 0:
        await frame.type(PHONE_CHECK_INPUT[0], verify_code,
                         {'delay': self.input_time_random() - 50})
        await frame.click(PHONE_SUBMIT_BTN[0])
    else:
        await frame.type(PHONE_CHECK_INPUT[1], verify_code,
                         {'delay': self.input_time_random() - 50})
        await frame.click(PHONE_SUBMIT_BTN[1])
def insertOrUpdateUser(self, usermap):
    """Insert a new user binding or update an existing one.

    Args:
        usermap: dict with 'openid', 'username' and 'password' keys.
    Returns:
        True on success, False on any failure.
    """
    session = Session()
    user = User()
    user.openid = usermap["openid"]
    user.username = usermap["username"]
    user.password = usermap["password"]
    flag = self.selectUserFlag(user.openid)
    try:
        if flag == 1:
            # New binding: insert the user row.
            try:
                session.add(user)
                session.commit()
                logger.info("用户" + user.openid + "用户绑定成功")
                return True
            except Exception:
                # ROBUSTNESS FIX: the original bare `except:` swallowed every
                # exception (incl. SystemExit) without any traceback.
                logger.exception("用户" + user.openid + "用户绑定失败")
                return False
        elif flag == 2:
            # Existing row: refresh credentials and re-activate the binding.
            try:
                user = session.query(User).filter_by(openid=user.openid).first()
                user.username = usermap["username"]
                user.password = usermap["password"]
                user.flag = 1
                session.commit()
                logger.info("用户" + user.openid + "用户更新成功")
                return True
            except Exception:
                logger.exception("用户" + user.openid + "用户绑定失败")
                return False
        else:
            logger.error("用户" + user.openid + "用户绑定失败")
            return False
    finally:
        # RESOURCE FIX: the original never closed the session (connection leak).
        session.close()
def scrape_a_user_profile(self, username):
    """Scrape one user's profile, rotating proxies until success or max_fails.

    Each attempt takes a proxy from the queue; empty results count as a
    failure and rotate the proxy, a non-empty profile is saved and breaks
    the loop. Start/ok/dead/end events are written via log_scraping_profile.
    """
    # Todo: proxy stats
    log_scraping_profile(self.session_id, 'begin', 'profile', username)
    time.sleep(
        random() * 5
    )  # Todo: DOESN'T WORK — otherwise other objects in other processes also start checking proxies, filling the queue with the same proxies
    self._check_proxy_queue()
    fail_counter = 0
    while fail_counter < self.max_fails:
        proxy = self.proxy_queue.get()
        profile_scraper = ProfileScraper(self.c)
        profile_scraper.proxy_server = proxy
        logger.info(
            f'Start scraping profiles | {username}, {proxy["ip"]}:{proxy["port"]}, queue={self.proxy_queue.qsize()}, fail={fail_counter}'
        )
        # Todo: When I don't add raise to the get.py / def User(...) / line 197, then fail silently.
        # No distinction between existing user with proxy failure and canceled account.
        # When I add raise, twint / asyncio show error traceback in terminal
        # ? What happens with proxies when username is canceled? Sometimes TimeoutError or TypeError
        try:
            # Todo: Refactor: make method and use also in scrape_a_user_tweets
            profile_df = profile_scraper.execute_scraping(username)
        except:
            # NOTE(review): bare except that dumps exc_info and re-raises —
            # deliberate debugging aid, kept as-is.
            print('x' * 100)
            print(sys.exc_info()[0])
            print(sys.exc_info())
            print('x' * 100)
            raise
        else:
            if profile_df.empty:  # ProfileScrapingError
                # Empty result: count as a failure and try the next proxy.
                logger.error(
                    f'Empty profile | {username}, {proxy["ip"]}:{proxy["port"]}, queue={self.proxy_queue.qsize()}, fail={fail_counter}'
                )
                update_proxy_stats('ProfileScrapingError', proxy)
                fail_counter += 1
                time.sleep(random() * 5)
            else:  # ok
                logger.info(
                    f'Saving profile | {username}, {proxy["ip"]}:{proxy["port"]}, queue={self.proxy_queue.qsize()}, fail={fail_counter}'
                )
                log_scraping_profile(self.session_id,
                                     'ok',
                                     'profile',
                                     username,
                                     proxy=proxy)
                save_a_profile(profile_df)
                update_proxy_stats('ok', proxy)
                self._release_proxy_server(proxy)
                break
        finally:
            if fail_counter >= self.max_fails:
                # Dead: every allowed attempt failed for this username.
                txt = f'dead | {username}, {proxy["ip"]}:{proxy["port"]}, queue={self.proxy_queue.qsize()}, fail={fail_counter}'
                logger.error(txt)
                log_scraping_profile(self.session_id,
                                     'dead',
                                     f'profile',
                                     username,
                                     proxy=proxy)
    log_scraping_profile(self.session_id, 'end', 'profile', username)
def ie():
    """Smoke-test the IE webdriver: open a maximized window, then close it."""
    try:
        browser = webdriver.Ie(executable_path=IE_DRIVER)
        browser.maximize_window()
        browser.close()
        logger.info('ie driver ok')
    except Exception as e:
        logger.error(f'ie driver failed; {e}')
def __check_platform(self):
    """Validate that self.platform is one of the supported messenger platforms."""
    startup_messages = {
        'vk': 'reminders_vk is started',
        'tg': 'reminders_tg is started',
    }
    if self.platform not in startup_messages:
        raise ValueError(f'Нет такой платформы: {self.platform}')
    logger.info(startup_messages[self.platform])
def request(self):
    """Consume items from self.queue forever, routing each one.

    Runs until an unexpected error occurs, which is logged with its traceback.
    """
    try:
        while True:
            data = self.queue.get()
            logger.info("get a data")
            self.route(data)
    except Exception:
        # FIX: the original bare `except:` also trapped SystemExit and
        # KeyboardInterrupt, making the worker impossible to stop cleanly.
        logger.error(traceback.format_exc())