def _single_by_key_city(self):
    """
    Crawl every keyword for every configured city in the current process.

    For each (keyword, city id) pair the pagination info is fetched from
    ``self.tyc_client``, then the pages are walked in batches of
    ``PAGINATION`` pages; each batch is crawled and stored right away.
    Exits the process when a city id has no entry in ``self.citys_list``.
    """
    for key in self.keys:
        if not key:
            continue
        for raw_cid in PLUS_CITYS:
            if not raw_cid:
                continue
            cid = raw_cid if isinstance(raw_cid, int) else int(raw_cid)
            city_info = self.citys_list.get(cid)
            if not city_info:
                LOG.info('@@@@@: %s is not have city information, exit...' % cid)
                sys.exit(1)
            city_full_name = city_info.get('full_name')

            first, last, total_pages, batches = self.tyc_client.get_pagination(
                key, _type='city', city_id=cid, cityes=self.citys_list)
            self._is_not_max_range_die(batches)
            LOG.info('[%s][%s][%s]spider page: %s ~ %s ||| max_pagination: %s ||| max range: %s'
                     % (RUN_MODE, key, city_full_name, first, last, total_pages, batches))
            if not isinstance(total_pages, int):
                total_pages = int(total_pages)

            for _ in range(0, batches, 1):
                # The last batch is clamped to the real page count.
                last = PAGINATION + first
                if last > total_pages:
                    last = total_pages
                self._print_info('[%s][%s][%s]%s ~ %s'
                                 % (RUN_MODE, key, city_full_name, first, last))
                batch = self.tyc_client.work_by_key(key, first, last,
                                                    cid=cid, city_info=city_info)
                self.to_store(key, first, last, datas=batch)
                first = last + 1
def _wrapper(*args, **kwargs):
    """
    Call the wrapped ``fn``, log how long it ran, and return its result.

    The duration is logged in whole seconds. ``timedelta.total_seconds()``
    is used rather than ``timedelta.seconds``: the latter is only the
    seconds *component* (0..86399), so a run longer than a day would be
    reported with the day part silently dropped.
    """
    start = datetime.now()
    res = fn(*args, **kwargs)
    elapsed = datetime.now() - start
    LOG.info('@timeer %s is run: %s' % (fn.__name__, int(elapsed.total_seconds())))
    return res
def login_in():
    """
    Login view.

    GET: render the index page when ``check_login()`` succeeds, otherwise
    the login page.
    POST: validate the submitted ``login_user`` / ``login_password``, store
    the submitted user id in the session on success and render the index
    page; on any failure re-render the login page with a message.
    Any other method: render the login page.
    """
    def _login_page(message):
        return render_template("login.html", login_message=message)

    def _index_page():
        g.menuf = 'index'
        g.menusub = 'index'
        return render_template("index.html")

    if request.method == 'GET':
        is_ok, message = check_login()
        return _index_page() if is_ok else _login_page("")

    if request.method != 'POST':
        return _login_page("")

    form = request.form
    user_id = form.get('login_user')
    user_pwd = form.get('login_password')
    if not user_id:
        return _login_page(u'请输入用户信息(ID、电话、邮箱)')
    if not user_pwd:
        return _login_page(u'请输入账号密码')

    if not SysUserService().get_user_by_params(user_id):
        return _login_page(u'账户未注册')

    # Login by user id, phone or e-mail is supported.
    if not SysUserService().check_user(user_id, user_pwd):
        return _login_page(u'账号密码不匹配')

    session['user_id'] = user_id
    g.menuf = 'index'
    g.menusub = 'index'
    LOG.info('%s login in ==========' % user_id)
    return render_template("index.html")
def adds(self, datas):
    """
    Insert a batch of enterprise records, skipping duplicates.

    A record is counted as a failure when it has no ``credit_code``, when a
    row with the same credit code already exists, or when the insert raises.

    :param datas: iterable of record dicts
    :return: ``(success_names, failure_names)``; ``None`` when ``datas`` is empty
    """
    if not datas:
        LOG.error('DB: data is null.')
        return

    # Fields copied verbatim from the record, in assignment order.
    fields_before_code = ('phone', 'email', 'tyc_url', 'company_url', 'address',
                          'register_funds', 'paidin_funds', 'establish_date',
                          'status')
    fields_after_code = ('company_type', 'industry', 'business_term', 'resume',
                         'business_scope', 'key')

    failure_list = []
    success_list = []
    for data in datas:
        if not data:
            continue
        if isinstance(data, str):
            data = dict(data)
        credit_code = data.get('credit_code')
        name = data.get('name')
        if not credit_code:
            failure_list.append(name)
            continue
        if self.enterprise_bo.get_by_code(credit_code):
            # Already stored under this credit code.
            failure_list.append(name)
            continue

        new_model = self.enterprise_bo.new_mode()
        new_model.name = name
        for field in fields_before_code:
            setattr(new_model, field, data.get(field))
        new_model.credit_code = credit_code
        for field in fields_after_code:
            setattr(new_model, field, data.get(field))
        new_model.create_time = get_now()
        new_model.city = data.get('city')
        new_model.sub_city = data.get('sub_city')

        try:
            self.enterprise_bo.add_model(new_model)
            success_list.append(name)
        except Exception as e:
            LOG.error('DB add error %s: %s' % (e, str(data)))
            failure_list.append(name)
    else:
        # Summary once the whole batch has been walked.
        if success_list:
            LOG.info('success list:【%s】' % len(success_list))
        if failure_list:
            LOG.info('failure list:【%s】' % len(failure_list))
    return success_list, failure_list
def register_blueprint(self, obj_n, obj):
    """
    Register one view blueprint on the Flask app.

    :param obj_n: blueprint name, used only in the log message
    :param obj: blueprint object; nothing is done when it is falsy
    :return: None
    """
    if not obj:
        return
    LOG.info('Blueprint %s is register' % obj_n)
    self.app.register_blueprint(obj)
def get_pagination(self, key):
    """
    Work out the page range to crawl for a search keyword.

    Returns the defaults ``(0, 5)`` for an empty key or in ``tyc`` mode.
    In ``pro`` mode the first search page is fetched and the maximum page
    is read from the second-to-last link of the ``search-pagination`` block.

    :param key: search keyword
    :return: ``(min_page, max_page)``
    """
    min_page, max_page = 0, 5
    if not key or API_MODE == 'tyc':
        return min_page, max_page

    if API_MODE == 'pro':
        url = '%s/p%s?key=%s' % (TYC_PRO_SEARCH_API, '0', parse.quote(key))

        def fetch_pagination():
            ok, body = api_get(url=url, headers=self.headers, data={},
                               resptype='text')
            blocks = BeautifulSoup(body, 'lxml').find_all(
                'div', class_='search-pagination')
            return ok, blocks

        is_ok, search_pagination = fetch_pagination()
        # Manual-verification support: keep retrying (with a pause) until the
        # request succeeds and the page contains a pagination block.
        while not (is_ok and len(search_pagination) > 0):
            LOG.critical('验证############### %s ###############' % url)
            random_sleep(20, 25)
            is_ok, search_pagination = fetch_pagination()

        anchors = search_pagination[0].find_all('a')
        target = len(anchors) - 2
        if target >= 0:
            # The second-to-last link carries the last page number,
            # possibly prefixed with '...'.
            max_page = anchors[target].string.strip()
            if max_page.find('...') > -1:
                max_page = max_page.split('...')[1]
            if isinstance(max_page, str):
                max_page = int(max_page)

    LOG.info('[%s] pagination max: %s' % (key, max_page))
    return min_page, max_page
def _process_by_key_only_sub_city(self):
    """
    Crawl every keyword for every configured sub-city with a process pool.

    For each (keyword, sub-city id) pair the parent city is resolved, the
    pagination info is fetched, and one ``work_by_key`` task per page batch
    is submitted to the pool. After all workers finish, the results they
    pushed to ``self.q`` are collected into ``self.ret_res_list`` and stored.
    Exits the process when a sub-city or its parent city is unknown.
    """
    worker_count = MAX_CPU - 1 if MAX_CPU > 2 else 1
    pool = multiprocessing.Pool(processes=worker_count)
    LOG.info('Main process: %s, run cpu count: %s' % (os.getpid(), worker_count))
    process = list()
    for key in self.keys:
        if not key:
            continue
        for sub_cid in PLUS_CITYS:
            if not sub_cid:
                continue
            if not isinstance(sub_cid, int):
                sub_cid = int(sub_cid)
            sub_city_info = self.sub_citys_dict.get(sub_cid)
            if not sub_city_info:
                LOG.info('@@@@@: %s is not have sub_city information, exit...' % sub_cid)
                sys.exit(1)
            sub_city_pid = sub_city_info.get('parent_id')
            city_info = self.citys_list.get(sub_city_pid)
            if not city_info:
                LOG.info('@@@@@: %s is not have parent city information, exit...' % sub_cid)
                sys.exit(1)
            city_id = city_info.get('id')
            city_full_name = city_info.get('full_name')
            sub_city_id = sub_city_info.get('id')
            min_page, max_page, max_pagination, max_range = self.tyc_client.get_pagination(
                key, _type='sub_city', city_id=city_id, sub_city_id=sub_city_id,
                cityes=self.citys_list, sub_city_info=sub_city_info)
            max_pagination = int(max_pagination)
            LOG.info('[%s][%s][%s-%s]spider page: %s ~ %s ||| max_pagination: %s ||| max range: %s'
                     % (RUN_MODE, key, city_full_name, sub_city_info.get('full_name'),
                        min_page, max_page, max_pagination, max_range))
            self._is_not_max_range_die(max_range)
            for _ in range(0, max_range, 1):
                # Clamp the last batch to the real page count.
                max_page = min_page + PAGINATION
                if max_page > max_pagination:
                    max_page = max_pagination
                process.append(
                    pool.apply_async(self.tyc_client.work_by_key,
                                     args=(key, min_page, max_page, 'city', self.q,
                                           city_id, sub_city_id, city_info,
                                           sub_city_info))
                )
                min_page = max_page + 1
    pool.close()
    pool.join()

    # Collect what the workers queued. The former bare ``except: pass`` also
    # swallowed KeyboardInterrupt/SystemExit and would spin forever on a
    # persistently failing queue; now a failure is logged and draining stops,
    # so whatever was collected so far still gets stored.
    while True:
        try:
            if self.q.empty():
                break
            self.ret_res_list.append(self.q.get_nowait())
        except Exception as e:
            LOG.error('result queue drain stopped: %s' % e)
            break
    self.to_store(self.keys, MIN_PAGE, MAX_PAGE)
def upload_image():
    """
    Handle the user-information upload (avatar file plus form fields).

    Delegates to ``SetterService().upload_info``; on any exception a
    failure ``Status`` JSON (code 101) is returned instead.
    """
    avatar_file = request.files.get('avatar')
    g.menuf = 'setter'
    g.menusub = 'user'
    try:
        payload = request.form
        service = SetterService()
        res = service.upload_info(avatar_file, payload)
    except Exception as e:
        LOG.error("setter>upload_info is error: %s" % e)
        failure = Status(101, 'failure', u'Server发生错误,获取失败', {})
        res = failure.json()
    LOG.info('%s update information' % get_user_id())
    return res
def to_store(self, keys, min_page, max_page, datas=None, excelname=None):
    """
    Persist crawl results to Excel and/or the database.

    :param keys: one keyword or a list of keywords (used for naming/logging)
    :param min_page: first page of the stored batch
    :param max_page: last page of the stored batch
    :param datas: records to store; when falsy, ``self.ret_res_list`` is used
    :param excelname: unused; kept so existing callers keep working
    :return: None

    When either page bound is falsy the batch is labelled ``ALL``.
    """
    _keys = keys if isinstance(keys, list) else [keys]
    _data = datas if datas else self.ret_res_list
    has_page_range = bool(min_page and max_page)

    if STORE_EXCEL:
        if has_page_range:
            file_name = '%s[%s]-%s[%s~%s].xls' % (get_now(), API_MODE,
                                                  '_'.join(_keys), min_page, max_page)
        else:
            file_name = '%s[%s]-%s[ALL].xls' % (get_now(), API_MODE, '_'.join(_keys))
        to_excel_name = os.path.join(get_excel_folder(), file_name)
        self.excel_client.to_excel(_data, ATTRS_DICT, to_excel_name)
        LOG.info("Excel is finished[%s ~ %s]: %s" % (min_page, max_page, to_excel_name))

    if STORE_DB:
        self.enterprise_service.adds(_data)
        # Decide on the arguments that are actually printed. The previous
        # code tested the module constants MIN_PAGE/MAX_PAGE here, which
        # disagreed with the Excel branch above.
        if has_page_range:
            LOG.info('DB is finished[%s ~ %s]: %s' % (min_page, max_page, '_'.join(_keys)))
        else:
            LOG.info('DB is finished[ALL]: %s' % ('_'.join(_keys)))
def _single_by_key(self):
    """
    Crawl every keyword in the current process, batch by batch.

    For each keyword the pagination info is fetched, then the pages are
    walked in batches of ``PAGINATION`` pages; each batch is crawled with
    ``work_by_key`` and stored immediately.
    """
    for key in self.keys:
        if not key:
            continue
        first, last, total_pages, batches = self.tyc_client.get_pagination(key)
        self._is_not_max_range_die(batches)
        LOG.info('[%s][%s]spider page: %s ~ %s ||| max_pagination: %s ||| max range: %s'
                 % (RUN_MODE, key, first, last, total_pages, batches))
        if not isinstance(total_pages, int):
            total_pages = int(total_pages)

        for _ in range(0, batches, 1):
            # The last batch is clamped to the real page count.
            last = PAGINATION + first
            if last > total_pages:
                last = total_pages
            self._print_info('[%s][%s]%s ~ %s' % (RUN_MODE, key, first, last))
            batch = self.tyc_client.work_by_key(key, first, last)
            self.to_store(key, first, last, datas=batch)
            first = last + 1
def _process_by_key(self):
    """
    Crawl every keyword with a process pool, one task per page batch.

    After all workers finish, the results they pushed to ``self.q`` are
    collected into ``self.ret_res_list`` and stored in one go.
    Exits the process when pagination reports no batches for a keyword.
    """
    worker_count = MAX_CPU - 1 if MAX_CPU > 2 else 1
    pool = multiprocessing.Pool(processes=worker_count)
    LOG.info('Main process: %s, run cpu count: %s' % (os.getpid(), worker_count))
    process = list()
    for key in self.keys:
        if not key:
            continue
        min_page, max_page, max_pagination, max_range = self.tyc_client.get_pagination(key)
        LOG.info('[%s][%s]spider page: %s ~ %s ||| max_pagination: %s ||| max range: %s'
                 % (RUN_MODE, key, min_page, max_page, max_pagination, max_range))
        if not max_range:
            LOG.error("It's not have max range")
            sys.exit()
        # Same coercion the sibling crawl methods apply, so the clamp below
        # compares ints even when pagination hands back a string.
        if not isinstance(max_pagination, int):
            max_pagination = int(max_pagination)
        for _ in range(0, max_range, 1):
            max_page = min_page + PAGINATION
            if max_page > max_pagination:
                max_page = max_pagination
            process.append(
                pool.apply_async(self.tyc_client.work_by_key,
                                 args=(key, min_page, max_page, self.q,
                                       self.citys_list, self.sub_citys_mapping))
            )
            min_page = max_page + 1
    pool.close()
    pool.join()

    # Collect what the workers queued. The former bare ``except: pass`` also
    # swallowed KeyboardInterrupt/SystemExit and would spin forever on a
    # persistently failing queue; now a failure is logged and draining stops,
    # so whatever was collected so far still gets stored.
    while True:
        try:
            if self.q.empty():
                break
            self.ret_res_list.append(self.q.get_nowait())
        except Exception as e:
            LOG.error('result queue drain stopped: %s' % e)
            break
    self.to_store(self.keys, MIN_PAGE, MAX_PAGE)
def detail_by_url(self, comp_url: str, obj_id: str) -> dict:
    """
    Scrape one company detail page and upsert what was found into MongoDB.

    The page is fetched with ``api_get`` and parsed twice (BeautifulSoup and
    lxml). Header/contact/summary fields and the business-registration table
    are collected into the returned dict; staff, shareholder, news, product,
    recruitment and ICP sections are upserted into their own collections,
    all keyed by ``c_id``.

    :param comp_url: detail page URL; an empty value returns ``{}`` at once
    :param obj_id: id stored as ``c_id`` on every document written
    :return: dict of scraped company fields (empty when the request fails)

    NOTE(review): the ``[0]`` lookups on xpath results raise IndexError when
    a section is present but the expected node is missing — confirm callers
    handle that.
    """
    print(self.count, comp_url, obj_id, '$' * 80)
    detail_res = dict()
    if not comp_url:
        return detail_res
    is_ok, search_resp = api_get(url=comp_url, headers=self.headers, data={}, resptype='text')
    if not is_ok:
        return detail_res
    soup = BeautifulSoup(search_resp, 'lxml')
    # header: company name on the detail page
    title_list = soup.find_all('div', class_="header")
    et2 = etree.HTML(search_resp)
    try:
        company_name = (title_list[0].find_all('h1', class_="name"))[0].get_text()
    except:
        # Fallback: read the name through an absolute xpath instead.
        name = et2.xpath('//*[@id="company_web_top"]/div[2]/div[3]/div[1]/h1/text()')
        company_name = ''.join(name)
    detail_res['company_name'] = company_name
    # phone: "more contact info" payload embedded in a <script> node
    origin_phone = et2.xpath(
        '//*[@id="company_web_top"]/div[2]/div[3]/div[3]/div[1]/div[1]/span[3]/script/text()'
    )
    # e-mail: "more e-mails" payload embedded in a <script> node
    origin_email = et2.xpath(
        '//*[@id="company_web_top"]/div[2]/div[3]/div[3]/div[1]/div[2]/span[3]/script/text()'
    )
    if origin_phone and origin_email:
        # NOTE(review): eval() executes text scraped from a remote page.
        # This is a code-execution risk; the payload looks like a JSON list,
        # so json.loads / ast.literal_eval should be considered.
        year_list = [i.get('showSource') for i in eval(origin_phone[0])]
        phone_item_vals = [i.get('phoneNumber') for i in eval(origin_phone[0])]
        email_list = eval(origin_email[0])
        contact_item = {}
        for contact in zip(year_list, phone_item_vals, email_list):
            contact_item['c_id'] = obj_id
            contact_item['company_name'] = detail_res.get('company_name', '')
            contact_item['report_year'] = contact[0]
            contact_item['phone'] = contact[1]
            contact_item['email'] = contact[-1]
            contact_item['date_time'] = datetime.now()
            # Every iteration upserts the same c_id document, so the last
            # contact triple is the one that remains stored.
            bixao_phone_emial.find_one_and_update({'c_id': obj_id}, {'$set': contact_item}, upsert=True)
    # detail: phone, e-mail, company website, address, summary
    detail_div = soup.find_all('div', class_="detail")

    def while_req(url):
        # Re-issue the page request with the current headers.
        sub_is_ok, sub_search_resp = api_get(url=url, headers=self.headers, data={}, resptype='text')
        return sub_is_ok, sub_search_resp

    # Manual-verification support: while the page has no "detail" block,
    # pause, refresh the Cookie header and fetch again. The loop has no
    # retry limit.
    if not detail_div:
        while 1:
            if is_ok and detail_div:
                break
            else:
                LOG.critical('验证############### %s ###############' % comp_url)
                random_sleep(20, 25)
                self.headers['Cookie'] = cookies_get()
                is_ok, search_resp = while_req(comp_url)
                soup = BeautifulSoup(search_resp, 'lxml')
                detail_div = soup.find_all('div', class_="detail")
    for div in detail_div[0].find_all('div'):
        if not div:
            continue
        # class "f0": phone (1st child) and e-mail (2nd child)
        if div.get('class') == ['f0']:
            for big_index, big_child in enumerate(div):
                if big_index == 0:
                    for index, child in enumerate(big_child.children):
                        if index == 1:
                            detail_res['phone'] = child.get_text().strip() or '-'
                            break
                elif big_index == 1:
                    for index, child in enumerate(big_child.children):
                        if index == 1:
                            detail_res['email'] = child.get_text().strip() or '-'
                            break
                else:
                    break
        # class "f0 clearfix": company website (1st child) and address (2nd child)
        elif div.get('class') == ['f0', 'clearfix']:
            for big_index, big_child in enumerate(div):
                if big_index == 0:
                    for index, child in enumerate(big_child.children):
                        if index == 1:
                            detail_res['company_url'] = child.get_text().strip() or '-'
                            break
                elif big_index == 1:
                    for index, child in enumerate(big_child.children):
                        if index == 1:
                            for small_index, small_child in enumerate(child.children):
                                if small_index == 0:
                                    detail_res['address'] = small_child.get_text().strip() or '-'
                                    break
                            break
                else:
                    break
        # class "summary": company summary text (first child only)
        elif div.get('class') == ['summary']:
            for big_index, big_child in enumerate(div):
                if big_index == 0:
                    resume = big_child.string
                    if resume:
                        resume = resume.strip()
                    detail_res['resume'] = resume or '-'
                    break
                else:
                    break
        else:
            continue
    # detail-list: the tabbed sections below the header
    detail_list_div = soup.find_all('div', class_="detail-list")
    if not detail_list_div:
        return detail_res
    detail_res['c_id'] = obj_id
    etc = etree.HTML(search_resp)
    for div in detail_list_div[0].find_all('div'):
        if not div:
            continue
        if div.get('tyc-event-ch') == 'CompangyDetail.gongshangxinxin':  # business registration info
            for index_1, child_1 in enumerate(div.find_all('div', recursive=False)):
                if index_1 == 1:
                    for index_1_1, child_1_1 in enumerate(child_1):
                        if index_1_1 == 2:
                            # Fields are picked by fixed row/column position
                            # in the registration table.
                            for index_tr, tr in enumerate(child_1_1.find_all('tr')):
                                if index_tr == 0:
                                    for index_td, td in enumerate(tr.find_all('td')):
                                        if index_td == 1:  # registered capital
                                            detail_res['register_funds'] = td.get_text().strip() or '-'
                                        elif index_td == 3:  # paid-in capital
                                            detail_res['paidin_funds'] = td.get_text().strip() or '-'
                                elif index_tr == 1:
                                    for index_td, td in enumerate(tr.find_all('td')):
                                        if index_td == 1:  # establishment date
                                            detail_res['establish_date'] = td.get_text().strip() or '-'
                                        elif index_td == 3:  # operating status
                                            detail_res['status'] = td.get_text().strip() or '-'
                                elif index_tr == 2:
                                    for index_td, td in enumerate(tr.find_all('td')):
                                        if index_td == 1:  # unified social credit code
                                            detail_res['credit_code'] = td.get_text().strip() or '-'
                                        elif index_td == 3:  # business registration number
                                            detail_res['registration_number'] = td.get_text().strip() or '-'
                                elif index_tr == 3:
                                    for index_td, td in enumerate(tr.find_all('td')):
                                        if index_td == 1:  # taxpayer identification number
                                            detail_res['identification_number'] = td.get_text().strip() or '-'
                                        elif index_td == 3:  # organization code
                                            detail_res['organization_code'] = td.get_text().strip() or '-'
                                elif index_tr == 4:
                                    for index_td, td in enumerate(tr.find_all('td')):
                                        if index_td == 1:  # company type
                                            detail_res['company_type'] = td.get_text().strip() or '-'
                                        elif index_td == 3:  # industry
                                            detail_res['industry'] = td.get_text().strip() or '-'
                                elif index_tr == 6:
                                    for index_td, td in enumerate(tr.find_all('td')):
                                        if index_td == 1:  # business term
                                            detail_res['business_term'] = td.get_text().strip() or '-'
                                        elif index_td == 3:  # taxpayer qualification
                                            detail_res['taxpayer_qualification'] = td.get_text().strip() or '-'
                                elif index_tr == 7:
                                    for index_td, td in enumerate(tr.find_all('td')):
                                        if index_td == 1:  # staff size
                                            detail_res['personnel_size'] = td.get_text().strip() or '-'
                                        elif index_td == 3:  # number of insured employees
                                            detail_res['insured_num'] = td.get_text().strip() or '-'
                                elif index_tr == 9:
                                    for index_td, td in enumerate(tr.find_all('td')):
                                        if index_td == 1:  # registered address
                                            detail_res['registered_address'] = td.get_text().strip() or '-'
                                elif index_tr == 10:
                                    for index_td, td in enumerate(tr.find_all('td')):
                                        if index_td == 1:  # business scope
                                            detail_res['business_scope'] = td.get_text().strip() or '-'
                            break
                    continue
        elif div.get('tyc-event-ch') == 'CompangyDetail.zhuyaorenyuan':  # key personnel
            people_item = {}
            people_item['c_id'] = obj_id
            people_item['company_name'] = detail_res.get('company_name', '')
            # person name (first table row only)
            people_item['name'] = etc.xpath(
                '//*[@id="_container_staff"]/div/table/tbody/tr[1]/td[2]/table/tbody/tr/td[2]/a/text()'
            )[0]
            # position
            people_item['position'] = etc.xpath(
                '//*[@id="_container_staff"]/div/table/tbody/tr[1]/td[3]/span/text()'
            )[0]
            bixiao_people.find_one_and_update({'c_id': obj_id}, {'$set': people_item}, upsert=True)
            print(people_item)
            # Log when any extracted field came back empty.
            for people_vals in people_item:
                if not people_item[people_vals]:
                    LOG.info(f'主要人员数据匹配异常:{people_item}, 请求地址:{comp_url}')
        elif div.get('tyc-event-ch') == 'CompangyDetail.gudongxinxi':  # shareholder info
            capital_item = {}
            capital_item['c_id'] = obj_id
            capital_item['company_name'] = detail_res.get('company_name', '')
            # shareholder name (first table row only)
            title = etc.xpath(
                '//*[@id="_container_holder"]/table/tbody/tr[1]/td[2]/table/tbody/tr/td[2]/a/text()'
            )
            # label
            label = etc.xpath(
                '//*[@id="_container_holder"]/table/tbody/tr[1]/td[2]/table/tbody/tr/td[2]/div/span/text()'
            )
            # shareholding ratio
            has_rates = etc.xpath(
                '//*[@id="_container_holder"]/table/tbody/tr[1]/td[3]/div/div/span/text()'
            )
            # subscribed capital contribution
            subscribed_capital = etc.xpath(
                '//*[@id="_container_holder"]/table/tbody/tr[1]/td[4]/div/span/text()'
            )
            capital_item['title'] = ''.join(title)
            capital_item['label'] = ''.join(label)
            capital_item['has_rates'] = ''.join(has_rates)
            capital_item['subscribed_capital'] = ''.join(subscribed_capital)
            bixiao_shareholder.find_one_and_update({'c_id': obj_id}, {'$set': capital_item}, upsert=True)
            print(capital_item, 'C' * 80)
        elif div.get('tyc-event-ch') == 'CompangyDetail.findNewsCount':  # news / public opinion
            news_item = {}
            news_item['c_id'] = obj_id
            news_item['company_name'] = detail_res.get('company_name', '')
            # headline
            news_item['title'] = etc.xpath(
                '//*[@id="_container_findNewsCount"]/div[1]/div[1]/div[1]/div[1]/a/text()'
            )[0]
            # article URL
            news_item['info_url'] = etc.xpath(
                '//*[@id="_container_findNewsCount"]/div[1]/div[1]/div[1]/div[1]/a/@href'
            )[0]
            # source
            news_item['source'] = etc.xpath(
                '//*[@id="_container_findNewsCount"]/div[1]/div[1]/div[1]/div[3]/span[1]/text()'
            )[0]
            # publication time
            news_item['date_doc'] = etc.xpath(
                '//*[@id="_container_findNewsCount"]/div[1]/div[1]/div[1]/div[3]/span[2]/text()'
            )[0]
            print(news_item)
            # NOTE(review): this is the only section using ``update`` instead
            # of ``find_one_and_update`` — confirm that is intentional.
            bixiao_news.update({'c_id': obj_id}, {'$set': news_item}, upsert=True)
            for news_vals in news_item:
                if not news_item[news_vals]:
                    LOG.info(f'新闻舆情数据匹配异常:{news_item}, 请求地址:{comp_url}')
        elif div.get('tyc-event-ch') == 'CompangyDetail.chanpin':  # product info
            product_item = {}
            product_item['c_id'] = obj_id
            product_item['company_name'] = detail_res.get('company_name', '')
            # product name (first table row only)
            product_item['name'] = etc.xpath(
                '//*[@id="_container_product"]/table/tbody/tr[1]/td[2]/table'
                '/tbody/tr/td[2]/span/text()')[0]
            # product short name
            product_item['short_name'] = etc.xpath(
                '//*[@id="_container_product"]/table/tbody/tr[1]/td[3]'
                '/span/text()')[0]
            # product category
            product_item['type'] = etc.xpath(
                '//*[@id="_container_product"]/table/tbody/tr[1]/td[4]/span'
                '/text()')[0]
            # domain
            product_item['domain'] = etc.xpath(
                '//*[@id="_container_product"]/table/tbody/tr[1]/td[5]'
                '/span/text()')[0]
            print(product_item)
            bixiao_product.find_one_and_update({'c_id': obj_id}, {'$set': product_item}, upsert=True)
            for product_vals in product_item:
                if not product_item[product_vals]:
                    LOG.info(f'产品信息数据匹配异常:{product_item}, 请求地址:{comp_url}')
        elif div.get('tyc-event-ch') == 'CompangyDetail.zhaopin':  # recruitment info
            recruit_item = {}
            recruit_item['c_id'] = obj_id
            recruit_item['company_name'] = detail_res.get('company_name', '')
            # Columns 2..7 of the first table row, in page order.
            recruit_item['opd_date'] = etc.xpath(
                '//*[@id="_container_baipin"]/table/tbody/tr[1]/td[2]'
                '/text()')[0]
            recruit_item['position_'] = etc.xpath(
                '//*[@id="_container_baipin"]/table/tbody/tr[1]/td[3]'
                '/text()')[0]
            recruit_item['month_salary'] = etc.xpath(
                '//*[@id="_container_baipin"]/table/tbody/tr[1]/td[4]'
                '/text()')[0]
            recruit_item['education'] = etc.xpath(
                '//*[@id="_container_baipin"]/table/tbody/tr[1]/td[5]'
                '/text()')[0]
            recruit_item['work_experience'] = etc.xpath(
                '//*[@id="_container_baipin"]/table/tbody/tr[1]/td[6]'
                '/text()')[0]
            recruit_item['address'] = etc.xpath(
                '//*[@id="_container_baipin"]/table/tbody/tr[1]/td[7]'
                '/text()')[0]
            print(recruit_item, 'P' * 80)
            bixiao_recruit.find_one_and_update({'c_id': obj_id}, {'$set': recruit_item}, upsert=True)
            for recruit_vals in recruit_item:
                if not recruit_item[recruit_vals]:
                    LOG.info(f'招聘信息数据匹配异常:{recruit_item}, 请求地址:{comp_url}')
        elif div.get('tyc-event-ch') == 'CompangyDetail.lishiwangzhanbeian':  # ICP filing
            record_item = {}
            record_item['c_id'] = obj_id
            record_item['company_name'] = detail_res.get('company_name', '')
            record_item['opd_date'] = etc.xpath(
                '//*[@id="_container_pastIcpList"]/table/tbody/tr/td[2]'
                '/span/text()')[0]
            record_item['web_name'] = etc.xpath(
                '//*[@id="_container_pastIcpList"]/table/tbody/tr/td[3]'
                '/span/text()')[0]
            record_item['index_url'] = etc.xpath(
                '//*[@id="_container_pastIcpList"]/table/tbody/tr/td[4]/div/'
                'a/@href')[0]
            record_item['domain_name'] = etc.xpath(
                '//*[@id="_container_pastIcpList"]/table/tbody/tr/td[5]'
                '/text()')[0]
            record_item['website_filing'] = etc.xpath(
                '//*[@id="_container_pastIcpList"]/table/tbody/tr/td[6]/'
                'span/text()')[0]
            print(record_item, 'M' * 80)
            bixiao_record_icp.find_one_and_update({'c_id': obj_id}, {'$set': record_item}, upsert=True)
            for record_vals in record_item:
                if not record_item[record_vals]:
                    LOG.info(f'ICP备案数据匹配异常:{record_item}, 请求地址:{comp_url}')
    print(detail_res, '%' * 80)
    # Persist the company-level fields collected above.
    bixiao_business.find_one_and_update({'c_id': obj_id}, {'$set': detail_res}, upsert=True)
    return detail_res
def add_or_edit_empl(self, args):
    """
    Add a new employee or edit an existing one from submitted form data.

    Every submitted key must be listed in ``self.request_add_attrs``. Keys
    in ``self.request_not_need_attrs`` are optional; every other key needs
    a non-empty value. ``is_add`` equal to ``'1'`` or ``1`` means "create";
    anything else means "edit the employee with this ``card_id``".

    :param args: form parameters (mapping); must contain ``is_add``
    :return: ``Status(...).json()`` — 202 illegal parameter, 203 missing
        value, 204 employee already exists, 100 added, 110 edited
    :raises KeyError: when ``is_add`` or an expected attribute other than
        ``china_name`` was not submitted
    """
    # Attributes copied onto the model unchanged.
    plain_attrs = (
        'english_name', 'email', 'phone', 'entry_date', 'sex', 'nation',
        'birth_date', 'political_status', 'nationality', 'residence_type',
        'education', 'marriage', 'card_type', 'card_id', 'card_deadline',
        'card_place', 'current_address', 'bank_type', 'bank_country',
        'bank_city', 'bank_id', 'bank_name',
    )

    new_args = dict()
    for k, v in args.items():
        if isinstance(k, unicode):
            k = k.encode('utf-8')
        if v and isinstance(v, unicode):
            v = v.encode('utf-8')
        if k not in self.request_add_attrs:
            return Status(202, 'failure', u'%s参数不合法' % k, {}).json()
        if k in self.request_not_need_attrs:
            new_args[k] = str(v)
            continue
        if k and not v:
            attr_name = self.employee_attrs_dict.get(k)
            return Status(203, 'failure', u'%s内容需要进行填写' % attr_name, {}).json()
        new_args[k] = v

    card_id = args.get('card_id')
    china_name = args.get('china_name')
    if isinstance(card_id, unicode):
        card_id = card_id.encode('utf-8')
    # One definition of "add mode" for the whole method. Before, some checks
    # accepted both '1' and 1 while others only accepted '1', so an integer
    # 1 was treated as "add" in one place and "edit" in the next.
    adding = new_args['is_add'] in ('1', 1)

    exist_empl_mode = self.employee_bo.get_empl_by_card_id(card_id)
    if adding and exist_empl_mode:
        return Status(204, 'failure', u'%s用户已存在,无需重新建立信息档案' % china_name, {}).json()

    # NOTE(review): in edit mode ``exist_empl_mode`` may be falsy (unknown
    # card_id); the attribute writes below would then fail — confirm how
    # callers want that reported.
    empl_mode = self.employee_bo.new_mode() if adding else exist_empl_mode

    # submit
    for attr in self.request_add_attrs:
        if not attr:
            continue
        if attr == 'china_name':
            empl_mode.china_name = new_args.get(attr)
        elif attr in plain_attrs:
            setattr(empl_mode, attr, new_args[attr])
        elif attr == 'status':
            empl_mode.status = new_args[attr] or '1'

    # record who submitted / last touched the entry
    if adding:
        empl_mode.entry_submit_rtx = get_user_id()
        empl_mode.entry_submit_time = get_now()
        self.employee_bo.add_model(empl_mode)
        LOG.info("%s add employee is success" % card_id)
        return Status(100, 'success', u'新增%s成功' % china_name, {}).json()

    empl_mode.last_update_rtx = get_user_id()
    empl_mode.last_update_time = get_now()
    self.employee_bo.merge_model(empl_mode)
    LOG.info("%s edit employee is success" % card_id)
    return Status(110, 'success', u'%s信息编辑成功' % china_name, {}).json()
def get_all(self, args):
    """
    List employees for the table view.

    Request keys must all be in ``self.request_attrs``. ``start`` is
    converted to int and also seeds the running row number; ``search`` is
    wrapped in ``%`` wildcards; other keys are passed through unchanged.

    :param args: request parameters (mapping)
    :return: ``Status(...).json()`` — 202 illegal parameter, 101 empty
        result, 100 success with ``totalCount`` and ``datalist``
    """
    data = dict()
    query = dict()
    start = 0
    for k, v in args.items():
        if k not in self.request_attrs:
            return Status(202, 'failure', u'%s参数不合法' % k, data).json()
        if k == 'start':
            start = int(v)
            query[k] = start
        elif k == 'search':
            if isinstance(v, unicode):
                v = v.encode('utf-8')
            query[k] = "%" + str(v) + "%"
        else:
            query[k] = v

    # status: employment state, 1 = employed, 2 = left
    all_empls, count = self.employee_bo.get_all(query, status=1)
    data = dict()
    LOG.info('employee>api_list: %s' % count)
    if not all_empls:
        data['totalCount'] = 0
        data['datalist'] = []
        return Status(101, 'failure', u'成功,但数据为空', data).json()

    copied_attrs = ('china_name', 'english_name', 'email', 'phone', 'card_id')
    enum_attrs = ('nationality', 'sex')
    date_attrs = ('birth_date', 'entry_date')

    results = list()
    for empl in all_empls:
        if not empl:
            continue
        row = dict()
        for attr in self.employee_show_attrs:
            if attr == 'id':
                # Running row number, not the database id.
                row[attr] = start + 1
            elif attr in copied_attrs:
                row[attr] = getattr(empl, attr)
            elif attr in enum_attrs:
                raw = getattr(empl, attr)
                if raw:
                    # Translate the stored enum id into its display name.
                    row[attr] = self.enums_bo.get_enumname_by_params(
                        {'enum_type': attr, 'enum_subid': raw})
                else:
                    row[attr] = raw
            elif attr in date_attrs:
                raw = getattr(empl, attr)
                row[attr] = d2s(raw, fmt="%Y-%m-%d") if raw else ''
        start += 1
        results.append(row)

    data['totalCount'] = count
    data['datalist'] = results
    return Status(100, 'success', u'成功', data).json()
def login_out():
    """Log the current user out: clear the session and redirect to ``manage.index``."""
    current = get_user_id()
    if current:
        LOG.info('%s login out ==========' % current)
    session.clear()
    target = url_for('manage.index')
    return redirect(target)
def work_by_key(self, key, min_page, max_page, type='default', queue=None, cid=None, sub_cid=None, city_info=None, sub_city_info=None):
    """
    Crawl search-result pages ``min_page..max_page`` (inclusive) for a keyword.

    For every company link found on a page, the detail page is scraped and
    merged into one result dict, which is appended to the return list and,
    when ``queue`` is given, also put on the queue.

    :param key: search keyword; empty returns ``[]``
    :param min_page: first page to fetch
    :param max_page: last page to fetch (inclusive)
    :param type: 'default', 'city' or 'sub_city' — selects the URL filter
    :param queue: optional queue receiving every result dict
    :param cid: city id, compared against ``ZXS_CITY_IDS`` for 'sub_city'
    :param sub_cid: not used in this function
    :param city_info: city mapping (``name``, ``full_name``)
    :param sub_city_info: sub-city mapping (``name``, ``code``, ``full_name``)
    :return: list of result dicts
    """
    ret_res = list()
    if not key:
        LOG.error("【%s】key is null, no work." % RUN_MODE)
        return ret_res

    link_filter = {"tyc-event-ch": "CompanySearch.Company"}

    def company_links(html):
        return BeautifulSoup(html, 'lxml').find_all('a', attrs=link_filter)

    # page
    for page in range(min_page, max_page + 1, 1):
        if API_MODE not in ('tyc', 'pro') or type not in ('default', 'city', 'sub_city'):
            LOG.critical('====== API_MODE is not in [tyc, pro] ======')
            sys.exit(1)

        search_root = TYC_SEARCH_API if API_MODE == 'tyc' else TYC_PRO_SEARCH_API
        base_url = '%s/p%s?key=%s' % (search_root, page, parse.quote(key))
        if type == 'default':
            url = base_url
        elif type == 'city':
            url = '%s&base=%s' % (base_url, city_info.get('name'))
        elif cid not in ZXS_CITY_IDS:
            url = '%s&base=%s' % (base_url, sub_city_info.get('name'))
        elif API_MODE == 'tyc':
            url = '%s&base=%s&areaCode=%s' % (
                base_url, sub_city_info.get('name'), sub_city_info.get('code'))
        else:
            url = '%s&base=%s&areaCode=%s&baseArea=%s' % (
                base_url, city_info.get('name'), sub_city_info.get('code'),
                parse.quote(sub_city_info.get('name')))

        LOG.info('%s[%s]%s' % (key, API_MODE, url))
        self.headers['Referer'] = url
        is_ok, search_resp = api_get(url=url, headers=self.headers, data={},
                                     resptype='text')
        if not is_ok:
            continue
        if self.check_no(url, _type='page'):
            continue

        tags = company_links(search_resp)
        # Manual-verification support: keep retrying (with a pause) until
        # the page actually lists companies.
        while not (is_ok and len(tags) > 0):
            LOG.critical('验证############### %s ###############' % url)
            random_sleep(20, 25)
            is_ok, search_resp = api_get(url=url, headers=self.headers,
                                         data={}, resptype='text')
            tags = company_links(search_resp)

        for tag in tags:
            if not tag or not tag.attrs.get('href'):
                continue
            href = tag.get('href').strip()
            if API_MODE == 'tyc':
                tyc_url = href
            elif API_MODE == 'pro':
                tyc_url = '%s%s/background' % (TYC_PRO_DETAIL_API, href)
            else:
                tyc_url = ''

            res_dict = {
                'tyc_url': tyc_url,
                'name': tag.get_text().strip(),
                'key': key,
                'is_send_email': False,
                'city': city_info.get('full_name') if city_info else '-',
                'sub_city': sub_city_info.get('full_name') if sub_city_info else '-',
            }

            detail_res = list()
            if API_MODE == 'tyc':
                detail_res = self.detail_by_url(tyc_url)
            elif API_MODE == 'pro':
                detail_res = self.detail_pro_by_url(tyc_url)
            res_dict.update(detail_res)

            print('%s[%s] %s' % (res_dict['name'], str(bool(res_dict)),
                                 res_dict['tyc_url']))
            ret_res.append(res_dict)
            if queue:
                queue.put(res_dict)
            random_sleep(3.5, 4.5)
            if IS_TEST_BREAK:
                break
        if IS_TEST_BREAK:
            break
    return ret_res
def init_run(self):
    """Register the view blueprints and log that the web server is running."""
    LOG.debug('Server is initializing......')
    register_all = self._autoinit_register_blueprint
    register_all()
    LOG.info('Web server is running......')
def __init__(self, app):
    """
    Initialize the WebFlaskServer instance and configure the Flask app.

    Sets template/static folders and the secret key, then installs the
    request hooks, error handlers, template context processor and the
    favicon route. Exits the process when ``app`` is falsy.

    :param app: the Flask application object to configure
    """
    self.app = app
    if not self.app:
        LOG.info('Web server initialize is failure......')
        sys.exit(1)

    _realpath = os.path.dirname(os.path.realpath(__file__))
    self.app.template_folder = _realpath + '/templates/'
    self.app.secret_key = SECRET_KEY or 'python'
    self.app.static_folder = _realpath + '/static'
    self.app.static_url_path = '/static'
    self.app.add_url_rule(self.app.static_url_path + '/<path:filename>',
                          endpoint='static',
                          view_func=self.app.send_static_file)

    super(WebFlaskServer, self).__init__()

    @self.app.before_request
    def before_request():
        # Logged-in users pass straight through.
        if get_user_id():
            return
        # api: rest apis
        # manage: login apis
        if request.blueprint in ['api', 'manage', None]:
            return
        # special api for blueprints
        if request.endpoint.endswith(('ForApi', 'for_api')):
            return
        # Everything else requires a login.
        return redirect(url_for('manage.index'))

    @self.app.before_first_request
    def before_first_request():
        g._session = get_session()

    @self.app.errorhandler(404)
    def not_found_error(error):
        LOG.error("%s is not found 404" % request.url)
        return render_template('errors/404.html', ), 404

    @self.app.errorhandler(500)
    def server_error(error):
        LOG.error("%s is server error 500" % request.url)
        return render_template('errors/500.html'), 500

    @self.app.context_processor
    def default_context_processor():
        user_id = session.get('user_id')
        # Two separate dicts. The former ``menu = current_user = dict()``
        # bound both names to ONE dict, so the menu keys leaked into
        # ``current_user`` and anonymous visitors got a truthy user.
        menu = dict()
        current_user = dict()
        if hasattr(g, 'menuf'):
            menu['f'] = g.menuf or 'index'
        if hasattr(g, 'menusub'):
            menu['sub'] = g.menusub or 'index'
        if user_id:
            current_user = SysUserService().get_user_by_params(user_id)
        return {
            'current_user': current_user,
            'sysversion': VERSION,
            'menu': menu
        }

    # set favicon
    @self.app.route('/favicon.ico')
    def get_defaule_favicon():
        return self.app.send_static_file('images/favicon.ico')
def work_by_key(self, key, min_page=0, max_page=5, queue=None):
    """
    Crawl search-result pages for a keyword and scrape each listed company.

    Pages ``min_page`` up to, but not including, ``max_page`` are fetched.
    A falsy ``min_page`` / ``max_page`` is replaced by ``self.MIN_PAGE`` /
    ``self.MAX_PAGE``.

    :param key: search keyword; empty returns ``[]``
    :param min_page: first page to fetch
    :param max_page: page at which to stop (exclusive)
    :param queue: optional queue receiving every result dict
    :return: list of result dicts
    """
    ret_res = list()
    if not key:
        LOG.error("【%s】key is null, no work." % RUN_MODE)
        return ret_res
    min_page = min_page or self.MIN_PAGE
    max_page = max_page or self.MAX_PAGE
    LOG.info('%s[%s ~ %s]' % (key, min_page, max_page))

    link_filter = {"tyc-event-ch": "CompanySearch.Company"}

    def company_links(html):
        return BeautifulSoup(html, 'lxml').find_all('a', attrs=link_filter)

    # page
    for page in range(min_page, max_page, 1):
        if API_MODE == 'tyc':
            search_root = TYC_SEARCH_API
        elif API_MODE == 'pro':
            search_root = TYC_PRO_SEARCH_API
        else:
            LOG.critical('====== API_MODE is not in [tyc, pro] ======')
            sys.exit(1)
        url = '%s/p%s?key=%s' % (search_root, page, parse.quote(key))
        LOG.info('%s[%s]%s' % (key, API_MODE, url))

        is_ok, search_resp = api_get(url=url, headers=self.headers, data={},
                                     resptype='text')
        if not is_ok:
            continue

        tags = company_links(search_resp)
        # Manual-verification support: keep retrying (with a pause) until
        # the page actually lists companies.
        while not (is_ok and len(tags) > 0):
            LOG.critical('验证############### %s ###############' % url)
            random_sleep(20, 25)
            is_ok, search_resp = api_get(url=url, headers=self.headers,
                                         data={}, resptype='text')
            tags = company_links(search_resp)

        for tag in tags:
            if not tag or not tag.attrs.get('href'):
                continue
            href = tag.get('href').strip()
            if API_MODE == 'tyc':
                tyc_url = href
            elif API_MODE == 'pro':
                tyc_url = '%s%s/background' % (TYC_PRO_DETAIL_API, href)
            else:
                tyc_url = ''

            res_dict = {
                'tyc_url': tyc_url,
                'name': tag.get_text().strip(),
                'key': key,
            }

            detail_res = list()
            if API_MODE == 'tyc':
                detail_res = self.detail_by_url(tyc_url)
            elif API_MODE == 'pro':
                detail_res = self.detail_pro_by_url(tyc_url)
            res_dict.update(detail_res)

            print('%s[%s] %s' % (res_dict['name'], str(bool(res_dict)),
                                 res_dict['tyc_url']))
            ret_res.append(res_dict)
            if queue:
                queue.put(res_dict)
            random_sleep(3.2, 4.5)
            if IS_TEST_BREAK:
                break
        if IS_TEST_BREAK:
            break
    return ret_res
def start():
    """Entry point: run the spider once, logging before and after."""
    banner = (NAME, IS_TEST_BREAK)
    LOG.info('%s run start [IS TEST RUN: %s]......' % banner)
    spider = SpiderTYCClass()
    spider.init_run()
    LOG.info('%s run end [IS TEST RUN: %s]......' % (NAME, IS_TEST_BREAK))
def _print_info(self, message):
    """Log ``message`` at info level, framed by 20 ``=`` characters on each side."""
    rule = '=' * 20
    LOG.info(rule + message + rule)