# -*- coding: utf-8 -*-
# Imports for the code below. Project-level names (RequestClass, proxies,
# single_reids / single_redis, USER_AGENTS, logger, decode_dict_date, RClient,
# verify_url, data_id_url, data_id_heders, submit_headers) are defined
# elsewhere in this repo.
import base64
import os
import random
import re
import time
import urllib
from io import BytesIO

import numpy as np
import requests
from bs4 import BeautifulSoup
from fontTools import ttLib
from lxml import etree
from PIL import Image

try:
    from urllib import parse  # used by the Python 3 variant at the bottom
except ImportError:  # Python 2
    parse = None


class ReqCompany(object):
    def __init__(self, type):
        self.headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Content-Type": "application/json; charset=UTF-8",
            "X-Requested-With": "XMLHttpRequest",
            "User-Agent": random.choice(USER_AGENTS),
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Host": "www.tianyancha.com"  # fixed: key had a trailing space
        }
        self.redisClient = single_reids
        self.tyc = RequestClass(proxies=proxies)
        self.username = ''  # referenced in logs below; set here so it always exists
        self.mark_count = self.login()
        self.result_dicts = {}
        self.type = type

    def login(self):
        self.cookie = self.tyc.login()

    def get_dict(self, font_url):
        # Tianyancha obfuscates digits with a per-page woff font; map each
        # displayed character (glyph order) to its real value (glyph name).
        logger.error('font dict url={}'.format(font_url))
        dicts = {}
        mm = self.tyc.session.get(
            'https://static.tianyancha.com/fonts-styles/fonts/{}/tyc-num.woff'.format(font_url),
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                              '(KHTML, like Gecko) Chrome/69.0.3486.0 Safari/537.36'
            },
            proxies=proxies)
        tt = ttLib.TTFont(BytesIO(mm.content))
        names = tt.getGlyphNames()[:10]
        orders = tt.getGlyphOrder()[2:12]
        for i in range(len(names)):
            dicts[orders[i]] = names[i]
        return dicts

    def get_company_list(self, company_name):
        key = company_name
        # Fetch the search results page.
        logger.error("Search Company %s" % key)
        url = "https://www.tianyancha.com/search?key=%s&checkFrom=searchBox" % urllib.quote(key)
        try:
            con = self.tyc.request_url(url=url, headers=self.headers)
            # Check whether the account has been blocked. Test for a missing
            # response first, so we never dereference None.
            if con is None:
                raise requests.exceptions.ProxyError
            if con.status_code != 403:
                if (u"请输入您的手机号码" in con.text          # "please enter your phone number"
                        or u"我们只是确认一下你不是机器人" in con.text  # robot check page
                        or con.status_code != 200):
                    logger.error("userName: {} forbid ".format(self.username))
                    single_reids.server.lpush('forbids', self.cookie)
                    self.login()
                    raise requests.exceptions.ProxyError
                try:
                    # Parse the result list and return the company details.
                    self.parse_company_list(company_name, con.text)
                    logger.error(self.result_dicts)
                    return self.result_dicts
                except Exception as e:
                    logger.error("Exception Logged {}".format(e))
            else:
                logger.error("IP is forbid!")
                raise requests.exceptions.ProxyError
        except requests.exceptions.ProxyError as error:
            logger.error("Proxy Error! {}".format(error))
            return self.get_company_list(company_name)  # return so retries propagate the result
        except Exception as e:
            logger.error(e)
            return self.get_company_list(company_name)

    def parse_company_list(self, company_name, text=''):
        """
        If the first result's name matches the search key exactly, take it;
        otherwise scan the first page for an exact match, falling back to the
        first row. Follows the matched company's detail link.
        :param company_name: search keyword
        :param text: page HTML
        :return: ent_name, url
        """
        logger.error("parse_company_list() {}".format(company_name))
        key = company_name
        soup = BeautifulSoup(text, 'lxml')
        etree_xpath = etree.HTML(text)
        result_count = etree_xpath.xpath('//span[@class="tips-num"]/text()')
        div_first = soup.find('div', class_="search-item sv-search-company")
        div_none = soup.find('div', class_="result-list no-result")
        if div_first:
            company_divs = soup.find_all('div', class_="search-result-single")
            ent_name = company_divs[0].img['alt']
            ent_name = ent_name.replace('<em>', '').replace('</em>', '').strip()
            if ent_name:
                ent_name = ent_name.encode('utf-8')  # Python 2: compare bytes with key
            if self.type == 0 and key != ent_name:
                for company_div in company_divs:
                    ent_name = company_div.img['alt']
                    # fixed: encode here too so the comparison with key can succeed
                    ent_name = ent_name.replace('<em>', '').replace('</em>', '').strip().encode('utf-8')
                    if key == ent_name:
                        div_first = company_div
                        break
            try:
                header_div = div_first.find('div', class_='header')
                url = header_div.find('a', class_="name")["href"].strip()
                self.get_detail(url, ent_name)
            except Exception as e:
                logger.error("Exception Logged: {}".format(e))
        else:
            logger.error(soup.prettify())
            logger.error("can't find this company--{}".format(key))
            return 'None'

    def get_detail(self, url, key):
        """
        Fetch the company detail page and parse it.
        :param url: company detail url
        :param key: company name
        """
        logger.error('get_detail() {}'.format(key))
        try:
            con = self.tyc.request_url(url=url, headers=self.headers)
            if con is None:
                raise requests.exceptions.ProxyError
            if con.status_code != 403:
                if (u"请输入您的手机号码" in con.text
                        or u"我们只是确认一下你不是机器人" in con.text
                        or con.status_code != 200):
                    single_reids.server.lpush('forbids', self.cookie)
                    self.login()
                    raise requests.exceptions.ProxyError
                self.parse_detail(con.text, key)
            else:
                logger.error("IP is forbid!")
                raise requests.exceptions.ProxyError
        except requests.exceptions.ProxyError as error:
            logger.error("Proxy Error! %s", error)
            self.get_detail(url, key)  # fixed: arguments were swapped
        except Exception as e:
            logger.error("Login false!")
            logger.error("Exception Logged {}".format(e))
            self.get_detail(url, key)  # fixed: arguments were swapped

    def parse_detail(self, text='', key=''):
        # The per-page font used to de-obfuscate digits is referenced from a
        # stylesheet link; recover its id and build the glyph dictionary.
        re_fonts = re.compile(
            r'<link rel="stylesheet" href="https://static.tianyancha.com/fonts-styles/css/(.*?)/font.css">'
        )
        font_uri = re_fonts.search(text).group(1)
        dicts = self.get_dict(font_uri)
        logger.error("Parse detail info (basic info)")
        selectors = etree.HTML(text)
        top = selectors.xpath('//div[@id="company_web_top"]')
        baseinfo = {}
        table_lists = selectors.xpath('//div[contains(@id,"_container_baseInfo")]//table')
        if top:
            top = top[0]
            email = top.xpath('.//span[@class="email"]/text()')
            email = email[0] if email else '-'
            detail_basic = top.xpath(
                './/div[@class="box -company-box "]//div[@class="content"]//div[contains(@class,"detail")]'
            )
            if detail_basic:
                detail_basic = detail_basic[0]
                telephone = detail_basic.xpath('./div[position()=1]/div[position()=1]/span[2]/text()')[0]
                urls = detail_basic.xpath('./div[position()=2]//a[@class="company-link"][position()=1]/text()')
                url = urls[0] if urls else 'NA'
        if table_lists:
            trs0 = table_lists[0].xpath('./tbody//tr')
            trs1 = table_lists[1].xpath('./tbody//tr')
            if trs1:  # fixed: the guard checked trs0 but every lookup uses trs1
                registerFund = trs1[0].xpath('./td[2]//@title')[0] or 'NA'
                companyStatus = trs1[1].xpath('./td[2]//text()')[0]
                registerNum = trs1[1].xpath('./td[4]//text()')[0]
                tissueNum = trs1[2].xpath('./td[position()=4]/text()')[0]
                creditNum = trs1[2].xpath('./td[position()=2]/text()')[0]
                companyType = trs1[3].xpath('./td[position()=4]/text()')[0]
                taxpayerNum = trs1[3].xpath('./td[position()=2]/text()')[0]
                industry = trs1[4].xpath('./td[position()=4]/text()')[0]
                businessTerm = trs1[4].xpath('./td[position()=2]/span/text()')[0]
                registerDate = trs1[0].xpath('./td[position()=4]//text()')[0] or ''
                checkDate = trs1[5].xpath('./td[position()=4]//text()')[0]
                regDate = ''.join(registerDate.strip('-'))
                checDate = ''.join(checkDate.strip('-'))
                if dicts:
                    # Dates outside a sane range are font-obfuscated; decode them.
                    if regDate > '2019' or regDate < '1950':
                        registerDate = decode_dict_date(registerDate, dicts)
                    if checDate > '2019' or checDate < '1950':
                        checkDate = decode_dict_date(checkDate, dicts)
                registerOffice = trs1[7].xpath('./td[position()=4]//text()')[0]
                englishName = trs1[8].xpath('./td[position()=4]//text()')[0].replace("'", '"')
                registerSite = trs1[8].xpath('./td[position()=2]//text()')[0]
                businessScope = trs1[9].xpath('./td[position()=2]/span//text()')
                businessScope = businessScope[0].replace("'", '') if businessScope else '-'
                # Taxpayer qualification
                taxQualificate = trs1[5].xpath('./td[position()=2]//text()')
                taxQualificate = taxQualificate[0] if taxQualificate else '-'
                # Staff size
                persionSize = trs1[6].xpath('./td[position()=4]//text()')[0]
                # Paid-in capital
                paidCapital = trs1[6].xpath('./td[position()=2]//text()')[0]
                # Number of insured employees
                insuredPersion = trs1[7].xpath('./td[position()=2]//text()')[0]
                entName = key
                # Registered capital is displayed in units of 万 (10,000).
                baseinfo['REGISTERFUND'] = registerFund.split(u'万')[0] + '0000'
                if u'人民币' in registerFund:
                    baseinfo['CURRENCY'] = 'CNY'
                elif u'美元' in registerFund:
                    baseinfo['CURRENCY'] = 'USD'
                baseinfo['CREDITNUM'] = creditNum
                baseinfo['COMPANYTYPE'] = companyType
                baseinfo['LOCALTAXPAYERNUM'] = taxQualificate
                baseinfo['COUNTRYTAXPAYERNUM'] = taxQualificate
                baseinfo['INDUSTRY'] = industry
                baseinfo['STARTDATE'] = businessTerm[:10]
                baseinfo['VALIDATEDATE'] = businessTerm[-10:]
                baseinfo['REGISTERDATE'] = decode_dict_date(registerDate, dicts)
                baseinfo['REGISTERSITE'] = registerSite
                baseinfo['ADDRESS'] = baseinfo['REGISTERSITE']
                baseinfo['BUSINESSSCOPE'] = businessScope
                baseinfo['ENTNAME'] = entName
                self.result_dicts = baseinfo
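# decode_dict_date is imported from elsewhere in this project and is not
# defined in this file. A minimal sketch of what it plausibly does, assuming
# `dicts` maps each obfuscated character (a glyph-order name from the custom
# tyc-num.woff) to its real digit (the glyph name), as built by get_dict():
def _decode_dict_date_sketch(date_str, dicts):
    # Swap every character found in the font mapping; leave separators such
    # as '-' untouched. Hypothetical helper, for illustration only.
    return ''.join(dicts.get(ch, ch) for ch in date_str)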
class ReqCompany(object):
    # Captcha-recovery variant: probes whether the account is blocked,
    # stitches the click-captcha images together, sends them to the RuoKuai
    # coding service and submits the returned click coordinates.
    def __init__(self):
        self.headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Content-Type": "application/json; charset=UTF-8",
            "X-Requested-With": "XMLHttpRequest",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/69.0.3486.0 Safari/537.36",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Host": "www.tianyancha.com"  # fixed: key had a trailing space
        }
        self.tyc = RequestClass(proxies=proxies)
        self.username = ''
        self.mark_count = self.login()
        self.output_img = ''

    def login(self):
        self.username, self.cookie = self.tyc.login()
        return 0

    def check(self):
        # Probe with a known company; True means the account is blocked.
        url = "https://www.tianyancha.com/search?key=%s&checkFrom=searchBox" % urllib.quote(
            '北京小度信息科技有限公司')
        try:
            con = self.tyc.session.get(url=url, headers=self.headers, proxies=proxies)
            if con is not None and con.status_code != 403:
                if (u"请输入您的手机号码" in con.text
                        or u"我们只是确认一下你不是机器人" in con.text
                        or con.status_code != 200):
                    print("userName: {} forbid ".format(self.username))
                    return True
        except requests.exceptions.ProxyError as error:
            print(error)
        except Exception as e:
            print(e)
            self.check()
            time.sleep(1)
        return False

    def verify_code(self):
        # Ask the RuoKuai coding service for the click coordinates.
        rc = RClient('feiniubuke', 'feiniubuke123', '1',
                     'b40ffbee5c1cf4e38028c197eb2fc751')
        im = open(self.output_img, 'rb').read()
        result_tuple = rc.rk_create(im, 6900)
        return result_tuple

    def get_company_list(self, company_name):
        key = company_name
        startTime = int(time.time() * 1000)
        try:
            if self.check():
                url = verify_url
                data_con = self.tyc.session.get(url=data_id_url,
                                                headers=data_id_heders,
                                                proxies=proxies)
                data_id = data_con.json()['data']['id']
                targetImage_base64 = data_con.json()['data']['targetImage']
                bgImage = data_con.json()['data']['bgImage']
                targetImage = base64.b64decode(targetImage_base64)
                bgImage = base64.b64decode(bgImage)
                baseimg = Image.open(BytesIO(bgImage))
                sz = baseimg.size
                basemat = np.atleast_2d(baseimg)
                im = Image.open(BytesIO(targetImage))
                # Resize the target strip to the background's width, then
                # stack the two images vertically into one picture.
                sz2 = im.size
                if sz2 != sz:
                    im = im.resize((sz[0], round(sz2[0] / sz[0] * sz2[1])),
                                   Image.ANTIALIAS)
                mat = np.atleast_2d(im)
                basemat = np.append(basemat, mat, axis=0)
                self.output_img = 'merge{}.png'.format(str(os.getpid()) + '-')
                print(self.output_img)
                Image.fromarray(basemat).convert('L').save(self.output_img)
                print('Captcha image saved...')
                results = self.verify_code()
                print('Captcha coordinates received...')
                # e.g. results['Result'] == u'24,49.258,83'
                print(results)
                results_json = str(results['Result'])
                results = results_json.split('.')
                # Build a percent-encoded JSON array of click points
                # ('%22' is an encoded double quote).
                list_str = '['
                for i in results:
                    list_str += '{%22x%22' + ':' + str(i.split(',')[0])
                    list_str += ',%22y%22' + ':' + str(i.split(',')[1]) + "},"
                list_str = list_str[:-1] + ']'
                endTime = int(time.time() * 1000)
                submit_url = ('https://antirobot.tianyancha.com/captcha/checkCaptcha.json'
                              '?captchaId={dataId}&clickLocs={lists}&t={starTime}&_={endTime}')
                submit_url = submit_url.format(dataId=data_id, lists=list_str,
                                               starTime=endTime, endTime=startTime)
                con = self.tyc.session.get(url=submit_url, headers=submit_headers,
                                           proxies=proxies)
                print(con.text)
                Success = con.json().get('state')  # state=ok means success; state=fail means failure
                print("^" * 40)
                print(Success)
                if Success == 'fail':
                    self.get_company_list('北京当当网信息技术有限公司')
                else:
                    if not self.check():
                        # Keep the solved image for later inspection, refresh
                        # the cookie pool and retry.
                        new_name = (self.output_img.split('.')[0]
                                    + results_json.replace('.', '_') + '.png')
                        os.rename(self.output_img, new_name)
                        self.cookie = self.tyc.get_cookies()
                        single_reids.server.hdel('cookies', str(self.username))
                        single_reids.put_cookies(self.username, str(self.cookie))
                        single_reids.server.hdel('forbids', str(self.username))
                        self.login()
                        self.get_company_list('北京当当网信息技术有限公司')
                    else:
                        print('Account not blocked')
                        self.cookie = self.tyc.get_cookies()
                        single_reids.server.hdel('cookies', str(self.username))
                        single_reids.put_cookies(self.username, self.cookie)
                        single_reids.server.hdel('forbids', str(self.username))
                        self.login()
                        self.get_company_list('北京当当网信息技术有限公司')
        except Exception as e:
            print(e)
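# The hand-rolled list_str loop above builds a percent-encoded JSON array of
# click points ('%22' is an encoded '"'). Assuming the antirobot endpoint
# accepts exactly that string, the same payload can be produced with json and
# urllib; build_click_locs is a hypothetical helper, not part of this project.
# (Python 3 shown; on Python 2 use urllib.quote instead of parse.quote.)
import json

def build_click_locs(result_str):
    # result_str is the solver output, e.g. '24,49.258,83': '.' separates
    # points, ',' separates x from y.
    points = [p.split(',') for p in result_str.split('.')]
    payload = [{"x": int(x), "y": int(y)} for x, y in points]
    # Keep JSON punctuation literal and percent-encode only the quotes,
    # which reproduces the '%22'-style string built above.
    return parse.quote(json.dumps(payload, separators=(',', ':')),
                       safe='[]{}:,')

# build_click_locs('24,49.258,83')
#   -> '[{%22x%22:24,%22y%22:49},{%22x%22:258,%22y%22:83}]'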
class ReqCompany(object):
    # Variant that centralises session health checks in check_login() and
    # extracts the contact phone from the detail page.
    def __init__(self, type):
        self.headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Content-Type": "application/json; charset=UTF-8",
            "X-Requested-With": "XMLHttpRequest",
            "User-Agent": random.choice(USER_AGENTS),
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Host": "www.tianyancha.com"  # fixed: key had a trailing space
        }
        self.redisClient = single_redis
        self.tyc = RequestClass(proxies=proxies)
        self.mark_count = self.login()
        self.result_dicts = {}
        self.type = type
        self.phone = ''

    def check_login(self, con):
        # Classify the response: True means usable, False triggers a re-login.
        if con is None:
            # fixed: the original tested `not con` only after dereferencing
            # con, so this case could never be reached. Treat a missing
            # response like an invalid cookie.
            print("userName: {} cookie invalid!! (no response)".format(self.username))
            single_redis.server.hdel('cookies', self.tyc.username)
            single_redis.server.lpush('users', self.tyc.username)
            self.login()
            return False
        if con.status_code == 200 and 'antirobot' not in con.url:
            return True
        elif con.status_code == 404:
            print("userName: {} forbid status_code={}".format(
                self.username, con.status_code))
            return True
        elif (u"请输入您的手机号码" in con.text or con.status_code == 401
              or '密码登录' in con.text):
            print("userName: {} cookie expired!! status_code={}".format(
                self.username, con.status_code))
            single_redis.server.hdel('cookies', self.tyc.username)
            single_redis.server.lpush('users', self.tyc.username)
            self.login()
            return False
        elif u"我们只是确认一下你不是机器人" in con.text or 'antirobot' in con.url:
            print("userName: {} forbid status_code={}".format(
                self.username, con.status_code))
            single_redis.put_cookies(phone=self.tyc.username, cookie=self.cookie,
                                     name='forbids')
            self.login()
            return False

    def login(self):
        self.username, self.cookie = self.tyc.login()

    def get_company_list(self, company_name):
        self.login()
        key = company_name
        # Fetch the search results page.
        url = "https://www.tianyancha.com/search?key=%s&checkFrom=searchBox" % parse.quote(key)
        try:
            con = self.tyc.request_url(url=url, headers=self.headers)
            # Retry until the session is usable.
            if self.check_login(con):
                try:
                    # Parse the result list and return the company details.
                    self.parse_company_list(company_name, con.text)
                    return self.result_dicts
                except Exception:
                    return self.get_company_list(company_name)  # return so retries propagate the result
            else:
                return self.get_company_list(company_name)
        except requests.exceptions.ProxyError:
            return self.get_company_list(company_name)
        except Exception:
            return self.get_company_list(company_name)

    def parse_company_list(self, company_name, text=''):
        """
        If the first result's name matches the search key exactly, take it;
        otherwise scan the first page for an exact match, falling back to the
        first row. Follows the matched company's detail link.
        :param company_name: search keyword
        :param text: page HTML
        :return: ent_name, url
        """
        key = company_name
        soup = BeautifulSoup(text, 'lxml')
        etree_xpath = etree.HTML(text)
        div_first = soup.find('div', class_="search-item sv-search-company")
        if div_first:
            company_divs = soup.find_all('div', class_="search-result-single")
            ent_name = company_divs[0].img['alt']
            ent_name = ent_name.replace('<em>', '').replace('</em>', '').strip()
            if self.type == 0 and key != ent_name:
                for company_div in company_divs:
                    ent_name = company_div.img['alt']
                    ent_name = ent_name.replace('<em>', '').replace('</em>', '').strip()
                    if key == ent_name:
                        div_first = company_div
                        break
            try:
                header_div = div_first.find('div', class_='header')
                url = header_div.find('a', class_="name")["href"].strip()
                self.get_detail(url, ent_name)
            except Exception as e:
                print(e)
        else:
            return 'None'

    def get_detail(self, url, key):
        """
        Fetch the company detail page and parse it.
        :param url: company detail url
        :param key: company name
        """
        try:
            con = self.tyc.request_url(url=url, headers=self.headers)
            if self.check_login(con):
                self.parse_detail(con.text, key)
            else:
                self.get_detail(url, key)
        except requests.exceptions.ProxyError:
            self.login()
            self.get_detail(url, key)  # fixed: arguments were swapped
        except Exception as e:
            print(e)
            self.login()
            self.get_detail(url, key)  # fixed: arguments were swapped

    def parse_detail(self, text='', key=''):
        selectors = etree.HTML(text)
        top = selectors.xpath('//div[@id="company_web_top"]')
        baseinfo = {}
        table_lists = selectors.xpath('//div[contains(@id,"_container_baseInfo")]//table')
        if top:
            top = top[0]
            detail_basic = top.xpath(
                './/div[@class="box -company-box "]//div[@class="content"]//div[contains(@class,"detail")]'
            )
            if detail_basic:
                detail_basic = detail_basic[0]
                # The phone number is rendered by an inline <script> element.
                telephone = detail_basic.xpath('./div/div/span//script/text()')
                if telephone:
                    self.phone = telephone[0]
                else:
                    self.phone = None