def business_detail_spider():
    """Scrape daily trading-detail rows from data.eastmoney.com and persist them.

    Pages through the ActiveStatistics JSON endpoint for the current
    ``spider_date``, then for each distinct stock fetches its per-day detail
    page and saves one row per (department_name, amount) pair via
    ``SuperSpider.data_save()``.  Best-effort: failed pages/stocks are skipped.
    """
    seen_stocks = set()  # set instead of list: O(1) duplicate check, same semantics
    business_detail = SuperSpider(
        host='47.102.40.81',
        passwd='Abc12345',
        db='bryframe',
        table_name='business_detail',
        field_list=('spider_date', 'up_date', 'code', 'name',
                    'department_name', 'amount'),
    )
    business_detail.up_date = business_detail.spider_date
    page = 1
    while True:
        try:
            json_data = business_detail.use_requests_to_html(
                f'http://data.eastmoney.com/DataCenter_V3/stock2016/ActiveStatistics/'
                f'pagesize=50,page={page},sortRule=-1,sortType=JmMoney,'
                f'startDate={business_detail.spider_date},'
                f'endDate={business_detail.spider_date},'
                f'gpfw=0,js=var%20data_tab_1.html?rt=25861061',
                'GB2312')
            data_list = business_detail.json_to_py(json_data, deal=True)['data']
        except Exception:  # narrowed from bare except; skip the bad page and move on
            print(f'第{page}页获取失败')
            page += 1
            continue
        # Stop on an empty page or at the hard cap of 500 pages.
        if not data_list or page == 500:
            break
        print(f'第{page}页')
        for data in data_list:
            if not data['SName']:
                continue
            for stock_data in business_detail.json_to_py(data['SName']):
                stock_name = stock_data['CodeName']
                if stock_name in seen_stocks:
                    continue  # each stock is processed at most once per run
                seen_stocks.add(stock_name)
                business_detail.name = stock_name
                business_detail.code = stock_data['SCode']
                try:
                    # SCode may carry non-numeric decoration; keep the digits only.
                    url_code = next(
                        business_detail.re_find(r'\d+', business_detail.code)).group()
                except Exception:
                    continue
                print(url_code)
                url = (f'http://data.eastmoney.com/stock/'
                       f'lhb,{business_detail.spider_date},{url_code}.html')
                try:
                    business_detail.get_request(url)
                except Exception:
                    continue
                detail_data_list = list(
                    business_detail.data_search('find', 'table tbody td'))
                # The detail table has 7 cells per row; cell 1 is the trading
                # department name, cell 6 the amount.  Up to 10 rows are read.
                for i, j in zip(range(1, 71, 7), range(6, 71, 7)):
                    try:
                        business_detail.department_name = \
                            detail_data_list[i].split('\n')[0]
                    except Exception:  # table shorter than expected — stop reading rows
                        break
                    business_detail.amount = detail_data_list[j]
                    business_detail.data_save()
                    print(f'每日成交明细——{business_detail.up_date}——{business_detail.code}——{business_detail.name}——{business_detail.department_name}——导入完成')
        page += 1
    business_detail.spider_end()
def zggys_spider():
    """Scrape supplier company profiles and contact data from cn.china.cn.

    Walks the category links on the home page, pages through each category's
    listing, and for every company page extracts profile fields and phone
    numbers (falling back to a Selenium click when the number is hidden),
    then saves one record per company via ``SuperSpider.data_save()``.
    """
    zggys = SuperSpider(use_selenium=True)
    zggys.source = '中国供应商'
    zggys.website = '-'
    zggys.get_request('https://cn.china.cn/')
    url_list1 = (
        i + '?p={}'
        for i in zggys.data_search(
            'xpath',
            '//*[@id="content"]/div[1]/div[1]/div/div[2]/div/div[2]/div/ul/li/div[2]/a',
            'href'))
    for url1 in url_list1:
        # NOTE(review): paging starts at 10, not 1 — looks like a debugging
        # leftover; confirm before changing.
        page = 10
        while True:
            print(f'第{page}页')
            try:
                zggys.get_request(url1.format(page))
            except Exception:  # narrowed from bare except; retry next page
                print(f'获取第{page}页失败')
                page += 1
                continue
            url_list2 = zggys.data_search('find', 'h3.title a', 'href')
            if not url_list2:
                break  # past the last page of this category
            for url2 in url_list2:
                try:
                    zggys.get_request(url2)
                    zggys.company_name = next(
                        zggys.data_search('find', '.column_xx p a', 'title'))
                except Exception:
                    continue
                # Profile block: lines shaped like "label|value".
                company_info_list = (
                    i for i in next(zggys.data_search('find', '.business_xx')).split('\n')
                    if '|' in i)
                company_info_dict = {
                    i.split('|')[0]: i.split('|')[1] for i in company_info_list}
                zggys.business_mode = company_info_dict.get('经营模式', '-')
                zggys.register_money = company_info_dict.get('注册资本', '-')
                zggys.company_type = company_info_dict.get('企业类型', '-')
                zggys.main_product = company_info_dict.get('主营产品', '-')
                zggys.address = company_info_dict.get('公司地址', '-')
                zggys.person_name = next(
                    zggys.data_search('find', '.personal_top .t span'))
                phone_list = zggys.data_search('find', '.personal_bottom span')
                cell_phone_list = []
                phone_code_list = []
                for phone in phone_list:
                    if not phone:
                        # Number hidden behind a "show" button — click it via Selenium.
                        js = 'var btn=document.querySelector(".see_a.inactive_scode");btn.click();'
                        zggys.selenium_js(url2, js)
                        zggys.cell_phone = next(zggys.selenium_search(
                            'css_selector', '.inactive_top .number'))
                        # NOTE(review): .strip('QQ交谈') strips any of those
                        # characters from both ends, not the substring — confirm intent.
                        phone_info_dict = {
                            i.split('\n')[0]: i.split('\n')[1].strip('QQ交谈')
                            for i in zggys.selenium_search(
                                'css_selector', '.inactive_right .txt p')}
                        zggys.phone_code = phone_info_dict.get('电话', '-')
                        zggys.fax = phone_info_dict.get('传真', '-')
                        zggys.qq = phone_info_dict.get('Q Q', '-')
                    else:
                        # Mobile numbers start with '1'; everything else is a landline.
                        if not phone.startswith('1'):
                            phone_code_list.append(phone)
                        else:
                            cell_phone_list.append(phone)
                if cell_phone_list or phone_code_list:
                    # Visible numbers found — overwrite whatever the Selenium
                    # branch may have set.
                    zggys.phone_code = '/'.join(phone_code_list) if phone_code_list else '-'
                    zggys.cell_phone = '/'.join(cell_phone_list) if cell_phone_list else '-'
                    zggys.fax = '-'
                    zggys.qq = '-'
                zggys.data_save()
                print(f'中国供应商——{zggys.company_name}信息导入完成')
            page += 1
    zggys.spider_end()
def zgcpw_spider():
    """Scrape company profiles and contacts from pe168.com (中国产品网).

    For each profession category, pages through the listing, visits each
    company's "公司介绍" and "联系方式" pages, and saves two records per
    company (one with the landline, one with the mobile number) via
    ``SuperSpider.data_save()``.
    """

    def _field(cells, label):
        # Return the cell that follows *label* in the scraped cell list,
        # or '-' when the label is absent.
        return cells[cells.index(label) + 1] if label in cells else '-'

    zgcpw = SuperSpider()
    # Rolling window of the last 35 company names, used to skip duplicates.
    company_list = deque([], maxlen=35)
    zgcpw.source_name = '中国产品网'
    zgcpw.get_request('http://www.pe168.com/')
    url_list1 = zgcpw.data_search('find', 'td div:nth-child(2) a', 'href')
    profession_list = zgcpw.data_search('find', 'td div:nth-child(2) a')
    for profession, url1 in zip(profession_list, url_list1):
        try:
            zgcpw.get_request(url1)
            # Total page count is shown as ".../N页" in the pager.
            page_all = next(zgcpw.data_search('find', '.pages cite'))
            page_all_number = next(zgcpw.re_find(r'/(\d+)页', page_all)).group(1)
        except Exception:  # narrowed from bare except; skip broken category
            continue
        for page in range(1, int(page_all_number) + 1):
            print(f'{profession}——第{page}页')
            url2 = url1.replace('.html', f'-{page}.html')
            try:
                zgcpw.get_request(url2)
            except Exception:
                continue
            url_list3 = zgcpw.data_search(
                'find', '.left_box form tr ul li:nth-last-child(1) a', 'href')
            company_list3 = zgcpw.data_search(
                'find', '.left_box form tr ul li:nth-last-child(1) a')
            for company_name, url3 in zip(company_list3, url_list3):
                if company_name in company_list:
                    print('信息重复')
                    continue
                company_list.append(company_name)
                zgcpw.company_name = company_name
                try:
                    zgcpw.get_request(url3)
                except Exception:
                    continue
                zgcpw.source_page = url3
                try:
                    company_info_url = next(zgcpw.data_search(
                        'find', 'a[title="公司介绍"]', 'href'))
                except Exception:
                    # Re-append to refresh the name's position in the dedup window.
                    company_list.append(company_name)
                    continue
                try:
                    zgcpw.get_request(company_info_url)
                except Exception:
                    continue
                company_info_list = list(zgcpw.data_search(
                    'find', '.main_body:nth-last-child(1) td'))
                zgcpw.company_type = _field(company_info_list, '公司类型:')
                zgcpw.staff_number = _field(company_info_list, '公司规模:')
                zgcpw.register_money = _field(company_info_list, '注册资本:')
                zgcpw.business_mode = _field(company_info_list, '经营模式:')
                zgcpw.main_product = _field(company_info_list, '经营范围:')
                try:
                    phone_info_url = next(zgcpw.data_search(
                        'find', 'a[title="联系方式"]', 'href'))
                except Exception:
                    company_list.append(company_name)
                    continue
                try:
                    zgcpw.get_request(phone_info_url)
                except Exception:
                    continue
                phone_info_list = list(zgcpw.data_search('find', '.px13.lh18 td'))
                zgcpw.address = _field(phone_info_list, '公司地址:')
                zgcpw.fax = _field(phone_info_list, '公司传真:')
                zgcpw.website = _field(phone_info_list, '公司网址:')
                zgcpw.person_name = _field(phone_info_list, '联 系 人:')
                zgcpw.phone_number = _field(phone_info_list, '公司电话:')
                zgcpw.data_save()  # record with the landline number
                zgcpw.phone_number = _field(phone_info_list, '手机号码:')
                zgcpw.data_save()  # second record with the mobile number
                print(f'{profession}——第{page}页——{company_name}导入完成')
    zgcpw.spider_end()