def ip_check_run():
    del_same()
    ip = SuperSpider(host='192.168.0.172')
    proxies_list = ip.sql_search('select ip from ip_pool')
    loop = asyncio.get_event_loop()
    loop.run_until_complete(ip_check_task(proxies_list, ip))
    ip.spider_end()
def get_phone_number():
    wx = SuperSpider()
    html = wx.get_html(
        'http://192.168.30.200/api/check_wx/get_mobile.html?max_id=6000000&num=5000'
    )
    data_list = wx.json_to_py(html)
    print(data_list)
def ip_spider3_run():
    # Crawl ip.zdaye.com proxy listings asynchronously and store the results in ip_pool.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'acw_tc=792b121315526282356885461e0c5a3c57c2f3644ca7854ccdbc9c502d1ef5; ASPSESSIONIDSADQRDTR=ANOPOIJBEFGBOCBAJPPMKEDH; __51cke__=; Hm_lvt_8fd158bb3e69c43ab5dd05882cf0b234=1552552479,1552628110; __tins__16949115=%7B%22sid%22%3A%201552628109404%2C%20%22vd%22%3A%2018%2C%20%22expires%22%3A%201552630850318%7D; __51laig__=18; Hm_lpvt_8fd158bb3e69c43ab5dd05882cf0b234=1552629050',
        'Host': 'ip.zdaye.com',
        'Referer': 'http://ip.zdaye.com/dayProxy/3.html',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
    }
    ip = SuperSpider(host='192.168.0.172',
                     table_name='ip_pool',
                     field_list=['spider_datetime', 'source_name', 'source_page', 'ip', 'address'],
                     headers=headers)
    proxies_list = ip.sql_search('select ip from ip_pool')
    loop = asyncio.get_event_loop()
    loop.run_until_complete(ip_spider3_task(ip, proxies_list))
    ip.spider_end()
def stw_spider():
    stw = SuperSpider()
    stw_list = deque([], maxlen=30)
    stw.source_name = '商泰网'
    url_list1 = stw.data_search(r'https://cn.made-in-china.com/',
                                '//div[@class="sub-cata"]//dd[@class="sub-cata-item-bd"]//a/@href')
    # print(url_list1)
    for url1 in url_list1:
        url1 = 'https://cn.made-in-china.com' + url1
        code = stw.data_search(url1, '//input[@name="code"]/@value')
        print(code)
def ip_spider6_run():
    ip = SuperSpider(host='192.168.0.172',
                     table_name='ip_pool',
                     field_list=['spider_datetime', 'source_name', 'source_page', 'ip', 'address'])
    proxies_list = ip.sql_search('select ip from ip_pool')
    loop = asyncio.get_event_loop()
    loop.run_until_complete(ip_spider6_task(ip, proxies_list))
    ip.spider_end()
def run_all_spider():
    # Earlier version of the entry point; the later definition of run_all_spider below overrides it.
    run_all = SuperSpider(host='47.102.40.81', passwd='Abc12345', db='bryframe')
    while True:
        now = datetime.now()
        aim_date = f"{now.strftime('%m')}月{now.strftime('%d')}日"
        run_all.get_request('http://data.eastmoney.com/stock/tradedetail.html')
        data_date = run_all.data_search('find', '.cate_type_ul.cate_type_date .at').__next__()
        if aim_date != data_date:
            print('————今日数据未更新————')
            return
        else:
            break
    # lhb_rank_spider()
    # institution_business_spider()
    # stock_count_spider()
    # department_track_spider()
    # active_department_spider()
    # business_detail_spider()
    # department_count_spider()
    # stock_info_spider()
    # stock_report_spider()
    profession_report_spider()
    run_all.spider_end()
def run_all_spider():
    # Entry point: proceed only once eastmoney shows today's data, then run every
    # sub-spider three times, 30 minutes apart.
    run_all = SuperSpider(host='47.102.40.81', passwd='Abc12345', db='bryframe')
    while True:
        now = datetime.now()
        aim_date = f"{now.strftime('%m')}月{now.strftime('%d')}日"
        data_date = run_all.data_search(
            'http://data.eastmoney.com/stock/tradedetail.html',
            '//ul[contains(@class,"cate_type_date")]/li[@class="at"]/text()',
            'gb2312')[0]
        print(data_date)
        if aim_date != data_date:
            print('————今日数据未更新————')
            run_all.spider_end()
            return
        else:
            break
    for i in range(3):
        lhb_rank_spider()
        institution_business_spider()
        stock_count_spider()
        department_track_spider()
        active_department_spider()
        business_detail_spider()
        department_count_spider()
        stock_info_spider()
        stock_report_spider()
        profession_report_spider()
        bonus_data_spider()
        stock_data_spider()
        time.sleep(1800)
    run_all.spider_end()
def xarcw_spider():
    # Assign 2-5 random tags to every post and write the relations into post_tag.
    # Faker and word_list are left over from an earlier version and are not used below.
    f = Faker(locale='zh_CN')
    word_list = ['python', 'web', '数据库', '运维']
    xarcw = SuperSpider(db='supery',
                        table_name='post_tag',
                        default_field='null',
                        field_list=('post_id', 'tag_id'))
    # data = {
    #     'memberName': '13155291086',
    #     'password': '******'}
    # xarcw.post_request('https://login.goodjobs.cn/index.php/action/UserLogin', data=data)
    post_list = xarcw.sql_search('select id from post')
    tag_list = xarcw.sql_search('select id from tag')
    number_list = (2, 3, 4, 5)
    for post in post_list:
        number = random.choice(number_list)
        aim_tag = random.sample(tag_list, number)
        for tag in aim_tag:
            xarcw.post_id = post[0]
            xarcw.tag_id = tag[0]
            xarcw.data_save()
            print(f'{xarcw.post_id}-{xarcw.tag_id}-导入完成')
def stock_count_spider():
    # 个股龙虎榜统计: per-stock Dragon-Tiger list statistics for the past 30 days.
    stock_count = SuperSpider(host='47.102.40.81',
                              passwd='Abc12345',
                              db='bryframe',
                              table_name='stock_count',
                              field_list=('spider_date', 'up_date', 'code', 'name', 'list_time',
                                          'buy_sum', 'sell_sum', 'buy_amount'))
    month_ago = stock_count.date_ago(30)
    page = 1
    while True:
        try:
            json_data = stock_count.use_requests_to_html(
                f'http://data.eastmoney.com/DataCenter_V3/stock2016/StockStatistic/pagesize=50,page={page},sortRule=-1,sortType=,startDate={month_ago},endDate={stock_count.spider_date},gpfw=0,js=var%20data_tab_3.html?rt=25754758',
                'GB2312')
            data_list = stock_count.json_to_py(json_data, deal=True)['data']
        except:
            print(f'第{page}页获取失败')
            page += 1
            continue
        if not data_list or page == 500:
            break
        print(f'第{page}页')
        for data in data_list:
            stock_count.up_date = data['Tdate']
            stock_count.code = data['SCode']
            stock_count.name = data['SName']
            stock_count.list_time = stock_count.to_null(data['SumCount'])
            stock_count.buy_sum = stock_count.to_null(data['Bmoney'])
            stock_count.sell_sum = stock_count.to_null(data['Smoney'])
            stock_count.buy_amount = stock_count.to_null(data['JmMoney'])
            stock_count.data_save()
            print(f'个股龙虎榜统计:{stock_count.up_date}-{stock_count.code}-{stock_count.name}-导入完成')
        page += 1
    stock_count.spider_end()
    print('end:个股龙虎榜统计')
def ht_spider(start_time='2019-01-02 00:00:00',end_time='2019-04-02 00:00:00'): ht=SuperSpider(use_selenium=True,default_field='null',field_list=('start_time','call_duration','connect_duration','talk_duration','ring_duration','call_direction','connect_status','sound_file','customer_id','caller','called','caller_department','caller_number','caller_user_name','project_name','call_type'),table_name='ht_data') ht.selenium_get(r'http://210.13.87.106:8088/ec2') ht.selenium_click('//td[@tabindex="-1"]//div[@class="v-captiontext"]',3) ht.selenium_input('//input[@class="v-textfield"]','mgrdefault8',index=3) ht.selenium_input('//input[@class="v-textfield"]','fuyan2018',index=-1) ht.selenium_click('//span[@class="v-button-caption"]',3,index=1) ht.selenium_click('//span[@class="v-nativebutton-caption"]',3,index=2) ht.selenium_input('//input[@class="v-textfield v-datefield-textfield"]',start_time,index=0) ht.selenium_input('//input[@class="v-textfield v-datefield-textfield"]',end_time,index=0) ht.selenium_click('//div[@class="v-filterselect-button"]',index=2) ht.selenium_click('//td[@class="gwt-MenuItem"]/span',index=0) ht.selenium_click('//div[@class="v-button v-button-default default"]//span[@class="v-button-caption"]',3,index=0) page_all=ht.selenium_search('//*[@id="ec2-100180"]/div/div[2]/div/div[2]/div/div/div/div[1]/div/div/div/div[1]/div/div[2]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div/div/div[2]/div/div/div/div[2]/div/div/div/div[7]/div/div')[0] page_number=ht.re_find(r'/(\d+)页',page_all).__next__().group(1) for page in range(1,int(page_number)): html=ht.page_source() data_list=ht.data_search(html=html,xpath='//td[@class="v-table-cell-content"]//text()') for i,index1,index2 in zip(range(1,1000),range(0,1000,18),range(18,1000,18)): split_list=data_list[index1:index2] if split_list: split_list.pop(8) split_list.pop(8) for field,data in zip(ht.field_list,split_list): exec(f'ht.{field}=data') ht.data_save() print(f'第{page}页——第{i}条数据——导入完成') else: break ht.selenium_click('//*[@id="ec2-100180"]/div/div[2]/div/div[2]/div/div/div/div[1]/div/div/div/div[1]/div/div[2]/div/div[2]/div/div/div[2]/div/div/div/div[2]/div/div/div/div[2]/div/div/div/div[2]/div/div/div/div[3]/div/div/span/span',3) ht.spider_end()
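# ht_spider above assigns each scraped value with exec(f'ht.{field}=data'). The built-in
# setattr performs the same attribute assignment without compiling source strings; a
# minimal sketch of that substitution (the helper name is hypothetical, not part of
# SuperSpider):
def assign_fields(spider, field_names, values):
    # e.g. assign_fields(ht, ht.field_list, split_list) before calling ht.data_save()
    for field, value in zip(field_names, values):
        setattr(spider, field, value)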
def lhb_rank_spider():
    # 当日龙虎榜涨跌幅排名: today's Dragon-Tiger list ranked by price change.
    lhb_rank = SuperSpider(host='47.102.40.81',
                           passwd='Abc12345',
                           db='bryframe',
                           table_name='lhb_rank',
                           field_list=('spider_date', 'up_date', 'code', 'name', 'close_price',
                                       'up_down', 'buy_amount', 'change_rate', 'currency_market'))
    page = 1
    while True:
        try:
            json_data = lhb_rank.use_requests_to_html(
                f'http://data.eastmoney.com/DataCenter_V3/stock2016/TradeDetail/pagesize=200,page={page},sortRule=-1,sortType=,startDate={lhb_rank.spider_date},endDate={lhb_rank.spider_date},gpfw=0,js=var%20data_tab_1.html?rt=25754497',
                'GB2312')
            data_list = lhb_rank.json_to_py(json_data, deal=True)['data']
        except:
            print(f'第{page}页获取失败')
            page += 1
            continue
        if not data_list or page == 500:
            break
        print(f'第{page}页')
        for data in data_list:
            lhb_rank.up_date = lhb_rank.spider_date
            lhb_rank.code = data['SCode']
            lhb_rank.name = data['SName']
            lhb_rank.close_price = lhb_rank.to_null(data['ClosePrice'])
            lhb_rank.up_down = lhb_rank.to_null(data['Chgradio'])
            lhb_rank.buy_amount = lhb_rank.to_null(data['JmMoney'])
            lhb_rank.change_rate = lhb_rank.to_null(data['Dchratio'])
            lhb_rank.currency_market = lhb_rank.to_null(data['Ltsz'])
            lhb_rank.data_save()
            print(f'当日龙虎榜涨跌幅排名:{lhb_rank.up_date}-{lhb_rank.code}-{lhb_rank.name}-导入完成')
        page += 1
    lhb_rank.spider_end()
    print('end:龙虎榜当日跌幅排名')
def profession_report_spider(): profession_report = SuperSpider( table_name='profession_report', field_list=('name', 'spider_date', 'up_date', 'up_down', 'report', 'grade', 'grade_change', 'institution')) sql1 = 'select MAX(up_date) from profession_report' latest_time = profession_report.sql_search(sql1)[0][0] if not latest_time: latest_datetime = datetime.now() - timedelta(days=1) else: latest_datetime = datetime(latest_time.year, latest_time.month, latest_time.day) is_end = False for page in range(1, 1337): url = 'http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=HYSR&mkt=0&stat=0&cmd=4&code=&sc=&ps=50&p=' + str( page ) + '&js=var%20vMcgaFDg={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&rt=51553086' try: json_data = profession_report.use_requests_to_html(url, 'utf8') data_list = profession_report.json_to_py(json_data, deal=True)['data'] except: print(f'第{page}页获取失败') page += 1 continue for data in data_list: data = data.split(',') time1 = data[1].split(' ')[0].replace('/', '-') datetime1 = datetime.strptime(time1, '%Y-%m-%d') if datetime1 <= latest_datetime: print('暂无数据更新') is_end = True break infocode = data[2] time2 = time1.replace('-', '') try: profession_report.get_request( f'http://data.eastmoney.com/report/{time2}/{infocode}.html' ) except: continue report = '' for par in profession_report.data_search('find', '.newsContent p'): report = report + par profession_report.name = data[10] profession_report.up_date = time1 profession_report.up_down = profession_report.to_null(data[11]) profession_report.report = report profession_report.grade = data[7] profession_report.grade_change = data[0] profession_report.institution = data[4] profession_report.data_save() print( f'行业研报:{profession_report.up_date}-{profession_report.name}-{profession_report.institution}-导入完成' ) if is_end == True: break profession_report.spider_end() print('end:行业研报')
def institution_business_spider():
    # 机构买卖情况: daily institutional buy/sell statistics per stock.
    institution_business = SuperSpider(host='47.102.40.81',
                                       passwd='Abc12345',
                                       db='bryframe',
                                       table_name='institution_business',
                                       field_list=('spider_date', 'up_date', 'code', 'name',
                                                   'buy_number', 'sell_number', 'buy_sum',
                                                   'sell_sum', 'buy_amount'))
    page = 1
    while True:
        try:
            json_data = institution_business.use_requests_to_html(
                f'http://data.eastmoney.com/DataCenter_V3/stock2016/DailyStockListStatistics/pagesize=50,page={page},sortRule=-1,sortType=PBuy,startDate={institution_business.spider_date},endDate={institution_business.spider_date},gpfw=0,js=var%20data_tab_1.html?rt=25754580',
                'GB2312')
            data_list = institution_business.json_to_py(json_data, deal=True)['data']
        except:
            print(f'第{page}页获取失败')
            page += 1
            continue
        if not data_list or page == 500:
            break
        print(f'第{page}页')
        for data in data_list:
            institution_business.up_date = institution_business.spider_date
            institution_business.code = data['SCode']
            institution_business.name = data['SName']
            institution_business.buy_number = institution_business.to_null(data['BSL'])
            institution_business.sell_number = institution_business.to_null(data['SSL'])
            institution_business.buy_sum = institution_business.to_null(data['BMoney'])
            institution_business.sell_sum = institution_business.to_null(data['SMoney'])
            institution_business.buy_amount = institution_business.to_null(data['PBuy'])
            institution_business.data_save()
            print(f'机构买卖情况:{institution_business.up_date}-{institution_business.code}-{institution_business.name}-导入完成')
        page += 1
    institution_business.spider_end()
    print('end:机构买卖情况')
def profession_report_spider(): profession_report_list = [] profession_report = SuperSpider( host='47.102.40.81', passwd='Abc12345', db='bryframe', table_name='profession_report', field_list=('name', 'spider_date', 'up_date', 'up_down', 'report', 'grade', 'grade_change', 'institution')) sql1 = 'select MAX(up_date) from profession_report' latest_time = profession_report.sql_search(sql1)[0][0] if not latest_time: latest_datetime = datetime.now() - timedelta(days=1) else: latest_datetime = datetime(latest_time.year, latest_time.month, latest_time.day) is_end = False for page in range(1, 1337): url = 'http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=HYSR&mkt=0&stat=0&cmd=4&code=&sc=&ps=50&p=' + str( page ) + '&js=var%20vMcgaFDg={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&rt=51553086' try: json_data = profession_report.get_html(url) data_list = profession_report.json_to_py(json_data, deal=True)['data'] except Exception as error: print(f'第{page}页获取失败') print(error) page += 1 continue for data in data_list: data = data.split(',') time1 = data[1].split(' ')[0].replace('/', '-') profession_report.name = data[10] profession_report.up_date = time1 datetime1 = datetime.strptime(time1, '%Y-%m-%d') if datetime1 <= latest_datetime: print('暂无数据更新') is_end = True break infocode = data[2] time2 = time1.replace('-', '') profession_report.up_down = profession_report.to_null(data[11]) try: profession_report.report = (''.join( profession_report.data_search( f'http://data.eastmoney.com/report/{time2}/{infocode}.html', '//div[@class="newsContent"]/text()', 'gb2312'))).strip() except: pass sql = f'select name from profession_report where name="{profession_report.name}" and spider_date="{profession_report.spider_date}" and up_date="{profession_report.up_date}" and report="{profession_report.report}"' same_data = profession_report.sql_search(sql) profession_report.grade = data[7] profession_report.grade_change = data[0] profession_report.institution = data[4] profession_report.data_save() print( f'行业研报:{profession_report.up_date}-{profession_report.name}-{profession_report.institution}-导入完成' ) if is_end == True: break profession_report.spider_end() print('end:行业研报')
def stock_report_spider(): stock_report = SuperSpider( host='47.102.40.81', passwd='Abc12345', db='bryframe', table_name='stock_report', field_list=('code', 'name', 'spider_date', 'up_date', 'report', 'grade', 'grade_change', 'institution', 'income_2018', 'rate_2018', 'income_2019', 'rate_2019')) sql1 = 'select MAX(up_date) from stock_report' latest_time = stock_report.sql_search(sql1)[0][0] if not latest_time: latest_datetime = datetime.now() - timedelta(days=1) else: latest_datetime = datetime(latest_time.year, latest_time.month, latest_time.day) is_end = False for page in range(1, 254): url = 'http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=GGSR&js=var%20MILbIdwm={"data":[(x)],"pages":"(pc)","update":"(ud)","count":"(count)"}&ps=50&p=' + str( page) + '&mkt=0&stat=0&cmd=2&code=&rt=51552935' try: json_data = stock_report.use_requests_to_html(url, 'utf8') data_list = stock_report.json_to_py(json_data, deal=True)['data'] except: print(f'第{page}页获取失败') page += 1 continue for data in data_list: time1 = data['datetime'][:10] datetime1 = datetime.strptime(time1, '%Y-%m-%d') if datetime1 <= latest_datetime: print('暂无数据更新') is_end = True break infocode = data['infoCode'] time2 = time1.replace('-', '') try: stock_report.get_request( f'http://data.eastmoney.com/report/{time2}/{infocode}.html' ) except: continue report = '' for par in stock_report.data_search('find', '#ContentBody .newsContent p'): report = report + par stock_report.code = data['secuFullCode'] stock_report.name = data['secuName'] stock_report.up_date = stock_report.spider_date stock_report.report = report stock_report.grade = data['rate'] stock_report.grade_change = data['change'] stock_report.institution = data['insName'] stock_report.income_2018 = stock_report.to_null(data['sys'][0]) stock_report.rate_2018 = stock_report.to_null(data['syls'][0]) stock_report.income_2019 = stock_report.to_null(data['sys'][1]) stock_report.rate_2019 = stock_report.to_null(data['syls'][1]) stock_report.data_save() print( f'个股研报:{stock_report.spider_date}-{stock_report.code}-{stock_report.name}-导入完成' ) if is_end == True: break stock_report.spider_end() print('end:个股研报')
def zggys_spider(): zggys=SuperSpider(use_selenium=True) zggys.source='中国供应商' zggys.website='-' zggys.get_request('https://cn.china.cn/') url_list1=(i+'?p={}' for i in zggys.data_search('xpath','//*[@id="content"]/div[1]/div[1]/div/div[2]/div/div[2]/div/ul/li/div[2]/a','href')) for url1 in url_list1: page=10 while True: print(f'第{page}页') try: zggys.get_request(url1.format(page)) except: print(f'获取第{page}页失败') page+=1 continue url_list2=zggys.data_search('find','h3.title a','href') if not url_list2: break for url2 in url_list2: try: zggys.get_request(url2) zggys.company_name=zggys.data_search('find','.column_xx p a','title').__next__() except: continue company_info_list=(i for i in zggys.data_search('find','.business_xx').__next__().split('\n') if '|' in i) company_info_dict={i.split('|')[0]:i.split('|')[1] for i in company_info_list} zggys.business_mode=company_info_dict.get('经营模式','-') zggys.register_money=company_info_dict.get('注册资本','-') zggys.company_type=company_info_dict.get('企业类型','-') zggys.main_product=company_info_dict.get('主营产品','-') zggys.address=company_info_dict.get('公司地址','-') #print(business_mode,register_money,company_type,main_product,address) zggys.person_name=zggys.data_search('find','.personal_top .t span').__next__() phone_list=zggys.data_search('find','.personal_bottom span') #print(phone_list) cell_phone_list=[] phone_code_list=[] for phone in phone_list: if not phone: js='var btn=document.querySelector(".see_a.inactive_scode");btn.click();' zggys.selenium_js(url2,js) zggys.cell_phone=zggys.selenium_search('css_selector','.inactive_top .number').__next__() phone_info_dict={i.split('\n')[0]:i.split('\n')[1].strip('QQ交谈') for i in zggys.selenium_search('css_selector','.inactive_right .txt p')} zggys.phone_code=phone_info_dict.get('电话','-') zggys.fax=phone_info_dict.get('传真','-') zggys.qq=phone_info_dict.get('Q Q','-') else: if not phone.startswith('1'): phone_code_list.append(phone) else: cell_phone_list.append(phone) if cell_phone_list or phone_code_list: zggys.phone_code='/'.join(phone_code_list) if phone_code_list else '-' zggys.cell_phone='/'.join(cell_phone_list) if cell_phone_list else '-' zggys.fax='-' zggys.qq='-' zggys.data_save() print(f'中国供应商——{zggys.company_name}信息导入完成') page+=1 zggys.spider_end()
def department_count_spider():
    # 证券营业部上榜统计: brokerage branch listing statistics for the past 30 days.
    department_count = SuperSpider(host='47.102.40.81',
                                   passwd='Abc12345',
                                   db='bryframe',
                                   table_name='department_count',
                                   field_list=('spider_date', 'up_date', 'name', 'list_time',
                                               'buy_time', 'buy_sum', 'sell_time', 'sell_sum'))
    month_ago = department_count.date_ago(30)
    page = 1
    while True:
        try:
            json_data = department_count.use_requests_to_html(
                f'http://data.eastmoney.com/DataCenter_V3/stock2016/TraderStatistic/pagesize=50,page={page},sortRule=-1,sortType=,startDate={month_ago},endDate={department_count.spider_date},gpfw=0,js=var%20data_tab_1.html?rt=25754789',
                'GB2312')
            data_list = department_count.json_to_py(json_data, deal=True)['data']
        except:
            print(f'第{page}页获取失败')
            page += 1
            continue
        if not data_list or page == 500:
            break
        print(f'第{page}页')
        for data in data_list:
            department_count.up_date = department_count.spider_date
            department_count.name = data['SalesName']
            department_count.list_time = department_count.to_null(data['UpCount'])
            department_count.buy_time = department_count.to_null(data['BCount'])
            department_count.buy_sum = department_count.to_null(data['SumActBMoney'])
            department_count.sell_time = department_count.to_null(data['SCount'])
            department_count.sell_sum = department_count.to_null(data['SumActSMoney'])
            department_count.data_save()
            print(f'证券营业部上榜统计:{department_count.up_date}-{department_count.name}-导入完成')
        page += 1
    department_count.spider_end()
    print('end:证券营业部上榜统计')
def zjmyqyw(): zjmyqyw=SuperSpider() zjmyqyw.source='浙江名营企业网' zjmyqyw.fax='-' zjmyqyw.get_request('http://www.zj123.com/') url_list1=('http://www.zj123.com/'+i.replace('1.','{}.') for i in zjmyqyw.data_search('find','.indsort dd a','href')) for url1 in url_list1: page=1 while True: print(f'第{page}页') zjmyqyw.get_request(url1.format(page)) page_judge=zjmyqyw.data_search('find','.sleft .m.m1 .fred').__next__().split()[0] if int(page_judge) != page: break print(page_judge) url_list2=('http://www.zj123.com/member/VIPContact/'+i.split('-')[1]+'/index.htm' for i in zjmyqyw.data_search('find','.listdetail22 .listdetail dt a','href')) url_list3=('http://www.zj123.com/member/VIPCompany/'+i.split('-')[1]+'/index.htm' for i in zjmyqyw.data_search('find','.listdetail22 .listdetail dt a','href')) #print(url_list2) for url2,url3 in zip(url_list2,url_list3): zjmyqyw.get_request(url2) contact_info_dict={i.split(':')[0].strip():i.split(':')[-1].strip().replace('\xa0','') for i in zjmyqyw.data_search('find','.rkbody table tr')} zjmyqyw.company_name=contact_info_dict['公司名称'] if contact_info_dict['公司名称'] else '-' zjmyqyw.person_name=contact_info_dict['联系人'] if contact_info_dict['联系人'] else '-' zjmyqyw.address=contact_info_dict['地 址'] if contact_info_dict['地 址'] else '-' zjmyqyw.phone_code=contact_info_dict['电 话'] if contact_info_dict['电 话'] else '-' zjmyqyw.cell_phone=contact_info_dict['手机'] if contact_info_dict['手机'] else '-' zjmyqyw.qq=contact_info_dict['QQ'] if contact_info_dict['QQ'] else '-' zjmyqyw.website=contact_info_dict['网 址'] if contact_info_dict['网 址'] else '-' zjmyqyw.get_request(url3) company_info_list=list(zjmyqyw.data_search('find','.rkbody table tr td')) company_info_dict={company_info_list[n].strip(': '):company_info_list[n+1].strip(': ') for n in range(0,24,2)} #print(company_info_dict) zjmyqyw.main_product=company_info_dict['主营产品或服务'] if company_info_dict['主营产品或服务'] else '-' zjmyqyw.business_mode=company_info_dict['经营模式'] if company_info_dict['经营模式'] else '-' zjmyqyw.company_type=company_info_dict['企业类型'] if company_info_dict['企业类型'] else '-' zjmyqyw.register_money=company_info_dict['注册资本'] if company_info_dict['注册资本'] else '-' zjmyqyw.data_save() print(f'浙江企业网——{zjmyqyw.company_name}信息导入完成') page+=1 zjmyqyw.spider_end() #zjmyqyw() # test_obj=SuperSpider() # js='var btn=document.querySelector(".see_a.inactive_scode");btn.click();' # test_obj.use_selenium() # test_obj.selenium_js('https://www.china.cn/shukongjichuang/3746553522.html',js) # test_obj.cell_phone=test_obj.selenium_search('css_selector','.inactive_top .number').__next__() # print('aaaaaaa') # print(test_obj.cell_phone)
def zggys_spider(): zggys = SuperSpider(host='192.168.0.172', default_field='-') zggys.source_name = '中国供应商' proxies_list = zggys.sql_search('select ip from ip_pool') url_list1 = [ i + '?p={}' for i in zggys.data_search( 'https://cn.china.cn/', '//*[@id="content"]/div[1]/div[1]/div/div[2]/div/div[2]/div/ul/li/div[2]/a/@href' ) ] profession_list = zggys.data_search( 'https://cn.china.cn/', '//*[@id="content"]/div[1]/div[1]/div/div[2]/div/div[2]/div/ul/li/div[2]/a/text()', 'GBK') error_index = profession_list.index('睡袋') for url1, profession in zip(url_list1[error_index:], profession_list[error_index:]): page = 1 while True: time.sleep(2) print(f'{profession}——第{page}页') for i in range(20): proxies = random.choice(proxies_list)[0] print(f'使用代理-{proxies}') key = 'http' if not proxies.startswith('https') else 'https' try: url_list2 = zggys.data_search( url1.format(page), '//ul[@class="extension_ul"]//h3[@class="title"]/a/@href', 'GBK', proxies={key: proxies}, timeout=5) except Exception as error: print(error) continue if not url_list2: print(f'{profession}——第{page}页——没有数据') break for url2 in url_list2: for i in range(20): try: time.sleep(2) proxies = random.choice(proxies_list)[0] print(f'使用代理-{proxies}') key = 'http' if not proxies.startswith( 'https') else 'https' html = zggys.get_html(url2, charset='GBK', proxies={key: proxies}, timeout=5) zggys.source_page = url2 if zggys.data_search( html=html, xpath='//div[@class="column_xx"]//p//a/text()' ): zggys.company_name = zggys.data_search( html=html, xpath='//div[@class="column_xx"]//p//a/text()' )[0] company_info_list = [ i for i in zggys.data_search( html=html, xpath='//ul[@class="business_xx"]//li//text()') if i.strip('\r\n |') ] # print(company_info_list) except Exception as error: print(error) continue else: try: aim_index = company_info_list.index('经营模式') zggys.business_mode = company_info_list[aim_index + 1] except: pass try: aim_index = company_info_list.index('注册资本') zggys.register_money = company_info_list[ aim_index + 1].strip() except: pass try: aim_index = company_info_list.index('企业类型') zggys.company_type = company_info_list[aim_index + 1] except: pass try: aim_index = company_info_list.index('主营产品') zggys.main_product = company_info_list[aim_index + 1] except: pass try: aim_index = company_info_list.index('公司地址') zggys.address = company_info_list[aim_index + 1] except: pass try: zggys.person_name = zggys.data_search( html=html, xpath= '//div[@class="personal_top"]//div[@class="t"]//span/text()' )[0] except: pass phone_list = zggys.data_search( html=html, xpath='//div[@class="personal_bottom"]//span/text()' ) if not phone_list: # js=['var btn=document.querySelector(".see_a.inactive_scode");btn.click();'] # try: # zggys.selenium_open(url2) # zggys.selenium_js(js,sleep_time=2) # zggys.phone_number=zggys.selenium_search('css_selector','.inactive_top .number').__next__() # phone_info_dict={i.split('\n')[0]:i.split('\n')[1].strip('QQ交谈') for i in zggys.selenium_search('css_selector','.inactive_right .txt p')} # except: # continue # zggys.fax=phone_info_dict.get('传真','-').strip() # zggys.qq=phone_info_dict.get('Q Q','-').strip() # zggys.data_save() # zggys.phone_number=phone_info_dict.get('电话','-').strip() # zggys.data_save() break for phone in phone_list: zggys.phone_number = phone.strip() zggys.data_save() print( f'{profession}—第{page}页—{zggys.company_name}信息导入完成' ) break page += 1 zggys.spider_end()
def xarcw_spider(): word_list = ['网络'] xarcw = SuperSpider(host='192.168.0.172', default_field='-') xarcw.source_name = '新安人才网' data = {'memberName': '13155291086', 'password': '******'} xarcw.post_request('https://login.goodjobs.cn/index.php/action/UserLogin', data=data) for word in word_list: for city_code in range(1043, 1061): for page in range(1, 61): print(f'{word}-{city_code}-第{page}页') try: url_list = xarcw.data_search( f'https://search.goodjobs.cn/index.php?keyword={word}&boxwp=c{city_code}&page={page}', '//div[@class="dw_table"]//span[@class="e1"]/a/@href') except: print(f'{word}-{city_code}-第{page}页获取失败') continue if not url_list: print(f'{word}-{city_code}-第{page}页-爬取结束') break for url in url_list: # print(url) xarcw.source_page = url time.sleep(1) data_list = xarcw.data_search(url, [ '//p[@class="cname"]/a/text()', '//p[@class="msg ltype"]/text()', '//div[@class="w706 clearfix"]/text()', '//div[@class="w706 clearfix"]/img/@src', '//div[@class="comadress clearfix"]/text()' ]) if not data_list[0] or not data_list[3]: continue if not data_list[0]: data_list = xarcw.data_search(url, [ '//div[@class="w240 whitespace pb16"]//a[@class="org"]/text()', '//div[@class="w240 whitespace pb16"]//p[@class="grey lh28"]/span[@class="black"]/text()', '//p[@class="duol mt20"]/text()', '//p[@class="duol mt20"]/img/@src', '//div[@class="comadress clearfix"]/text()' ]) xarcw.company_type = data_list[1][0] xarcw.main_product = data_list[1][2] else: company_info_list = [ i.strip('\xa0\xa0\n ') for i in data_list[1][0].split('|') ] xarcw.company_type = company_info_list[0] for j in company_info_list[1:]: if '-' in j: xarcw.staff_number = j else: xarcw.main_product = j xarcw.company_name = data_list[0][0] xarcw.person_name = [i for i in data_list[2] if i.strip()][0] try: xarcw.phone_number = xarcw.use_tesseract( url=data_list[3][0], lang=None) except: continue xarcw.address = data_list[4][0].strip('工作地点:\u3000\n ') xarcw.data_save() print( f'{xarcw.company_name}-{xarcw.person_name}-{xarcw.phone_number}-导入完成' )
def stock_data_spider():
    # Rights-issue ('10配X') data; stop once a page repeats the previous page's first rows.
    data_end = None
    stock_data = SuperSpider(host='139.224.115.44',
                             passwd='A9Vg+Dr*nP^fR=1V',
                             db='bryframe3',
                             table_name='stock_data',
                             field_list=('spider_date', 'up_date', 'code', 'name', 'stock_rate',
                                         'stock_price'))
    page = 1
    while True:
        print(f'第{page}页')
        url = 'http://datainterface.eastmoney.com/EM_DataCenter/JS.aspx?type=NS&sty=NSA&st=6&sr=-1&p=' + str(
            page) + '&ps=50&js=var%20inHqdtrZ={pages:(pc),data:[(x)]}&rt=5174'
        try:
            json_data = stock_data.get_html(url)
            data_list = stock_data.json_to_py(json_data, deal=True)['data']
            if data_list[:3] == data_end:
                break
            else:
                data_end = data_list[:3]
        except:
            print(f'第{page}页获取失败')
            page += 1
            continue
        if not data_list or page == 500:
            break
        for data in data_list:
            field_list = data.split(',')
            stock_data.code = field_list[2]
            stock_data.name = field_list[3]
            stock_data.stock_rate = '10配' + field_list[6]
            stock_data.stock_price = stock_data.to_null(field_list[7])
            stock_data.up_date = field_list[14] if field_list[14] else 'null'
            # If the same (code, spider_date, up_date) row already exists, delete it and re-insert.
            sql = f'select code from stock_data where code="{stock_data.code}" and spider_date="{stock_data.spider_date}" and up_date="{stock_data.up_date}"'
            same_data = stock_data.sql_search(sql)
            if same_data:
                stock_data.sql_search(
                    f'delete from stock_data where code="{stock_data.code}" and spider_date="{stock_data.spider_date}" and up_date="{stock_data.up_date}"'
                )
                print(f'重新爬取-{stock_data.spider_date}-{stock_data.code}-{stock_data.name}')
            stock_data.data_save()
            print(f'{stock_data.up_date}-{stock_data.code}-{stock_data.name}-导入完成')
        page += 1
    stock_data.spider_end()
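# Design note (a sketch resting on a schema assumption, not something the project
# defines): if stock_data had a UNIQUE key over (code, spider_date, up_date), the
# select/delete/insert round-trip in stock_data_spider above could collapse into a
# single upsert. Shown with pymysql, although the project routes SQL through
# SuperSpider.sql_search; the helper name is hypothetical.
import pymysql


def upsert_stock_row(conn, row):
    # row = (spider_date, up_date, code, name, stock_rate, stock_price)
    sql = ('replace into stock_data '
           '(spider_date, up_date, code, name, stock_rate, stock_price) '
           'values (%s, %s, %s, %s, %s, %s)')
    with conn.cursor() as cursor:
        cursor.execute(sql, row)
    conn.commit()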
def bonus_data_spider(): bonus_data = SuperSpider( host='139.224.115.44', passwd='A9Vg+Dr*nP^fR=1V', db='bryframe3', table_name='bonus_data', field_list=('spider_date', 'bonus_report_date', 'code', 'name', 'cash_bonus_rate', 'transfer_rate', 'plan_announce_date', 'stock_register_date', 'remove_date', 'plan_scheduler', 'latest_announce_date')) date_list = bonus_data.data_search( 'http://data.eastmoney.com/yjfp/201812.html', '//select[@id="sel_bgq"]/option/text()', 'gb2312') year_ago_datetime = bonus_data.to_datetime(bonus_data.date_ago(365)) date_list2 = [] for aim_date in date_list: if year_ago_datetime <= bonus_data.to_datetime(str(aim_date)): date_list2.append(aim_date) else: break for use_date in date_list2: bonus_data.bonus_report_date = use_date page = 1 while True: print(f'第{page}页') try: json_data = bonus_data.get_html( f'http://data.eastmoney.com/DataCenter_V3/yjfp/getlist.ashx?js=var%20aTnZIWfZ&pagesize=50&page={page}&sr=-1&sortType=YAGGR&mtk=%C8%AB%B2%BF%B9%C9%C6%B1&filter=(ReportingPeriod=^{use_date}^)&rt=51742239', 'GB2312') data_list = bonus_data.json_to_py(json_data, deal=True)['data'] except: print(f'第{page}页获取失败') page += 1 continue if not data_list or page == 500: break for data in data_list: bonus_data.code = data['Code'] bonus_data.name = data['Name'] bonus_data.latest_announce_date = bonus_data.to_null( data['NoticeDate'][:10]) sql = f'select code from bonus_data where code="{bonus_data.code}" and spider_date="{bonus_data.spider_date}" and latest_announce_date="{bonus_data.latest_announce_date}"' same_data = bonus_data.sql_search(sql) if same_data: bonus_data.sql_search( f'delete from bonus_data where code="{bonus_data.code}" and spider_date="{bonus_data.spider_date}" and latest_announce_date="{bonus_data.latest_announce_date}"' ) print( f'重新爬取-{bonus_data.spider_date}-{bonus_data.code}-{bonus_data.name}' ) bonus_data.plan_announce_date = bonus_data.to_null( data['ResultsbyDate'][:10]) bonus_data.stock_register_date = bonus_data.to_null( data['GQDJR'][:10]) bonus_data.remove_date = bonus_data.to_null(data['CQCXR'][:10]) bonus_data.plan_scheduler = data['ProjectProgress'] group_data = data['AllocationPlan'] try: bonus_data.cash_bonus_rate = '10' + bonus_data.re_find( r'派[\d\.]+', group_data).__next__().group() + '元(含税)' except: bonus_data.cash_bonus_rate = 'null' try: transfer_rate1 = bonus_data.re_find( r'转[\d\.]+', group_data).__next__().group() except: transfer_rate1 = '' try: transfer_rate2 = bonus_data.re_find( r'送[\d\.]+', group_data).__next__().group() except: transfer_rate2 = '' if not transfer_rate1 and not transfer_rate2: bonus_data.transfer_rate = 'null' else: bonus_data.transfer_rate = '10' + transfer_rate2 + transfer_rate1 bonus_data.data_save() print( f'{bonus_data.bonus_report_date}-{bonus_data.code}-{bonus_data.name}-导入完成' ) page += 1 bonus_data.spider_end()
import aiohttp
import asyncio
from super_spider import SuperSpider

ip = SuperSpider(host='192.168.0.172')
session = aiohttp.ClientSession()


async def ip_check(sem, proxies):
    # Probe one proxy against baidu.com and drop it from ip_pool on failure or timeout.
    async with sem:
        try:
            key = 'http' if not proxies.startswith('https') else 'https'
            url = f'{key}://www.baidu.com'
            async with session.get(url,
                                   headers=ip.random_headers(),
                                   proxy=proxies,
                                   timeout=3) as response:
                status_code = response.status
                if status_code != 200:
                    ip.sql_search(f'delete from ip_pool where ip="{proxies}"')
                    print(f'{proxies}-不可用已删除')
                else:
                    print(f'{proxies}-可用')
        except:
            ip.sql_search(f'delete from ip_pool where ip="{proxies}"')
            print(f'{proxies}-不可用已删除')


proxies_list = ip.sql_search('select ip from ip_pool')


async def split_task():
    # The original body is truncated here; reconstructed from ip_check's signature:
    # bound the concurrency with a semaphore and check every proxy in ip_pool
    # (the limit of 100 is an assumption).
    sem = asyncio.Semaphore(100)
    await asyncio.gather(*(ip_check(sem, row[0]) for row in proxies_list))
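# A possible alternative layout for the module above (a sketch under stated assumptions,
# not the original code): newer aiohttp releases warn when ClientSession is constructed
# at import time, outside a running event loop, so the session can instead be opened
# inside the coroutine that fans out the checks. The function name and the concurrency
# limit are hypothetical.
async def check_pool_with_scoped_session(limit=100):
    sem = asyncio.Semaphore(limit)
    async with aiohttp.ClientSession() as scoped_session:

        async def check_one(proxies):
            async with sem:
                try:
                    key = 'http' if not proxies.startswith('https') else 'https'
                    async with scoped_session.get(f'{key}://www.baidu.com',
                                                  headers=ip.random_headers(),
                                                  proxy=proxies,
                                                  timeout=3) as response:
                        if response.status != 200:
                            ip.sql_search(f'delete from ip_pool where ip="{proxies}"')
                except Exception:
                    ip.sql_search(f'delete from ip_pool where ip="{proxies}"')

        # sql_search returns rows as tuples, so the proxy string is row[0]
        await asyncio.gather(*(check_one(row[0]) for row in proxies_list))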
def stock_info_spider():
    # 行情中心: daily quotes (high/low/open/previous close) for every listed stock.
    stock_info = SuperSpider(host='47.102.40.81',
                             passwd='Abc12345',
                             db='bryframe',
                             table_name='stock_info',
                             field_list=('code', 'name', 'spider_date', 'up_date', 'highest',
                                         'lowest', 'today', 'yesterday'))
    for page in range(1, 181):
        try:
            json_data = stock_info.use_requests_to_html(
                f'http://nufm.dfcfw.com/EM_Finance2014NumericApplication/JS.aspx?cb=jQuery11240974473783255319_1545290975192&type=CT&token=4f1862fc3b5e77c150a2b985b12db0fd&sty=FCOIATC&js=(%7Bdata%3A%5B(x)%5D%2CrecordsFiltered%3A(tot)%7D)&cmd=C._A&st=(ChangePercent)&sr=-1&p={page}&ps=20&_=1545290975206',
                'utf8')
            data_list = stock_info.json_to_py(json_data, deal=True)['data']
        except:
            print(f'第{page}页获取失败')
            continue
        print(f'第{page}页')
        for data_str in data_list:
            # '-' placeholders are mapped to 'null' before splitting the row.
            data = data_str.replace('-', 'null').split(',')
            stock_info.code = data[1]
            stock_info.name = data[2]
            stock_info.up_date = stock_info.spider_date
            stock_info.highest = stock_info.to_null(data[9])
            stock_info.lowest = stock_info.to_null(data[10])
            stock_info.today = stock_info.to_null(data[11])
            stock_info.yesterday = stock_info.to_null(data[12])
            stock_info.data_save()
            print(f'行情中心:{stock_info.up_date}-{stock_info.code}-{stock_info.name}-导入完成')
    stock_info.spider_end()
    print('end:行情中心')
def department_track_spider():
    # 机构席位买卖追踪: institutional seat buy/sell tracking over the past 30 days.
    department_track = SuperSpider(host='47.102.40.81',
                                   passwd='Abc12345',
                                   db='bryframe',
                                   table_name='department_track',
                                   field_list=('spider_date', 'up_date', 'code', 'name',
                                               'list_time', 'buy_sum', 'buy_time', 'sell_time',
                                               'buy_amount', 'up_down'))
    month_ago = department_track.date_ago(30)
    page = 1
    while True:
        try:
            json_data = department_track.use_requests_to_html(
                f'http://data.eastmoney.com/DataCenter_V3/stock2016/JgStatistic/pagesize=50,page={page},sortRule=-1,sortType=,startDate={month_ago},endDate={department_track.spider_date},gpfw=0,js=var%20data_tab_3.html?rt=25754592',
                'GB2312')
            data_list = department_track.json_to_py(json_data, deal=True)['data']
        except:
            print(f'第{page}页获取失败')
            page += 1
            continue
        if not data_list or page == 500:
            break
        print(f'第{page}页')
        for data in data_list:
            department_track.up_date = department_track.spider_date
            department_track.code = data['SCode']
            department_track.name = data['SName']
            department_track.list_time = department_track.to_null(data['UPCount'])
            department_track.buy_sum = department_track.to_null(data['JGBMoney'])
            department_track.buy_time = department_track.to_null(data['JGBCount'])
            department_track.sell_time = department_track.to_null(data['JGSCount'])
            department_track.buy_amount = department_track.to_null(data['JGPBuy'])
            department_track.up_down = department_track.to_null(data['RChange1M'])
            department_track.data_save()
            print(f'机构席位买卖追踪:{department_track.up_date}-{department_track.code}-{department_track.name}-导入完成')
        page += 1
    department_track.spider_end()
    print('end:机构席位买卖追踪')
def business_detail_spider(): business_detail_list = [] business_detail = SuperSpider(host='47.102.40.81', passwd='Abc12345', db='bryframe', table_name='business_detail', field_list=('spider_date', 'up_date', 'code', 'name', 'department_name', 'amount')) business_detail.up_date = business_detail.spider_date page = 1 while True: try: json_data = business_detail.get_html( f'http://data.eastmoney.com/DataCenter_V3/stock2016/ActiveStatistics/pagesize=50,page={page},sortRule=-1,sortType=JmMoney,startDate={business_detail.spider_date},endDate={business_detail.spider_date},gpfw=0,js=var%20data_tab_1.html?rt=25861061', 'GB2312') data_list = business_detail.json_to_py(json_data, deal=True)['data'] except: print(f'第{page}页获取失败') page += 1 continue if not data_list or page == 500: break print(f'第{page}页') for data in data_list: if not data['SName']: continue stock_data_list = business_detail.json_to_py(data['SName']) for stock_data in stock_data_list: if stock_data['CodeName'] not in business_detail_list: business_detail_list.append(stock_data['CodeName']) else: continue business_detail.name = stock_data['CodeName'] business_detail.code = stock_data['SCode'] sql = f'select code from business_detail where code="{business_detail.code}" and spider_date="{business_detail.spider_date}"' same_data = business_detail.sql_search(sql) if same_data: business_detail.sql_search( f'delete from business_detail where code="{business_detail.code}" and spider_date="{business_detail.spider_date}"' ) print( f'重新爬取-{business_detail.spider_date}-{business_detail.code}-{business_detail.name}' ) try: url_code = business_detail.re_find( r'\d+', business_detail.code).__next__().group() except: continue url = f'http://data.eastmoney.com/stock/lhb,{business_detail.spider_date},{url_code}.html' try: detail_data_list = [ i for i in business_detail.data_search( url, '//div[@class="content-sepe"]//td//text()', 'gb2312') if i.strip() and '\r' not in i ] for i in range(6): if '(买入前5名与卖出前5名)' in detail_data_list: error_index = detail_data_list.index( '(买入前5名与卖出前5名)') del detail_data_list[error_index:error_index + 6] except: print( f'{business_detail.code}-{business_detail.name}-获取失败') continue # print(detail_data_list) department_list = [] for i, j in zip(range(1, 1000, 8), range(7, 1000, 8)): try: business_detail.department_name = detail_data_list[i] if business_detail.department_name not in department_list: department_list.append( business_detail.department_name) else: print( f'{business_detail.name}-{business_detail.department_name}-信息重复' ) continue business_detail.amount = detail_data_list[j] # print(business_detail.amount) except: break business_detail.data_save() print( f'每日成交明细——{business_detail.up_date}——{business_detail.code}——{business_detail.name}——{business_detail.department_name}——导入完成' ) page += 1 business_detail.spider_end()
def zjmyqyw_spdier(): company_deque=deque([],maxlen=35) zjmyqyw=SuperSpider() zjmyqyw.source_name='浙江名营企业网' zjmyqyw.fax='-' zjmyqyw.get_request('http://www.zj123.com/') url_list1=['http://www.zj123.com/'+i.replace('1.','{}.') for i in zjmyqyw.data_search('find','.indsort dd a','href')] profession_list=list(zjmyqyw.data_search('find','.indsort dd a')) error_index=profession_list.index('特种印刷') for profession,url1 in zip(profession_list[error_index:],url_list1[error_index:]): for page in range(1,100): print(f'{profession}——第{page}页') try: zjmyqyw.get_request(url1.format(page)) page_judge=zjmyqyw.data_search('find','.sleft .m.m1 .fred').__next__().split()[0] except: print(f'获取第{page}页失败') page+=1 continue if int(page_judge) != page: break url_list2=('http://www.zj123.com/member/VIPContact/'+i.split('-')[1]+'/index.htm' for i in zjmyqyw.data_search('find','.listdetail22 .listdetail dt a','href')) url_list3=('http://www.zj123.com/member/VIPCompany/'+i.split('-')[1]+'/index.htm' for i in zjmyqyw.data_search('find','.listdetail22 .listdetail dt a','href')) #print(url_list2) for url2,url3 in zip(url_list2,url_list3): try: zjmyqyw.get_request(url2) except: continue contact_info_dict={i.split(':')[0].strip():i.split(':')[-1].strip().replace('\xa0','') for i in zjmyqyw.data_search('find','.rkbody table tr')} zjmyqyw.company_name=contact_info_dict['公司名称'] if contact_info_dict['公司名称'] else '-' if zjmyqyw.company_name in company_deque: print('信息重复') continue zjmyqyw.person_name=contact_info_dict['联系人'] if contact_info_dict['联系人'] else '-' zjmyqyw.address=contact_info_dict['地 址'] if contact_info_dict['地 址'] else '-' zjmyqyw.phone_number=contact_info_dict['电 话'] if contact_info_dict['电 话'] else '-' zjmyqyw.qq=contact_info_dict['QQ'] if contact_info_dict['QQ'] else '-' zjmyqyw.website=contact_info_dict['网 址'] if contact_info_dict['网 址'] else '-' try: zjmyqyw.get_request(url3) except: continue company_info_list=list(zjmyqyw.data_search('find','.rkbody table tr td')) company_info_dict={company_info_list[n].strip(': '):company_info_list[n+1].strip(': ') for n in range(0,24,2)} #print(company_info_dict) zjmyqyw.main_product=company_info_dict['主营产品或服务'] if company_info_dict['主营产品或服务'] else '-' zjmyqyw.business_mode=company_info_dict['经营模式'] if company_info_dict['经营模式'] else '-' zjmyqyw.company_type=company_info_dict['企业类型'] if company_info_dict['企业类型'] else '-' zjmyqyw.register_money=company_info_dict['注册资本'] if company_info_dict['注册资本'] else '-' zjmyqyw.register_money=company_info_dict['员工人数'] if company_info_dict['员工人数'] else '-' zjmyqyw.source_page=url2 zjmyqyw.data_save() zjmyqyw.phone_number=contact_info_dict['手机'] if contact_info_dict['手机'] else '-' zjmyqyw.data_save() company_deque.append(zjmyqyw.company_name) print(f'{profession}——第{page}页——{zjmyqyw.company_name}信息导入完成') zjmyqyw.spider_end()
def active_department_spider():
    # 每日活跃营业部: daily active brokerage branches and the stocks they traded.
    active_department = SuperSpider(host='47.102.40.81',
                                    passwd='Abc12345',
                                    db='bryframe',
                                    table_name='active_department',
                                    field_list=('spider_date', 'up_date', 'name', 'buy_number',
                                                'sell_number', 'buy_sum', 'sell_sum',
                                                'business_amount', 'code', 'stock_name'))
    page = 1
    while True:
        try:
            json_data = active_department.use_requests_to_html(
                f'http://data.eastmoney.com/DataCenter_V3/stock2016/ActiveStatistics/pagesize=50,page={page},sortRule=-1,sortType=JmMoney,startDate={active_department.spider_date},endDate={active_department.spider_date},gpfw=0,js=var%20data_tab_1.html?rt=25754772',
                'GB2312')
            data_list = active_department.json_to_py(json_data, deal=True)['data']
        except:
            print(f'第{page}页获取失败')
            page += 1
            continue
        if not data_list or page == 500:
            break
        print(f'第{page}页')
        for data in data_list:
            active_department.up_date = active_department.spider_date
            active_department.name = data['YybName']
            active_department.buy_number = active_department.to_null(data['YybBCount'])
            active_department.sell_number = active_department.to_null(data['YybSCount'])
            active_department.buy_sum = active_department.to_null(data['Bmoney'])
            active_department.sell_sum = active_department.to_null(data['Smoney'])
            active_department.business_amount = active_department.to_null(data['JmMoney'])
            if not data['SName']:
                active_department.code = 'null'
                active_department.stock_name = 'null'
                active_department.data_save()
            else:
                # 'SName' is itself a JSON list of the stocks this branch traded.
                for data_s in active_department.json_to_py(data['SName']):
                    active_department.code = data_s['SCode']
                    active_department.stock_name = data_s['CodeName']
                    active_department.data_save()
            print(f'每日活跃营业部:{active_department.up_date}-{active_department.name}-导入完成')
        page += 1
    active_department.spider_end()
    print('end:每日活跃营业部')
def wl114_spider(): wl114 = SuperSpider() wl114.source_name = '网络114' wl114.business_mode = '-' wl114.register_money = '-' wl114.website = '-' wl114.qq = '-' wl114.get_request('http://www.net114.com/') url_list1 = [ i.replace('.html', '-p-{}.html') for i in wl114.data_search( 'xpath', '//*[@id="product_center_content"]/div/ul/li/p/a', attr='href') if i.endswith('.html') ] profession_list1 = [ i for i in wl114.data_search( 'xpath', '//*[@id="product_center_content"]/div/ul/li/p/a') if i != '更多>>' ] error_index = profession_list1.index('维护工具') url_list2 = (i for i in wl114.data_search( 'xpath', '//*[@id="product_center_content"]/div/ul/li/p/a', attr='href') if not i.endswith('.html')) profession_list2 = (i for i in wl114.data_search( 'xpath', '//*[@id="product_center_content"]/div/ul/li/p/a') if i == '更多>>') for url1, profession1 in zip(url_list1[error_index:], profession_list1[error_index:]): try: wl114.get_request(url1.format(1)) all_page = wl114.data_search( 'find', '.page_p:not(span)').__next__().split('\xa0')[1] except: continue for page in range(1, int(all_page) + 1): print(f'{profession1}——第{page}页') try: wl114.get_request(url1.format(page)) except: continue url_list3 = list( wl114.data_search('find', '.product_list_div_h143 h2 a', 'href')) if not url_list3: break for url3 in url_list3: try: wl114.get_request(url3) company_info_dict = { i.split(':')[0].strip(): i.split(':')[-1].strip() for i in wl114.data_search( 'find', '.right.w_250 .border.p_8 li') if ':' in i } phone_url = wl114.data_search( 'find', '.right.w_250 .border.p_8 li a', 'href').__next__() except: continue wl114.company_type = company_info_dict.get('企业性质', '-') wl114.main_product = company_info_dict.get('企业主营', '-') wl114.address = company_info_dict.get('企业地址', '-') try: wl114.get_request(phone_url) except: continue phone_info_data = wl114.data_search( 'find', 'td[valign="top"]:first-child') try: phone_info_list = phone_info_data.__next__().split('\n') phone_info_dict = { i.split(':')[0].strip(): i.split(':')[-1].strip() for i in phone_info_list if ':' in i } except: continue wl114.company_name = phone_info_dict.get('公司名称', '-') if wl114.company_name == '-': wl114.company_name = phone_info_dict.get('企业名称', '-') wl114.person_name = phone_info_dict.get('联系人', '-') wl114.fax = phone_info_dict.get('传真', '-') wl114.phone_number = phone_info_dict.get('手机', '-') wl114.source_page = url3 wl114.data_save() wl114.phone_number = phone_info_dict.get('联系电话', '-') wl114.data_save() print(f'{profession1}——第{page}页——{wl114.company_name}信息导入完成') page += 1 for url2 in url_list2: try: wl114.get_request(url2) except: continue url_list4 = (i.replace('.html', '-p-{}.html') for i in wl114.data_search( 'find', '.product_w369_list a[href]', 'href')) profession_list4 = wl114.data_search('find', '.product_w369_list a[href]') for profession4, url4 in zip(profession_list4, url_list4): try: wl114.get_request(url4.format(1)) all_page = wl114.data_search( 'find', '.page_p:not(span)').__next__().split('\xa0')[1] except: continue for page in range(1, int(all_page) + 1): print(f'{profession4}——第{page}页') try: wl114.get_request(url4.format(page)) except: continue url_list3 = list( wl114.data_search('find', '.product_list_div_h143 h2 a', 'href')) if not url_list3: break for url3 in url_list3: try: wl114.get_request(url3) company_info_dict = { i.split(':')[0].strip(): i.split(':')[-1].strip() for i in wl114.data_search( 'find', '.right.w_250 .border.p_8 li') if ':' in i } phone_url = wl114.data_search( 'find', '.right.w_250 .border.p_8 li a', 
'href').__next__() except: continue wl114.company_type = company_info_dict.get('企业性质', '-') wl114.main_product = company_info_dict.get('企业主营', '-') wl114.address = company_info_dict.get('企业地址', '-') try: wl114.get_request(phone_url) except: continue phone_info_data = wl114.data_search( 'find', 'td[valign="top"]:first-child') try: phone_info_list = phone_info_data.__next__().split( '\n') phone_info_dict = { i.split(':')[0].strip(): i.split(':')[-1].strip() for i in phone_info_list if ':' in i } except: continue wl114.company_name = phone_info_dict.get('公司名称', '-') if wl114.company_name == '-': wl114.company_name = phone_info_dict.get('企业名称', '-') wl114.person_name = phone_info_dict.get('联系人', '-') wl114.fax = phone_info_dict.get('传真', '-') wl114.phone_number = phone_info_dict.get('手机', '-') wl114.source_page = url3 wl114.data_save() wl114.phone_number = phone_info_dict.get('联系电话', '-') wl114.data_save() print( f'{profession4}——第{page}页——{wl114.company_name}信息导入完成') page += 1 wl114.spider_end()
def business_detail_spider(): stock_list = [] business_detail = SuperSpider(host='47.102.40.81', passwd='Abc12345', db='bryframe', table_name='business_detail', field_list=('spider_date', 'up_date', 'code', 'name', 'department_name', 'amount')) business_detail.up_date = business_detail.spider_date page = 1 while True: try: json_data = business_detail.use_requests_to_html( f'http://data.eastmoney.com/DataCenter_V3/stock2016/ActiveStatistics/pagesize=50,page={page},sortRule=-1,sortType=JmMoney,startDate={business_detail.spider_date},endDate={business_detail.spider_date},gpfw=0,js=var%20data_tab_1.html?rt=25861061', 'GB2312') data_list = business_detail.json_to_py(json_data, deal=True)['data'] except: print(f'第{page}页获取失败') page += 1 continue if not data_list or page == 500: break print(f'第{page}页') for data in data_list: if not data['SName']: continue stock_data_list = business_detail.json_to_py(data['SName']) for stock_data in stock_data_list: if stock_data['CodeName'] not in stock_list: stock_list.append(stock_data['CodeName']) else: continue business_detail.name = stock_data['CodeName'] business_detail.code = stock_data['SCode'] try: url_code = business_detail.re_find( r'\d+', business_detail.code).__next__().group() except: continue print(url_code) url = f'http://data.eastmoney.com/stock/lhb,{business_detail.spider_date},{url_code}.html' try: business_detail.get_request(url) except: continue detail_data_list = list( business_detail.data_search('find', 'table tbody td')) for i, j in zip(range(1, 71, 7), range(6, 71, 7)): try: business_detail.department_name = detail_data_list[ i].split('\n')[0] business_detail.amount = detail_data_list[j] except: break business_detail.data_save() print( f'每日成交明细——{business_detail.up_date}——{business_detail.code}——{business_detail.name}——{business_detail.department_name}——导入完成' ) page += 1 business_detail.spider_end()