def crawl_proxyGoubanjia(self):
    """Crawl free proxies from http://www.goubanjia.com/.

    Yields:
        str: a proxy address assembled from the visible text of each
        ``<td class="ip">`` cell.
    """
    start_url = "http://www.goubanjia.com/"
    html = get_page(start_url)
    if html:
        html_ele = etree.HTML(html, etree.HTMLParser())
        for td in html_ele.xpath("//td[@class='ip']"):
            # The site embeds decoy digits in hidden <p> elements
            # (style="display:none") to confuse scrapers; strip every
            # <p> before collecting the cell's text nodes.
            etree.strip_elements(td, 'p')
            yield ''.join(td.xpath(".//text()"))
def data_from_vm():
    """Parse poster data from every stored VM url and persist it.

    For each url record fetched from the database: download the first
    result page, insert its parsed rows, then walk any remaining pages
    (``&page=N``) and insert those as well.
    """
    urls = db_util.get_urls()
    for url in urls:
        # Parenthesized call form works identically on Py2 and Py3;
        # the original bare `print url` is a SyntaxError on Py3.
        print(url)
        content = util.get_page(url.name, CHARSETVM)
        if content is None:
            # Fetch failed — skip this url rather than crash on None.
            continue
        type_poster = url.type_url
        soup = util.html_soup(content)
        page = _count_page_vm_url(soup)
        _insert_data_vm_to_db(_parser_vm_data(soup), type_poster)
        # Remaining pages of a paginated listing; the range is empty
        # when page <= 1, so no extra guard is needed.
        for i in range(2, page + 1):
            link = '{0}&page={1}'.format(url.name, i)
            content = util.get_page(link, CHARSETVM)
            soup_inner = util.html_soup(content)
            _insert_data_vm_to_db(_parser_vm_data(soup_inner), type_poster)
def order_list():
    """Render the order listing page.

    Returns:
        The rendered ``home/order_list.html`` template populated with a
        paginated, enumerated sequence of orders.
    """
    if request.method != 'GET':
        return
    records, paginate = get_page(request, Order, 'order_id')
    # Hand the fetched orders to the front-end template.
    return render_template('home/order_list.html',
                           orders=enumerate(records),
                           paginate=paginate)
def supplier_list():
    """Render the supplier listing page.

    Returns:
        The rendered ``home/supplier_list.html`` template populated with
        a paginated, enumerated sequence of suppliers.
    """
    if request.method != 'GET':
        return
    records, paginate = get_page(request, Supplier, 'supplier_id')
    # Hand the fetched suppliers to the front-end template.
    return render_template('home/supplier_list.html',
                           suppliers=enumerate(records),
                           paginate=paginate)
def part_list():
    """Render the part listing page.

    Returns:
        The rendered ``home/part_list.html`` template with a paginated,
        enumerated sequence of parts plus all suppliers (for the
        supplier-selection widgets).
    """
    if request.method != 'GET':
        return
    records, paginate = get_page(request, Part, 'part_id')
    all_suppliers = Supplier.query.order_by('supplier_id').all()
    # Hand the fetched parts and supplier choices to the front-end template.
    return render_template('home/part_list.html',
                           parts=enumerate(records),
                           suppliers=all_suppliers,
                           paginate=paginate)
def store_list():
    """Render the inventory (store) listing page.

    Returns:
        The rendered ``home/store_list.html`` template with a paginated,
        enumerated sequence of stock records plus (id, name) pairs of all
        parts for lookup in the template.
    """
    if request.method != 'GET':
        return
    records, paginate = get_page(request, Store, 'store_id')
    part_pairs = Part.query.with_entities(
        Part.part_id, Part.part_name).order_by('part_id').all()
    # Hand the fetched stock rows and part names to the front-end template.
    return render_template('home/store_list.html',
                           stores=enumerate(records),
                           parts=part_pairs,
                           paginate=paginate)
def affair_list():
    """Render the affair listing page.

    Returns:
        The rendered ``home/affair_list.html`` template with a paginated,
        enumerated sequence of affairs, plus the open orders (status 1,
        purchased quantity still below the ordered quantity).
    """
    if request.method != 'GET':
        return
    records, paginate = get_page(request, Affair, 'affair_id')
    open_orders = Order.query.filter(
        Order.order_status == 1,
        Order.purchased_num < Order.order_num).all()
    # Hand the fetched affairs and open orders to the front-end template.
    return render_template('home/affair_list.html',
                           affair_ls=enumerate(records),
                           paginate=paginate,
                           orders=open_orders)
def crawl_kuaidaili(self, page_count=3000):
    """Crawl free proxies from https://www.kuaidaili.com/free/inha/<page>/.

    Args:
        page_count: number of listing pages to walk (default 3000).

    Yields:
        str: proxies formatted as ``"ip:port"``.
    """
    # Hoisted out of the loop: the original re-ran `import time` on
    # every iteration.
    import time
    start_url = "https://www.kuaidaili.com/free/inha/{}/"
    for page in range(1, page_count + 1):
        url = start_url.format(page)
        time.sleep(2)  # throttle so we don't hammer the site
        print("Crawler : ", url)
        html = get_page(url)
        if html:
            html_ele = etree.HTML(html, etree.HTMLParser())
            ips = html_ele.xpath("//div[@id='list']//table//tr/td[1]/text()")
            ports = html_ele.xpath("//div[@id='list']//table//tr/td[2]/text()")
            # Pair ip/port columns positionally instead of indexing by range().
            for ip, port in zip(ips, ports):
                yield ':'.join([ip, port])
def crawl_daili66(self, page_count=2000):
    """Crawl free proxies from http://www.66ip.cn/<page>.html (proxy 66).

    Args:
        page_count: number of listing pages to walk (default 2000).

    Yields:
        str: proxies formatted as ``"ip:port"``.
    """
    start_url = "http://www.66ip.cn/{}.html"
    for page in range(1, page_count + 1):
        url = start_url.format(page)
        print("Crawler : ", url)
        html = get_page(url)
        time.sleep(1)  # throttle so we don't hammer the site
        if html:
            html_ele = etree.HTML(html, etree.HTMLParser())
            # Skip the header row (position()>1); columns 1 and 2 are ip/port.
            ips = html_ele.xpath("//div[@id='main']//table//tr[position()>1]/td[1]/text()")
            ports = html_ele.xpath("//div[@id='main']//table//tr[position()>1]/td[2]/text()")
            # Pair ip/port columns positionally instead of indexing by range().
            for ip, port in zip(ips, ports):
                yield ':'.join([ip, port])