def data_url(self):
    """
    Collect the detail-page URLs from each listing page.
    :return:
    """
    count = red_cli.scard('category_urls')
    while count:
        try:
            # Rotate to a fresh proxy IP for every category crawled.
            proxy = choice(get_proxy())["ip"]
            proxy = {
                'http': 'http://{}'.format(proxy),
                'https': 'https://{}'.format(proxy)
            }
            data_category = red_cli.srandmember('category_urls')
            data = eval(data_category)  # parse the Redis member once
            item = EasyDict()
            item.category_name = data["category"]
            city_url = data["city_url"]
            resp = self.feach.get_req(url=city_url, proxies=proxy)
            tag = 0
            if resp:
                etre = HTML(resp)
                hygate_urls = etre.xpath('//a[@class="proName02"]/@href')
                company_names = etre.xpath('//a[@class="proName02"]/text()')
                if hygate_urls:
                    for href, name in zip(hygate_urls, company_names):
                        item.company_url = "http://www.zhaoshang100.com" + href
                        item.company_name = name
                        red_cli.sadd('100_zsw_info_url', str(item))
                        log.info("Listing-page item {} stored in Redis".format(item))
                    log.info("First page of company URLs done; crawling the remaining pages")
                    self.page(item, city_url, proxy)
                else:
                    log.info("No data yet for this city/category")
            else:
                tag = 1
            if tag == 1:
                print('Request failed')
            else:
                log.info('All data inserted into Redis')
                red_cli.srem('category_urls', data_category)
                count -= 1
        except Exception:
            # A bare "except: pass" here used to swallow every error silently.
            log.exception('Unexpected error while processing a category')
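# Every loop in this module builds its proxies dict the same way from
# get_proxy(). A minimal hedged helper sketch (assuming get_proxy() returns a
# list of dicts that each carry an "ip" field like "1.2.3.4:8080", as the
# call sites suggest):
from random import choice

def build_proxies():
    """Pick one proxy at random and shape it for requests' proxies= kwarg."""
    ip = choice(get_proxy())["ip"]
    return {
        'http': 'http://{}'.format(ip),
        'https': 'https://{}'.format(ip),
    }

# Usage sketch: resp = self.feach.get_req(url=city_url, proxies=build_proxies())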
def hygate(self):
    """
    Industries / categories for each city.
    :return:
    """
    count = red_cli.scard('jr_city')
    while count:
        # Rotate to a fresh proxy IP for every city crawled.
        proxy = choice(get_proxy())["ip"]
        proxy = {
            'http': 'http://{}'.format(proxy),
            'https': 'https://{}'.format(proxy)
        }
        data_category = red_cli.srandmember('jr_city')
        data = eval(data_category)  # parse the Redis member once
        province = data["category"]
        city = data["city_name"]
        city_url = data["city_url"]
        tag = 0
        item = EasyDict()
        item.province = province
        item.city = city
        resp = self.feach.get_req(url=city_url, proxies=proxy)
        if resp:
            etre = HTML(resp)
            hygate_urls = etre.xpath('//div[@id="sidebar"]/div/div/div//a/@href')
            hygate_names = etre.xpath('//div[@id="sidebar"]/div/div/div//a/text()')
            for href, name in zip(hygate_urls, hygate_names):
                if name == "全部":  # skip the "all" pseudo-category
                    continue
                item.city_url = "https://www.jvrong.com" + str(href)
                item.city_name = name
                red_cli.sadd('jr_hycate', str(item))
                log.info("Inserted {}".format(item))
        else:
            tag = 1
        if tag == 1:
            print('Request failed')
        else:
            log.info('All data inserted into Redis')
            red_cli.srem('jr_city', data_category)
            count -= 1
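# The crawlers in this module round-trip items through Redis as str(EasyDict)
# and re-hydrate them with eval(), which will execute arbitrary expressions.
# A hedged alternative sketch (assuming the stored members are plain dict
# reprs of str values, which str(EasyDict(...)) produces): ast.literal_eval
# parses literals only and cannot run code.
import ast

def load_member(raw):
    """Safely parse one Redis set member produced by red_cli.sadd(key, str(item)).

    `raw` may be bytes (redis-py default) or str; literal_eval raises
    ValueError/SyntaxError on anything that is not a pure Python literal.
    """
    if isinstance(raw, bytes):
        raw = raw.decode('utf-8')
    return ast.literal_eval(raw)

# Usage sketch: data = load_member(red_cli.srandmember('jr_city'))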
def data_url(self):
    """
    Collect the detail-page URLs from each listing page.
    :return:
    """
    count = red_cli.scard('jr_hycate')
    while count:
        # Rotate to a fresh proxy IP for every category crawled.
        proxy = choice(get_proxy())["ip"]
        proxy = {
            'http': 'http://{}'.format(proxy),
            'https': 'https://{}'.format(proxy)
        }
        data_category = red_cli.srandmember('jr_hycate')
        data = eval(data_category)  # parse the Redis member once
        item = EasyDict()
        item.province = data["province"]
        item.city = data["city"]
        item.category_name = data["city_name"]
        city_url = data["city_url"]
        resp = self.feach.get_req(url=city_url, proxies=proxy)
        tag = 0
        if resp:
            etre = HTML(resp)
            hygate_urls = etre.xpath('//div[@class="list-item-content"]/div[1]/a/@href')
            hygate_names = etre.xpath('//div[@class="list-item-content"]/div[1]/a/text()')
            if hygate_urls:
                for href, name in zip(hygate_urls, hygate_names):
                    item.company_url = href
                    item.company_name = name
                    red_cli.sadd('jr_info_url', str(item))
                # A full page (30 entries) suggests more pages follow.
                if len(hygate_urls) >= 30:
                    self.page(item, city_url, proxy)
            else:
                log.info("No data yet for this city/category")
        else:
            tag = 1
        if tag == 1:
            print('Request failed')
        else:
            log.info('All data inserted into Redis')
            red_cli.srem('jr_hycate', data_category)
            count -= 1
def city(self):
    """
    Cities for each top-level category.
    :return:
    """
    count = red_cli.scard('jr_category')
    while count:
        # Rotate to a fresh proxy IP for every category crawled.
        proxy = choice(get_proxy())["ip"]
        proxy = {
            'http': 'http://{}'.format(proxy),
            'https': 'https://{}'.format(proxy)
        }
        data_category = red_cli.srandmember('jr_category')
        data = eval(data_category)  # parse the Redis member once
        cate = data["category_name"]
        cate_url = data["category_url"]
        tag = 0
        item = EasyDict()
        item.category = cate
        resp = self.feach.get_req(url=cate_url, proxies=proxy)
        if resp:
            etre = HTML(resp)
            city_urls = etre.xpath('//div[@class="filter-item"]/div[last()]/a/@href')
            city_names = etre.xpath('//div[@class="filter-item"]/div[last()]/a/text()')
            for href, name in zip(city_urls, city_names):
                if name == "全部":  # skip the "all" pseudo-city
                    continue
                item.city_url = "https://www.jvrong.com" + str(href)
                item.city_name = name
                red_cli.sadd('jr_city', str(item))
        else:
            tag = 1
        if tag == 1:
            print('Request failed')
        else:
            log.info('All data inserted into Redis')
            red_cli.srem('jr_category', data_category)
            count -= 1
def hygate(self):
    """
    Industries / categories.
    :return:
    """
    count = red_cli.scard('100_zsw_category')
    while count:
        # Rotate to a fresh proxy IP for every category crawled.
        proxy = choice(get_proxy())["ip"]
        proxy = {
            'http': 'http://{}'.format(proxy),
            'https': 'https://{}'.format(proxy)
        }
        data_category = red_cli.srandmember('100_zsw_category')
        data = eval(data_category)  # parse the Redis member once
        cate_names = data["category_name"]
        city_url = data["urls"]
        tag = 0
        item = EasyDict()
        item.category = cate_names
        resp = self.feach.get_req(url=city_url, proxies=proxy)
        if resp:
            etre = HTML(resp)
            hygate_urls = etre.xpath('//ul[@id="ul_i"]/li/a/@href')
            hygate_names = etre.xpath('//ul[@id="ul_i"]/li/a/@title')
            # The link title is only used to skip the "all" pseudo-entry.
            for href, name in zip(hygate_urls, hygate_names):
                if name == "全部":
                    continue
                item.city_url = "http://www.zhaoshang100.com" + str(href)
                red_cli.sadd('category_urls', str(item))
                log.info("Inserted {}".format(item))
        else:
            tag = 1
        if tag == 1:
            print('Request failed')
        else:
            log.info('All data inserted into Redis')
            red_cli.srem('100_zsw_category', data_category)
            count -= 1
def page(self, item, city_url, proxy):
    """
    Pagination.
    :param item:
    :param city_url:
    :param proxy:
    :return:
    """
    page_num = 2
    base_url = city_url
    city_url = base_url + str(page_num)
    company_md5 = ''
    while 1:
        resp = self.feach.get_req(url=city_url, proxies=proxy)
        if resp:
            etre = HTML(resp)
            hygate_urls = etre.xpath('//a[@class="proName02"]/@href')
            company_names = etre.xpath('//a[@class="proName02"]/text()')
            if not company_names:
                log.info("Finished crawling this category")
                break
            # The site keeps serving the last page for out-of-range page
            # numbers, so hash the first company name to detect a repeat.
            companys = hashlib.md5(str(company_names[0]).encode('utf-8')).hexdigest()
            if companys != company_md5:
                for href, name in zip(hygate_urls, company_names):
                    item.company_url = "http://www.zhaoshang100.com" + href
                    item.company_name = name
                    red_cli.sadd('100_zsw_info_url', str(item))
                company_md5 = companys
                page_num += 1
                city_url = base_url + str(page_num)
                log.info('Page {} stored'.format(page_num - 1))
            else:
                log.info("Finished crawling this category")
                break
        else:
            log.info("Proxy died during pagination; switching proxy")
            proxy = choice(get_proxy())["ip"]
            proxy = {
                'http': 'http://{}'.format(proxy),
                'https': 'https://{}'.format(proxy)
            }
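# Why the MD5 check above works: many listing sites clamp out-of-range page
# numbers to the last page, so page N+1 re-serves page N's content. Hashing a
# stable fingerprint of each page (here, the first company name) and comparing
# it to the previous page's hash detects the repeat. A minimal self-contained
# sketch (fetch_names is hypothetical and stands in for the request + XPath
# step above):
import hashlib

def crawl_until_repeat(fetch_names, max_pages=200):
    """Yield (page_num, names) until a page repeats or comes back empty."""
    last_fp = None
    for page_num in range(2, max_pages + 1):
        names = fetch_names(page_num)
        if not names:
            break
        fp = hashlib.md5(names[0].encode('utf-8')).hexdigest()
        if fp == last_fp:  # same first entry as the previous page -> clamped
            break
        last_fp = fp
        yield page_num, names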
def data_url(self):
    """
    Collect the detail-page URLs from each listing page.
    :return:
    """
    count = red_cli.scard('wl_114_category')
    while count:
        # Rotate to a fresh proxy IP for every category crawled.
        proxy = choice(get_proxy())["ip"]
        proxy = {
            'http': 'http://{}'.format(proxy),
            'https': 'https://{}'.format(proxy)
        }
        data_category = red_cli.srandmember('wl_114_category')
        data = eval(data_category)  # parse the Redis member once
        item = EasyDict()
        item.category_name = data["category_name"]
        city_url = data["urls"]
        resp = self.feach.get_req(url=city_url, proxies=proxy)
        tag = 0
        if resp:
            etre = HTML(resp)
            hygate_urls = etre.xpath('//div[@class="enter-title"]/h1/a/@href')
            if hygate_urls:
                for href in hygate_urls:
                    item.company_url = href
                    red_cli.sadd('wl_114_info_url', str(item))
                    log.info("Listing-page item {} stored in Redis".format(item))
                log.info("First page of company URLs done; crawling the remaining pages")
                self.page(item, city_url, proxy)
            else:
                log.info("No data yet for this city/category")
        else:
            tag = 1
        if tag == 1:
            print('Request failed')
        else:
            log.info('All data inserted into Redis')
            red_cli.srem('wl_114_category', data_category)
            count -= 1
def page(self, item, city_url, proxy):
    """
    Pagination.
    :param item:
    :param city_url: first-page URL ending in .htm
    :param proxy:
    :return:
    """
    page_num = 2
    base_url = city_url
    city_url = base_url.replace(".htm", "-p-{}.htm".format(page_num))
    while 1:
        resp = self.feach.get_req(url=city_url, proxies=proxy)
        if resp:
            etre = HTML(resp)
            hygate_urls = etre.xpath('//div[@class="enter-title"]/h1/a/@href')
            # A full page (20 entries) suggests more pages follow.
            if len(hygate_urls) >= 20:
                for href in hygate_urls:
                    item.company_url = href
                    red_cli.sadd('wl_114_info_url', str(item))
                log.info("Page {} done".format(page_num))
                page_num += 1
                if page_num > 100:
                    break
                # Rebuild from the base URL instead of chained str.replace,
                # which could also rewrite digits elsewhere in the URL.
                city_url = base_url.replace(".htm", "-p-{}.htm".format(page_num))
            else:
                # Last, partially filled page: store what is there and stop.
                for href in hygate_urls:
                    item.company_url = href
                    red_cli.sadd('wl_114_info_url', str(item))
                log.info("Finished crawling this category")
                break
        else:
            log.info("Proxy died during pagination; switching proxy")
            proxy = choice(get_proxy())["ip"]
            proxy = {
                'http': 'http://{}'.format(proxy),
                'https': 'https://{}'.format(proxy)
            }
def page(self, item, city_url, proxy):
    """
    Pagination.
    :param item:
    :param city_url: first-page URL ending in .htm
    :param proxy:
    :return:
    """
    page_num = 2
    base_url = city_url
    city_url = base_url.replace(".htm", "_{}.htm".format(page_num))
    while 1:
        resp = self.feach.get_req(url=city_url, proxies=proxy)
        if resp:
            etre = HTML(resp)
            hygate_urls = etre.xpath('//div[@class="list-item-content"]/div[1]/a/@href')
            hygate_names = etre.xpath('//div[@class="list-item-content"]/div[1]/a/text()')
            if len(hygate_urls) == 30:  # full page -> more pages follow
                for href, name in zip(hygate_urls, hygate_names):
                    item.company_url = href
                    item.company_name = name
                    red_cli.sadd('jr_info_url', str(item))
                page_num += 1
                if page_num > 100:
                    break
                # Rebuild from the base URL instead of chained str.replace.
                city_url = base_url.replace(".htm", "_{}.htm".format(page_num))
            else:
                # Last, partially filled page: store what is there and stop.
                for href, name in zip(hygate_urls, hygate_names):
                    item.company_url = href
                    item.company_name = name
                    red_cli.sadd('jr_info_url', str(item))
                break
        else:
            proxy = choice(get_proxy())["ip"]
            proxy = {
                'http': 'http://{}'.format(proxy),
                'https': 'https://{}'.format(proxy)
            }
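# The three page() methods above differ only in how the page number is spliced
# into the URL ("...{}", "...-p-{}.htm", "..._{}.htm"). A hedged sketch of a
# shared builder (hypothetical; the scrapers above inline this logic):
def build_page_url(base_url, page_num, pattern="_{}.htm"):
    """Return the URL for `page_num`, always derived from the page-1 URL.

    Deriving every page URL from the unmodified base avoids the chained
    str.replace bug where "2.htm" -> "3.htm" can also rewrite other digits.
    """
    if base_url.endswith(".htm"):
        return base_url.replace(".htm", pattern.format(page_num))
    return base_url + str(page_num)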
def data_info(self):
    """
    Scrape the content of the detail pages.
    :return:
    """
    count = red_cli.scard('wl_114_info_url')
    while count:
        try:
            tag = 0
            # Also filters out dead URLs left in Redis.
            info_data = red_cli.srandmember('wl_114_info_url')
            data = eval(info_data)  # parse the Redis member once
            category_name = data["category_name"]
            url = data["company_url"]
            # Rotate to a fresh proxy IP for every detail page.
            proxy = choice(get_proxy())["ip"]
            proxy = {
                'http': 'http://{}'.format(proxy),
                'https': 'https://{}'.format(proxy)
            }
            resp = self.feach.get_req(url=url, proxies=proxy)
            if resp:
                item = EasyDict()
                etre = HTML(resp)
                try:
                    companyName_pat = re.compile(
                        r'<div class="top_text_title">(.*?)</div>|<dl class="infobox-title"><dt><h1>(.*?)</h1>',
                        re.M)
                    for groups in re.findall(companyName_pat, resp):
                        for x in groups:
                            if x:
                                item.companyName = x
                    item._id = hashlib.md5(
                        str(item.companyName).encode('utf-8')).hexdigest()
                    # "市场部经理" is a literal job title on the page; keep the
                    # group non-capturing so findall returns only the name.
                    outName_pat = re.compile(
                        r'</span>(.*?)</dd><dd ><span>联系电话|<p>联系人:(.*?) (?:市场部经理)</p>')
                    for outNames in re.findall(outName_pat, resp):
                        for outName in outNames:
                            if outName:
                                item.outName = outName
                    item.companyUrl = url
                    Addr_pat = re.compile(
                        r'<p>地 址:(.*?)</p>|<span>详细地址:</span>(.*?)</dd>')
                    for companyAddrs in re.findall(Addr_pat, resp):
                        for companyAddr in companyAddrs:
                            if companyAddr:
                                item.companyAddr = companyAddr
                    if item.get("companyAddr"):
                        try:
                            result = cpca.transform([item.companyAddr])
                            item.companyProvince = result["省"][0]
                            item.companyCity = result["市"][0]
                        except Exception:
                            item.companyProvince = ''
                            item.companyCity = ''
                    item.companyIndustry = category_name
                    item.websource = "http://corp.net114.com/"
                    item.flag = None
                    imTel = []
                    companyTel = []
                    phone = etre.xpath('//span[contains(text(),"联系电话")]/ancestor::dd/text()')
                    if not phone:
                        phone = etre.xpath('//font[contains(text(),"电话")]/text()')
                        if phone:
                            # Greedy group: the old lazy (.*?) at the end of
                            # the pattern always matched the empty string.
                            phone = re.findall(r'电话:(.*)', phone[0])
                    if phone and phone[0]:
                        # Landlines (not starting with 1) go to imTel,
                        # mobiles to companyTel.
                        if phone[0][0] != '1':
                            imTel.append(phone[0])
                        else:
                            companyTel.append(phone[0])
                    item.imTel = imTel
                    item.companyTel = companyTel
                    DB.mongo_add(item)
                    tag = 1
                    log.info("Stored {}".format(item))
                except Exception:
                    log.info('Parse error, check url {}'.format(item.companyUrl))
                    tag = 0
            else:
                log.info("Request timed out")
            if tag == 1:
                red_cli.srem('wl_114_info_url', info_data)
                print('Removed this detail URL from Redis')
                count -= 1
        except Exception:
            log.info('Page error, check the rules for {}'.format(url))
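# How the cpca lookup above behaves: cpca.transform takes a list of free-text
# Chinese addresses and returns a pandas DataFrame with 省/市/区 columns, so
# province and city can be read back by column name and row index. A hedged
# sketch (columns as used above; the exact extra columns vary by cpca version):
import cpca

def split_address(addr):
    """Return (province, city) parsed from a raw address string, '' on failure."""
    try:
        result = cpca.transform([addr])
        return result["省"][0], result["市"][0]
    except Exception:
        return '', ''

# Usage sketch: province, city = split_address('上海市徐汇区虹漕路461号')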
def run_collection(url):
    """Start running the spider."""
    response = requests.get(url, headers=headers, proxies=get_proxy())
    soup = BeautifulSoup(response.content, 'lxml')
    get_investevents_and_insertdb(soup)
def data_info(self):
    """
    Scrape the content of the detail pages.
    :return:
    """
    count = red_cli.scard('jr_info_url')
    while count:
        tag = 0
        # Also filters out dead URLs left in Redis.
        info_data = red_cli.srandmember('jr_info_url')
        data = eval(info_data)  # parse the Redis member once
        url = 'https://www.jvrong.com' + str(data["company_url"])
        if not url.endswith('.htm'):
            red_cli.srem('jr_info_url', info_data)
            log.info("Malformed URL, removed this entry from Redis")
            count -= 1
        else:
            # Rotate to a fresh proxy IP for every detail page.
            proxy = choice(get_proxy())["ip"]
            proxy = {
                'http': 'http://{}'.format(proxy),
                'https': 'https://{}'.format(proxy)
            }
            resp = self.feach.get_req(url=url, proxies=proxy)
            if resp:
                item = EasyDict()
                etre = HTML(resp)
                try:
                    item.companyName = "".join(etre.xpath(
                        '//tbody[@id="meta-table"]/tr[1]/td[last()]/a/text()'))
                    item._id = hashlib.md5(
                        str(item.companyName).encode('utf-8')).hexdigest()
                    item.outName = "".join(etre.xpath(
                        '//tbody[@id="meta-table"]/tr[2]/td[last()]/text()'))
                    item.companyUrl = url
                    item.companyCity = data["city"]
                    item.companyProvince = data["province"]
                    item.companyIndustry = data["category_name"]
                    item.websource = "https://www.jvrong.com/"
                    item.flag = None
                    # Address / landline / mobile need separate handling;
                    # the address sits at a fixed text-node offset in the row.
                    companyAddr = etre.xpath(
                        '//tbody[@id="meta-table"]/tr[3]/td[last()]//text()')
                    item.companyAddr = companyAddr[4].replace(
                        '\r', '').replace('\t', '').replace('\n', '')
                    # Classify each number: an area-code dash or a leading 0/4
                    # marks a landline/service line (imTel); the rest are
                    # mobiles (companyTel).
                    companyTels = etre.xpath(
                        '//tbody[@id="meta-table"]/tr[last()]/td[last()]/p/strong[2]/text()')
                    imTels = etre.xpath(
                        '//tbody[@id="meta-table"]/tr[last()]/td[last()]/p/strong[1]/text()')
                    imTel = []
                    companyTel = []
                    for number in companyTels + imTels:
                        if re.search('-', number) or number[:1] in ('0', '4'):
                            imTel.append(number)
                        else:
                            companyTel.append(number)
                    item.imTel = imTel
                    item.companyTel = companyTel
                    DB.mongo_add(item)
                    tag = 1
                    log.info("Stored {}".format(item))
                except Exception:
                    log.info('Parse error, check url {}'.format(item.companyUrl))
                    tag = 0
            else:
                log.info("Request timed out")
            if tag == 1:
                red_cli.srem('jr_info_url', info_data)
                print('Removed this detail URL from Redis')
                count -= 1
def data_info(self):
    """
    Scrape the content of the detail pages.
    :return:
    """
    count = red_cli.scard('100_zsw_info_url')
    while count:
        tag = 0
        # Also filters out dead URLs left in Redis.
        info_data = red_cli.srandmember('100_zsw_info_url')
        data = eval(info_data)  # parse the Redis member once
        category_name = data["category_name"]
        url = data["company_url"]
        company_name = data["company_name"]
        # Rotate to a fresh proxy IP for every detail page.
        proxy = choice(get_proxy())["ip"]
        proxy = {
            'http': 'http://{}'.format(proxy),
            'https': 'https://{}'.format(proxy)
        }
        resp = self.feach.get_req(url=url, proxies=proxy)
        if resp:
            item = EasyDict()
            etre = HTML(resp)
            try:
                item.companyName = company_name
                item._id = hashlib.md5(
                    str(item.companyName).encode('utf-8')).hexdigest()
                item.companyUrl = url
                item.companyIndustry = category_name
                item.websource = 'http://www.zhaoshang100.com/qiye/'
                item.flag = None
                item.imTel = []
                item.companyTel = []
                # Contact person
                outName = etre.xpath(
                    '//i[contains(text(),"联系人")]/ancestor::li/text()')
                if outName:
                    item.outName = outName[0]
                else:
                    log.info('Check the page rules for {}'.format(url))
                # Address
                companyAddr = etre.xpath(
                    '//i[contains(text(),"公司地址")]/ancestor::li/text()')
                if companyAddr:
                    item.companyAddr = companyAddr[0]
                else:
                    log.info('Check the page rules for {}'.format(url))
                # Phone: landlines (not starting with 1) go to imTel,
                # mobiles to companyTel.
                phone_tel = etre.xpath(
                    '//i[contains(text(),"联系电话")]/ancestor::li/text()')
                if phone_tel:
                    if phone_tel[0][0] != '1':
                        item.imTel.append(phone_tel[0])
                    else:
                        item.companyTel.append(phone_tel[0])
                else:
                    error_phone = etre.xpath(
                        '//i[contains(text(),"联系电话")]/ancestor::li/font/text()')
                    if error_phone and error_phone[0] == '未提交认证,系统隐藏联系方式,登陆后台提交即可恢复':
                        # The site hides contact info for unverified listings.
                        red_cli.srem('100_zsw_info_url', info_data)
                        print('Removed this detail URL from Redis')
                        count -= 1
                # Mobile
                companyTels = etre.xpath(
                    '//i[contains(text(),"联系手机")]/ancestor::li/text()')
                if companyTels:
                    item.companyTel.append(companyTels[0])
                # Province/city from the raw address
                if item.get("companyAddr"):
                    try:
                        result = cpca.transform([item.companyAddr])
                        item.companyProvince = result["省"][0]
                        item.companyCity = result["市"][0]
                    except Exception:
                        item.companyProvince = ''
                        item.companyCity = ''
                tag = 1
                if item.imTel == [] and item.companyTel == []:
                    log.info('No mobile or landline number; dropping this entry')
                else:
                    DB.mongo_add(item)
            except Exception:
                error = (etre.xpath('//title/text()') or [''])[0]
                if error == '100招商网 错误页面 404 Not Found':
                    red_cli.srem('100_zsw_info_url', info_data)
                    print('404 page, removed {}'.format(data))
                    count -= 1
                log.info('Parse error, check url {}'.format(item.companyUrl))
                tag = 0
        else:
            log.info("Request timed out")
        if tag == 1:
            red_cli.srem('100_zsw_info_url', info_data)
            print('Removed this detail URL from Redis')
            count -= 1
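# The data_info methods above repeatedly classify contact numbers by their
# first character: Chinese mobiles start with 1, landlines with an area code
# start with 0, and 400/4xx service numbers start with 4. A hedged helper
# sketch (hypothetical; the scrapers inline this logic):
def classify_number(number):
    """Return 'landline' or 'mobile' for a raw phone string.

    Treats an area-code dash or a leading 0/4 as a landline/service line;
    everything else (typically 1xx...) as a mobile.
    """
    number = number.strip()
    if '-' in number or number[:1] in ('0', '4'):
        return 'landline'
    return 'mobile'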
def get_appid(url):
    """Extract the app id from the detail page."""
    response = requests.get(url, headers=headers, proxies=get_proxy()).content
    soup = BeautifulSoup(response, 'lxml')
    appid = soup.select_one('div[class="mart10"]').select('div')[3].span.get_text()
    return appid
def run_spider(url):
    response = requests.get(url, headers=headers, proxies=get_proxy()).content
    soup = BeautifulSoup(response, 'lxml')
    get_and_insertdb_persons(soup)
def run_spider(url):
    response = requests.get(url, headers=headers, proxies=get_proxy())
    soup = BeautifulSoup(response.content, 'lxml')
    get_simple_message(soup)
def data_info(self):
    """
    Scrape the content of the detail pages.
    :return:
    """
    count = red_cli.scard('snw_info_url')
    while count:
        tag = 0
        # Also filters out dead URLs left in Redis.
        info_data = red_cli.srandmember('snw_info_url')
        data = eval(info_data)  # parse the Redis member once
        # The contact card lives at .../member/contact/... rather than the
        # stored .../member/... URL, so splice "/contact" in.
        head, _, tail = str(data["company_url"]).partition("member")
        url = head + "member" + "/contact" + tail
        # Rotate to a fresh proxy IP for every detail page.
        proxy = choice(get_proxy())["ip"]
        proxy = {
            'http': 'http://{}'.format(proxy),
            'https': 'https://{}'.format(proxy)
        }
        resp = self.feach.get_req(url=url, proxies=proxy)
        if resp:
            item = EasyDict()
            etre = HTML(resp)
            try:
                item.companyName = "".join(
                    etre.xpath('//strong[@id="lblCompany"]/text()'))
                item._id = hashlib.md5(
                    str(item.companyName).encode('utf-8')).hexdigest()
                name = "".join(re.findall(
                    r'<a href="../#" id="namecard_linkman" target="_blank">(.*?)</a>',
                    resp, re.M))
                sex = "".join(re.findall(
                    r'<span id="namecard_sex">(.*?)</span>', resp, re.M))
                item.outName = name + sex
                item.companyUrl = url
                item.companyAddr = "".join(
                    etre.xpath('//span[@id="namecard_addr"]/text()'))
                if item.companyAddr:
                    try:
                        result = cpca.transform([item.companyAddr])
                        item.companyProvince = result["省"][0]
                        item.companyCity = result["市"][0]
                    except Exception:
                        item.companyProvince = ''
                        item.companyCity = ''
                item.companyIndustry = data["category_name"]
                item.websource = "http://www.sn180.com/default.html"
                item.flag = None
                # Classify each number: landlines (not starting with 1) go to
                # imTel, mobiles to companyTel.
                companyTels = etre.xpath('//td[@id="namecard_tel"]/text()')
                imTels = etre.xpath('//td[@id="namecard_mobile"]/text()')
                imTel = []
                companyTel = []
                for number in companyTels + imTels:
                    if number[:1] != '1':
                        imTel.append(number)
                    else:
                        companyTel.append(number)
                item.imTel = imTel
                item.companyTel = companyTel
                DB.mongo_add(item)
                tag = 1
                log.info("Stored {}".format(item))
            except Exception:
                log.info('Parse error, check url {}'.format(item.companyUrl))
                tag = 0
        else:
            log.info("Request timed out")
        if tag == 1:
            red_cli.srem('snw_info_url', info_data)
            print('Removed this detail URL from Redis')
            count -= 1