def parse_company(self, response): item = CompanyItem() item["from_web"] = self.name item["from_url"] = response.url item["area"] = "shenzhen" text_strip = self.text_strip text_join = self.text_join item["name"] = response.xpath( "//div[@class='profile-name']/h1/span/a/text()").extract_first() item["summary"] = text_join( response.xpath("//div[@class='article']/p/text()").extract(), "\n") info_dict = {} try: info_dict.update( info.split(":", maxsplit=1) for info in (text_join(sel.xpath( ".//text()").extract()).replace("\xa0", "") for sel in response.xpath( "//div[@class='contact-info']/ul/li/span"))) item["address"] = info_dict.get("地址") item["telephone"] = info_dict.get("电话", "").strip("-") item["mobile"] = info_dict.get("移动电话", "").strip("-") except Exception: self.logger.exception("") sel_list = response.xpath( "//div[@class='company-info']/table/tbody/tr") if not sel_list: self.notice_change("No data found!!!!! " + response.url) for sel in sel_list: info = [ i for i in sel.xpath("td//text()").extract() if text_strip(i) ] if info: try: info_dict[info[0]] = info[1] except Exception: pass item["name"] = info_dict.get("公司名称") or item["name"] item["found_date"] = info_dict.get("注册时间") item["registered_capital"] = info_dict.get("注册资本") item["employee_scale"] = info_dict.get("公司规模") item["legal_person"] = info_dict.get("法定代表人") item["main_area"] = info_dict.get("年营业额") item["main_products"] = info_dict.get("主营产品") item["company_form"] = info_dict.get("企业类型") item["address"] = info_dict.get("详细地址") or item.get("address") yield item
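# text_strip and text_join are small helpers from the spider base class and
# are not defined in this section. A minimal sketch of the behavior the
# callers above rely on (hypothetical, not the actual implementation):

def text_strip(self, text):
    # Trim ASCII and full-width whitespace. Passing None raises, which the
    # callers catch in order to skip the entry.
    return text.strip(" \t\r\n\xa0\u3000")

def text_join(self, texts, sep=""):
    # Strip each fragment and join the non-empty ones with sep.
    return sep.join(t for t in (s.strip() for s in texts) if t)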
def parse_company(self, response): item = CompanyItem() item["from_web"] = self.name item["from_url"] = response.url item["area"] = "shenzhen" item["name"] = response.xpath("//div[@class='head']/div/strong/text()").extract_first() main_products = response.xpath("//div[@class='head']/div/h4/text()").extract_first("") item["main_products"] = main_products.split(":", maxsplit=1)[-1] item["summary"] = self.text_join(response.xpath("//table[@cellspacing='3']/tr/td//text()").extract(), "\n") try: info_dict = dict(info.split(":", maxsplit=1) for info in response.xpath("//div[@class='qy_body']//li/text()").extract() if ":" in info) item["company_form"] = info_dict.get("公司类型") item["found_date"] = info_dict.get("成立时间") item["employee_scale"] = info_dict.get("公司规模") item["registered_capital"] = info_dict.get("注册资本") item["address"] = info_dict.get("地址") item["mobile"] = info_dict.get("手机") item["telephone"] = info_dict.get("电话") or info_dict.get("传真") except Exception: self.logger.exception("") yield item
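# CompanyItem comes from the project's items module and is not shown here.
# Reconstructed from the fields assigned throughout this section -- a
# sketch assuming plain Field declarations:

import scrapy

class CompanyItem(scrapy.Item):
    from_web = scrapy.Field()
    from_url = scrapy.Field()
    area = scrapy.Field()
    name = scrapy.Field()
    summary = scrapy.Field()
    address = scrapy.Field()
    telephone = scrapy.Field()
    mobile = scrapy.Field()
    found_date = scrapy.Field()
    registered_capital = scrapy.Field()
    employee_scale = scrapy.Field()
    legal_person = scrapy.Field()
    main_area = scrapy.Field()
    main_products = scrapy.Field()
    company_form = scrapy.Field()
    industry = scrapy.Field()
    annual_turnover = scrapy.Field()
    annual_export_volume = scrapy.Field()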
def parse_company(self, response): item = CompanyItem() item["from_web"] = self.name item["from_url"] = response.url item["area"] = "shenzhen" text_join = self.text_join item["name"] = response.xpath( "//a[contains(@class,'businessName')]/@title").extract_first() item["summary"] = self.text_join( response.xpath("//div[@class='compIntro']/p/text()").extract(), "\n") info_dict = {} try: info_dict.update( i.split(":", maxsplit=1) for i in (text_join(sel.xpath(".//text()").extract()) for sel in response.xpath( "//ul[contains(@class,'basicMsgListo')]/li")) if ":" in i) item["company_form"] = info_dict.get("公司性质") item["employee_scale"] = info_dict.get("公司规模") item["legal_person"] = info_dict.get("法人") item["industry"] = info_dict.get("公司行业") item["address"] = info_dict.get("公司地址", "").replace("查看地图", "") except Exception: self.logger.exception("") yield item
def parse_company_name(self, response):
    spider_name = self.name
    response_url = response.url
    urljoin = response.urljoin
    name_exists_func = self.is_search_name_exists
    record_name_func = self.record_search_name
    parse_company = self.parse_company
    text_strip = self.text_strip
    sel_list = response.xpath("//li[@class='dqscontit']/a")
    if not sel_list:
        self.notice_change("No data found!!!!! " + response_url)
    for sel in sel_list:
        try:
            name = text_strip(sel.xpath("text()").extract_first())
        except Exception:
            continue
        if name_exists_func(name):
            continue
        record_name_func(name)
        item = CompanyItem()
        item["from_web"] = spider_name
        item["area"] = "guangdong"
        item["name"] = name
        url = sel.xpath("@href").extract_first("")
        url = urljoin(url)
        request = Request(url, callback=parse_company)
        request.meta["item"] = item
        yield request
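# is_search_name_exists / record_search_name deduplicate company names
# across pages so the same company is not emitted twice. They are defined
# elsewhere; a minimal in-memory sketch (the real versions presumably
# persist the seen set, e.g. to Redis or a file, so restarts keep history):

_seen_names = set()  # hypothetical backing store, class-level for the sketch

def is_search_name_exists(self, name):
    return name in self._seen_names

def record_search_name(self, name):
    self._seen_names.add(name)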
def parse_company_name(self, response):
    spider_name = self.name
    response_url = response.url
    name_exists_func = self.is_search_name_exists
    record_name_func = self.record_search_name
    text_strip = self.text_strip
    sel_list = response.xpath("//td[@class='tItem']")
    if not sel_list:
        self.notice_change("No data found!!!!! " + response_url)
    for sel in sel_list:
        try:
            name = text_strip(sel.xpath("a/text()").extract_first())
        except Exception:
            continue
        if name_exists_func(name):
            continue
        record_name_func(name)
        item = CompanyItem()
        item["from_web"] = spider_name
        item["from_url"] = response_url
        item["area"] = "guangdong"
        item["name"] = name
        infos = sel.xpath(".//text()").extract()
        # The cell's last text node is taken as the address.
        item["address"] = infos[-1] if infos else None
        yield item
    url = response.xpath("//a/b[text()='下一页']/../@href").extract_first()
    if url:
        yield Request(url, self.parse_company_name, dont_filter=True)
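# notice_change (defined elsewhere) is the alarm hook used whenever a page
# produces no matches, which usually means the site changed its markup.
# A plausible sketch -- purely hypothetical, the real hook may also push a
# notification somewhere:

def notice_change(self, msg):
    self.logger.critical("Page structure may have changed: %s", msg)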
def parse_company_name(self, response):
    try:
        text = response.text
        if '"state":"ok"' in text:  # search succeeded
            spider_name = self.name
            name_exists_func = self.is_search_name_exists
            record_name_func = self.record_search_name
            datas = json_loads(text)["data"]
            if "items" in datas:
                for data in datas["items"]:
                    name = data["name"]
                    if not name:
                        continue
                    if name_exists_func(name):
                        continue
                    record_name_func(name)
                    item = CompanyItem()
                    item["from_web"] = spider_name
                    item["from_url"] = ("http://www.tianyancha.com/company/"
                                        + data["id"])
                    item["area"] = "shenzhen"
                    item["name"] = name
                    yield item
        else:
            self.logger.warning(
                "Tianyancha - company search failed, URL(%s)" % response.url)
    except Exception:
        self.logger.exception(
            "Tianyancha - exception during company search, URL(%s)"
            % response.url)
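# json_loads used above is assumed to be a module-level alias of the
# standard-library parser, bound once at import time:

from json import loads as json_loads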
def parse_company(self, response): item = CompanyItem() item["from_web"] = self.name item["from_url"] = response.url item["area"] = "guangdong" text_join = self.text_join item["name"] = response.xpath("//h1[@class='cd_title']/text()").extract_first() item["mobile"] = response.xpath("//span[@class='cd_mob']/text()").extract_first() item["telephone"] = response.xpath("//span[@class='cd_tel']/text()").extract_first("").strip("-") item["main_products"] = text_join(response.xpath("//span[@class='cd_major_item']/a/text()").extract(), ",") item["summary"] = response.xpath("//div[@class='cl_about']/text()").extract_first() try: info_dict = dict(info.split(":", maxsplit=1) for info in response.xpath("//div[contains(@class,'cd_param')]//span/text()").extract()) item["name"] = info_dict.get("公司名称") or item["name"] item["legal_person"] = info_dict.get("法人代表") item["address"] = info_dict.get("公司地址") item["company_form"] = info_dict.get("公司类型") item["registered_capital"] = info_dict.get("注册资本", "").rstrip("万元") item["found_date"] = info_dict.get("成立时间") item["employee_scale"] = info_dict.get("员工人数") item["annual_turnover"] = info_dict.get("年营业额") item["annual_export_volume"] = info_dict.get("年出口额") item["main_area"] = info_dict.get("主要销售区域") except Exception: self.logger.exception("") yield item
def parse_company(self, response): item = CompanyItem() item["from_web"] = self.name item["from_url"] = response.url item["area"] = "shenzhen" text_join = self.text_join item["name"] = response.xpath( "//div[@class='coInfos']/h1/text()").extract_first() item["summary"] = response.xpath( "//div[@class='coInfos']/div[@id='ciTxt']/text()").extract_first() try: info_dict = dict((info[0], text_join(info[1:])) for info in ( sel.xpath(".//text()").extract() for sel in response.xpath("//div[@class='aiMain']/ul/li"))) item["name"] = info_dict.get("公司名称") or item["name"] item["address"] = info_dict.get("公司地址") item["main_products"] = info_dict.get("主营业务") item["mobile"] = info_dict.get("联系手机") item["telephone"] = info_dict.get("联系电话") except Exception: self.logger.exception("") yield item
def parse_company_contact(self, response):
    item = CompanyItem()
    item["from_web"] = self.name
    item["from_url"] = response.url
    item["area"] = "shenzhen"
    text_join = self.text_join
    item["name"] = response.xpath(
        "//div[@id='namelogo']/p/text()").extract_first()
    try:
        info_dict = dict(
            info.split(":", maxsplit=1)
            for info in (
                text_join(sel.xpath(".//text()").extract()).replace("\xa0", "")
                for sel in response.xpath("//div[@id='contact']/ul/li")))
        item["name"] = info_dict.get("公司") or item["name"]
        item["address"] = info_dict.get("地址")
        item["telephone"] = (info_dict.get("电话", "").strip("-")
                             or info_dict.get("传真", "").strip("-"))
        item["mobile"] = info_dict.get("手机")
    except Exception:
        self.logger.exception("")
    # Chain to the "about" page so the introduction can be filled in before
    # the item is finally yielded.
    request = Request(
        response.url.replace("/companycontact.htm", "/companyabout.htm"),
        self.parse_company_introduce)
    request.meta["item"] = item
    yield request
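# The request above carries the half-filled item to the "about" page via
# request.meta. parse_company_introduce is not part of this section; the
# receiving end of that handoff would look roughly like this (a sketch --
# the XPath for the introduction text is hypothetical):

def parse_company_introduce(self, response):
    item = response.meta["item"]  # started in parse_company_contact
    item["summary"] = self.text_join(
        response.xpath("//div[@class='about']//text()").extract(), "\n")
    yield item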
def parse_company(self, response): item = CompanyItem() item["from_web"] = self.name item["from_url"] = response.url item["area"] = "shenzhen" text_strip = self.text_strip text_join = self.text_join item["name"] = response.xpath( "//h1[@class='company_name']/text()").extract_first() item["summary"] = response.xpath( "//div[@class='qynr']/p/text()").extract_first() info_dict = {} try: info_dict.update( info.split(":", maxsplit=1) for info in (text_join( sel.xpath( "*[starts-with(@class,'xg_cd')]/text()").extract()) for sel in response.xpath( "//div[contains(@class,'dpbj')]/ul/li/dl")) if "资料不详" not in info) item["main_products"] = info_dict.get("主营产品") except Exception: self.logger.exception("") try: info_dict.update( info.split(":", maxsplit=1) for info in (text_join(sel.xpath("*/text()").extract()) for sel in response.xpath( "//ul[contains(@class,'jtxx')]/li/dl")) if "资料不详" not in info) item["mobile"] = info_dict.get("手机") item["telephone"] = info_dict.get("电话") item["address"] = info_dict.get("地址") except Exception: self.logger.exception("") for tr in response.xpath("//table[contains(@class,'xxtb')]/tbody/tr"): k1 = tr.xpath("td[1]/text()").extract_first("") v1 = tr.xpath("td[2]/text()").extract_first("") k2 = tr.xpath("td[3]/text()").extract_first("") v2 = tr.xpath("td[4]/text()").extract_first("") if v1 and '资料不详' not in v1: info_dict[text_strip(k1)] = v1 if v2 and '资料不详' not in v2: info_dict[text_strip(k2)] = v2 item["company_form"] = info_dict.get("企业类型") item["registered_capital"] = info_dict.get("注册资本") item["legal_person"] = info_dict.get("法定代表人/负责人") item["annual_turnover"] = info_dict.get("年营业额") item["employee_scale"] = info_dict.get("员工人数") yield item
def parse_company(self, response): item = CompanyItem() item["from_web"] = self.name item["from_url"] = response.url item["area"] = "shenzhen" name = response.xpath( "//p[@class='companyname']/span/text()").extract_first() if not name: return item["name"] = name text_join = self.text_join info_dict = {} try: info_dict.update( info.split(":", maxsplit=1) for info in ( text_join(sel.xpath(".//text()").extract()) for sel in response.xpath("//div[@class='comBorder']//p")) if ":" in info and "暂未填写" not in info) item["main_products"] = info_dict.get("主营业务") except Exception: self.logger.exception("") try: info_dict.update( info.split(":", maxsplit=1) for info in (text_join(sel.xpath( ".//text()").extract()).replace("\xa0", "") for sel in response.xpath( "//li[contains(@class,'addIntro')]")) if ":" in info and "暂未填写" not in info) item["address"] = info_dict.get("地址") except Exception: self.logger.exception("") try: info_dict.update( info.split(":", maxsplit=1) for info in ( text_join(sel.xpath(".//text()").extract()) for sel in response.xpath("//div[@class='companytxt']/p")) if "暂未填写" not in info) item["company_form"] = info_dict.get("企业类型") item["registered_capital"] = info_dict.get("注册资本") item["legal_person"] = info_dict.get("法定代表人") item["main_products"] = info_dict.get( "主要供应产品") or item["main_products"] item["main_area"] = info_dict.get("主要面向地区") item["employee_scale"] = info_dict.get("员工数量") item["annual_turnover"] = info_dict.get("年营业额") except Exception: self.logger.exception("") yield item
def parse(self, response):
    spider_name = self.name
    urljoin = response.urljoin
    name_exists_func = self.is_search_name_exists
    record_name_func = self.record_search_name
    parse_company_contact = self.parse_company_contact
    text_strip = self.text_strip
    text_join = self.text_join
    sel_list = response.xpath("//ul[@class='companyList']/li")
    if not sel_list:
        self.notice_change("No data found!!!!! " + response.url)
    for li in sel_list:
        name_a = li.xpath("div[@class='tit']/strong/a")
        try:
            name = text_strip(name_a.xpath("text()").extract_first())
        except Exception:
            continue
        if name_exists_func(name):
            continue
        record_name_func(name)
        item = CompanyItem()
        item["from_web"] = spider_name
        item["area"] = "guangdong"
        item["name"] = name
        try:
            info_dict = dict(
                info.split(":", maxsplit=1)
                for info in (
                    text_join(sel.xpath(".//text()").extract())
                    for sel in li.xpath("dl[1]/dd")))
            item["main_products"] = info_dict.get("主营产品")
            item["address"] = info_dict.get("企业地址")
        except Exception:
            self.logger.exception("")
        url = name_a.xpath("@href").extract_first("")
        url = urljoin(url) + "-contact"
        request = Request(url, callback=parse_company_contact)
        request.meta["item"] = item
        yield request
    url = response.xpath(
        "//div[@class='matpages']/a[text()='下一页']/@href").extract_first()
    if url:
        yield Request(url, self.parse, dont_filter=True)
    else:
        # No next page: start over from the first page to pick up new entries.
        yield Request(self.start_urls[0], self.parse, dont_filter=True)
def parse_company(self, response): item = CompanyItem() item["from_web"] = self.name item["from_url"] = response.url item["area"] = "shenzhen" text_join = self.text_join item["name"] = response.xpath("//h1/text()").extract_first() item["address"] = text_join(response.xpath("//p[@class='fp']/text()").extract(), " ") item["summary"] = text_join(response.xpath("//div[@class='con_msg']//p/text()").extract(), "\n") try: item["company_form"], item["employee_scale"], item["industry"] \ = response.xpath("//p[@class='ltype']/text()").extract_first("").split("|") except ValueError: item["company_form"], item["employee_scale"], item["industry"] = "", "", "" yield item
def parse_shop(self, response):
    text = response.text
    if "过于频繁" in text or "<p>验证码:<input" in text:
        # Throttled or challenged with a captcha; hand off to the handler.
        return self._too_often_handler(response)
    item = CompanyItem()
    item["from_web"] = self.name
    item["from_url"] = response.url
    item["area"] = "shenzhen"
    text_join = self.text_join
    try:
        item["summary"] = text_join(
            response.xpath("//label[@id='ctl00_lab_com_Content']"
                           "//text()").extract())
        info_dict = dict(
            info.split(":", maxsplit=1)
            for info in (
                text_join(sel.xpath(".//text()").extract())
                for sel in response.xpath(
                    "//div[contains(@class,'com_files')]/ul/li")))
        item["name"] = self.text_strip(
            info_dict.get("公司简称") or response.xpath(
                "//h1[@class='com_n']/text()[1]").extract_first(""))
        item["registered_capital"] = info_dict.get("注册资金")
        item["found_date"] = info_dict.get("建立时间")
        item["main_products"] = info_dict.get("主营产品")
        item["employee_scale"] = info_dict.get("员工人数")
        item["company_form"] = info_dict.get("经营模式")
        info_dict.update(
            info.split(":", maxsplit=1)
            for info in (
                text_join(sel.xpath(".//text()").extract())
                for sel in response.xpath("//ul[@class='c_l_contact']/li"))
            if ":" in info)
        item["address"] = info_dict.get("公司地址")
        item["telephone"] = info_dict.get("联系电话")
        item["mobile"] = info_dict.get("移动电话")
    except Exception:
        self.logger.exception("")
    yield item
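# _too_often_handler (not shown in this section) deals with the "request
# too frequent" / captcha interstitial. Note that the callbacks above are
# generators, so "return self._too_often_handler(response)" only runs the
# handler and ends the generator; any request the handler returned would be
# discarded. A minimal sketch consistent with that, assuming the handler is
# responsible only for logging/flagging:

def _too_often_handler(self, response):
    # Hypothetical: log the throttling; a real handler might rotate proxies,
    # slow down, or re-queue the URL through the scheduler.
    self.logger.warning("Rate-limited or captcha page at %s", response.url)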
def parse_company_name(self, response):
    urljoin = response.urljoin
    name_exists_func = self.is_search_name_exists
    record_name_func = self.record_search_name
    parse_company = self.parse_company
    text_strip = self.text_strip
    sel_list = response.xpath("//td[@class='f3']/a[1]")
    if not sel_list:
        self.notice_change("No data found!!!!! " + response.url)
    for sel in sel_list:
        try:
            name = text_strip(sel.xpath("strong/text()").extract_first())
        except Exception:
            continue
        if name_exists_func(name):
            continue
        record_name_func(name)
        url = sel.xpath("@href").extract_first("")
        url = urljoin(url)
        item = CompanyItem()
        item["from_web"] = self.name
        item["area"] = "shenzhen"
        item["name"] = name
        request = Request(url, callback=parse_company)
        request.meta["item"] = item
        yield request
    url = response.xpath("//a[text()='下一页']/@href").extract_first()
    if url:
        # The next page is fetched by re-posting the search form with an
        # incremented page number.
        form_data = parse_qs(response.request.body.decode())
        form_data = {k: v[0] for k, v in form_data.items()}
        form_data["page_change"] = "100"
        form_data["page_num"] = str(int(form_data.get("page_num", 1)) + 1)
        yield FormRequest(response.url, self.parse_company_name,
                          dont_filter=True, formdata=form_data)
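# The pagination above re-posts the search form with page_num bumped by
# one. The body round-trip can be exercised standalone; the field names
# match the code above, the keyword value is made up:

from urllib.parse import parse_qs, urlencode

body = "keyword=%E6%B7%B1%E5%9C%B3&page_num=3&page_change=100"
form = {k: v[0] for k, v in parse_qs(body).items()}  # flatten 1-element lists
form["page_num"] = str(int(form.get("page_num", 1)) + 1)
print(urlencode(form))  # keyword=%E6%B7%B1%E5%9C%B3&page_num=4&page_change=100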
def parse_company(self, response): item = CompanyItem() item["from_web"] = self.name item["from_url"] = response.url item["area"] = "shenzhen" item["name"] = response.xpath("//h1/text()").extract_first() item["company_form"] = response.xpath( "//span[text()='公司性质:']/../../td[2]/span/text()").extract_first() item["employee_scale"] = response.xpath( "//span[text()='公司规模:']/../../td[2]/span/text()").extract_first() item["industry"] = response.xpath( "//span[text()='公司行业:']/../../td[2]/span/text()").extract_first() item["address"] = response.xpath( "//span[text()='公司地址:']/../../td[2]/span/text()").extract_first() item["summary"] = self.text_join( response.xpath( "//div[@class='company-content']//text()").extract()) yield item
def parse(self, response): spider_name = self.name response_url = response.url name_exists_func = self.is_search_name_exists record_name_func = self.record_search_name parse_company = self.parse_company text_strip = self.text_strip sel_list = response.xpath("//div[@class='jobs-list-box']/div/a") if not sel_list: self.notice_change("No data found!!!!! " + response_url) for sel in sel_list: try: name = text_strip(sel.xpath("text()").extract_first()) except Exception: continue if name_exists_func(name): continue record_name_func(name) url = sel.xpath("@href").extract_first("") if "//special.zhaopin.com/" in url: item = CompanyItem() item["from_web"] = spider_name item["from_url"] = response_url item["area"] = "shenzhen" item["name"] = name yield item else: yield Request(url, callback=parse_company) url = response.xpath( "//div[contains(@class,'pageBar')]/span/a[@title='下一页']/@href" ).extract_first() if url: url = response.urljoin(url) yield Request(url, self.parse, dont_filter=True) else: yield Request(self.start_urls[0], self.parse, dont_filter=True)
def parse_company(self, response): item = CompanyItem() item["from_web"] = self.name item["from_url"] = response.url item["area"] = "shenzhen" text_join = self.text_join name = response.xpath( "//div[@class='companyname']/h1/text()").extract_first() if name: # 模板1 item["name"] = name item["summary"] = text_join( (text_join(sel.xpath(".//text()").extract()) for sel in response.xpath( "//div[contains(@class,'shopcontent')]/p")), "\n") try: info_dict = dict( info.split(":", maxsplit=1) for info in ( text_join(sel.xpath(".//text()").extract()) for sel in response.xpath( "//div[contains(@class,'contact')]/ul/li"))) item["address"] = info_dict.get("公司地址") item["telephone"] = info_dict.get("公司传真") except Exception: self.logger.exception("") yield item else: # 模板2 name = response.xpath( "//div[@id='companyname']/h1/a/text()").extract_first() if name: item["name"] = name request = Request(response.url.replace("/shop/", "/contact/"), self.parse_company_contact) request.meta["item"] = item yield request else: self.logger.error("Unknown template: " + response.url) return
def parse(self, response): spider_name = self.name response_url = response.url name_exists_func = self.is_search_name_exists record_name_func = self.record_search_name parse_company = self.parse_company text_strip = self.text_strip sel_list = response.xpath("//div[@class='com-list-2']/table/tr/td/a") if not sel_list: self.notice_change("No data found!!!!! " + response_url) for sel in sel_list: try: name = text_strip(sel.xpath("@title").extract_first()) except Exception: continue if name_exists_func(name): continue record_name_func(name) item = CompanyItem() item["from_web"] = spider_name item["area"] = "shenzhen" item["name"] = name url = sel.xpath("@href").extract_first("") request = Request(url, callback=parse_company) request.meta["item"] = item yield request url = response.xpath( "//ul[contains(@class,'pageLink')]//a[@class='next']/@href" ).extract_first() if url: url = response.urljoin(url) yield Request(url, self.parse, dont_filter=True) else: yield Request(self.start_urls[0], self.parse, dont_filter=True)
def parse_company_name(self, response): spider_name = self.name response_url = response.url name_exists_func = self.is_search_name_exists record_name_func = self.record_search_name parse_company = self.parse_company text_strip = self.text_strip sel_list = response.xpath("//form[@id='jubao']/dl/dt/h4/a") if not sel_list: self.notice_change("No data found!!!!! " + response_url) for sel in sel_list: try: name = text_strip(sel.xpath("text()").extract_first()) except Exception: continue if name_exists_func(name): continue record_name_func(name) item = CompanyItem() item["from_web"] = spider_name item["area"] = "shenzhen" item["name"] = name url = sel.xpath("@href").extract_first("") request = Request(url + "company_detail.html", callback=parse_company) request.meta["item"] = item yield request url = response.xpath( "//div[contains(@class,'page_tag')]/a[text()='下一页']/@href" ).extract_first() if url: yield Request(url, self.parse_company_name, dont_filter=True)
def parse_company(self, response):
    text = response.text
    if "过于频繁" in text or "<p>验证码:<input" in text:
        return self._too_often_handler(response)
    item = CompanyItem()
    item["from_web"] = self.name
    item["from_url"] = response.url
    item["area"] = "shenzhen"
    item["name"] = self.text_strip(
        response.xpath("//div[@class='headcont']//h1/text()").extract_first())
    text_join = self.text_join
    try:
        item["summary"] = text_join(
            response.xpath("//div[@class='hyinfo_detail_txt_files']"
                           "//text()").extract())
        info_dict = dict(
            info.split(":", maxsplit=1)
            for info in (
                text_join(sel.xpath(".//text()").extract())
                for sel in response.xpath(
                    "//li[@class='hyinfo_d_job_list_li']")))
        item["found_date"] = info_dict.get("成立日期")
        item["main_products"] = info_dict.get("主营产品")
        item["address"] = info_dict.get("公司注册地址")
        item["telephone"] = info_dict.get("电话") or info_dict.get("传真")
        item["mobile"] = info_dict.get("业务经理手机")
        item["registered_capital"] = info_dict.get("注册资金")
        item["employee_scale"] = info_dict.get("员工数量")
        item["legal_person"] = info_dict.get("法人")
        item["company_form"] = info_dict.get("公司类型")
    except Exception:
        self.logger.exception("")
    yield item
def parse(self, response): spider_name = self.name response_url = response.url name_exists_func = self.is_search_name_exists record_name_func = self.record_search_name text_strip = self.text_strip sel_list = response.xpath("//nobr/a/text()").extract() if not sel_list: self.notice_change("No data found!!!!! " + response_url) for name in sel_list: name = text_strip(name) if name_exists_func(name): continue record_name_func(name) item = CompanyItem() item["from_web"] = spider_name item["from_url"] = response_url item["area"] = "shenzhen" item["name"] = name yield item
def parse(self, response): spider_name = self.name response_url = response.url name_exists_func = self.is_search_name_exists record_name_func = self.record_search_name text_strip = self.text_strip sel_list = response.xpath("//div[contains(@class,'itemlist')]/h2/a") if not sel_list: self.notice_change("No data found!!!!! " + response_url) for sel in sel_list: try: name = text_strip(sel.xpath("text()").extract_first()) except Exception: continue if len(name) > 50: continue if name_exists_func(name): continue record_name_func(name) item = CompanyItem() item["from_web"] = spider_name item["from_url"] = response_url item["area"] = "shenzhen" item["name"] = name yield item url = response.xpath( "//div[@class='pagelist']/a[text()='后页']/@href").extract_first() if url: yield Request(url, self.parse, dont_filter=True) else: yield Request(self.start_urls[0], self.parse, dont_filter=True)