def parse_company(self, response): item = CompanyItem() item["from_web"] = self.name item["from_url"] = response.url item["area"] = "shenzhen" text_strip = self.text_strip text_join = self.text_join item["name"] = response.xpath( "//div[@class='profile-name']/h1/span/a/text()").extract_first() item["summary"] = text_join( response.xpath("//div[@class='article']/p/text()").extract(), "\n") info_dict = {} try: info_dict.update( info.split(":", maxsplit=1) for info in (text_join(sel.xpath( ".//text()").extract()).replace("\xa0", "") for sel in response.xpath( "//div[@class='contact-info']/ul/li/span"))) item["address"] = info_dict.get("地址") item["telephone"] = info_dict.get("电话", "").strip("-") item["mobile"] = info_dict.get("移动电话", "").strip("-") except Exception: self.logger.exception("") sel_list = response.xpath( "//div[@class='company-info']/table/tbody/tr") if not sel_list: self.notice_change("No data found!!!!! " + response.url) for sel in sel_list: info = [ i for i in sel.xpath("td//text()").extract() if text_strip(i) ] if info: try: info_dict[info[0]] = info[1] except Exception: pass item["name"] = info_dict.get("公司名称") or item["name"] item["found_date"] = info_dict.get("注册时间") item["registered_capital"] = info_dict.get("注册资本") item["employee_scale"] = info_dict.get("公司规模") item["legal_person"] = info_dict.get("法定代表人") item["main_area"] = info_dict.get("年营业额") item["main_products"] = info_dict.get("主营产品") item["company_form"] = info_dict.get("企业类型") item["address"] = info_dict.get("详细地址") or item.get("address") yield item
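# text_strip and text_join are small helpers from the spider base class and
# are not defined in this section. A minimal sketch of the behavior the
# callers above rely on (hypothetical, not the actual implementation):

def text_strip(self, text):
    # Trim ASCII and full-width whitespace. Passing None raises, which the
    # callers catch in order to skip the entry.
    return text.strip(" \t\r\n\xa0\u3000")

def text_join(self, texts, sep=""):
    # Strip each fragment and join the non-empty ones with sep.
    return sep.join(t for t in (s.strip() for s in texts) if t)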
def parse_company(self, response): item = CompanyItem() item["from_web"] = self.name item["from_url"] = response.url item["area"] = "shenzhen" item["name"] = response.xpath("//div[@class='head']/div/strong/text()").extract_first() main_products = response.xpath("//div[@class='head']/div/h4/text()").extract_first("") item["main_products"] = main_products.split(":", maxsplit=1)[-1] item["summary"] = self.text_join(response.xpath("//table[@cellspacing='3']/tr/td//text()").extract(), "\n") try: info_dict = dict(info.split(":", maxsplit=1) for info in response.xpath("//div[@class='qy_body']//li/text()").extract() if ":" in info) item["company_form"] = info_dict.get("公司类型") item["found_date"] = info_dict.get("成立时间") item["employee_scale"] = info_dict.get("公司规模") item["registered_capital"] = info_dict.get("注册资本") item["address"] = info_dict.get("地址") item["mobile"] = info_dict.get("手机") item["telephone"] = info_dict.get("电话") or info_dict.get("传真") except Exception: self.logger.exception("") yield item
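# CompanyItem comes from the project's items module and is not shown here.
# Reconstructed from the fields assigned throughout this section -- a
# sketch assuming plain Field declarations:

import scrapy

class CompanyItem(scrapy.Item):
    from_web = scrapy.Field()
    from_url = scrapy.Field()
    area = scrapy.Field()
    name = scrapy.Field()
    summary = scrapy.Field()
    address = scrapy.Field()
    telephone = scrapy.Field()
    mobile = scrapy.Field()
    found_date = scrapy.Field()
    registered_capital = scrapy.Field()
    employee_scale = scrapy.Field()
    legal_person = scrapy.Field()
    main_area = scrapy.Field()
    main_products = scrapy.Field()
    company_form = scrapy.Field()
    industry = scrapy.Field()
    annual_turnover = scrapy.Field()
    annual_export_volume = scrapy.Field()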
def parse_company(self, response): item = CompanyItem() item["from_web"] = self.name item["from_url"] = response.url item["area"] = "shenzhen" text_join = self.text_join item["name"] = response.xpath( "//a[contains(@class,'businessName')]/@title").extract_first() item["summary"] = self.text_join( response.xpath("//div[@class='compIntro']/p/text()").extract(), "\n") info_dict = {} try: info_dict.update( i.split(":", maxsplit=1) for i in (text_join(sel.xpath(".//text()").extract()) for sel in response.xpath( "//ul[contains(@class,'basicMsgListo')]/li")) if ":" in i) item["company_form"] = info_dict.get("公司性质") item["employee_scale"] = info_dict.get("公司规模") item["legal_person"] = info_dict.get("法人") item["industry"] = info_dict.get("公司行业") item["address"] = info_dict.get("公司地址", "").replace("查看地图", "") except Exception: self.logger.exception("") yield item
def parse_company_name(self, response):
    spider_name = self.name
    response_url = response.url
    urljoin = response.urljoin
    name_exists_func = self.is_search_name_exists
    record_name_func = self.record_search_name
    parse_company = self.parse_company
    text_strip = self.text_strip
    sel_list = response.xpath("//li[@class='dqscontit']/a")
    if not sel_list:
        self.notice_change("No data found!!!!! " + response_url)
    for sel in sel_list:
        try:
            name = text_strip(sel.xpath("text()").extract_first())
        except Exception:
            continue
        if name_exists_func(name):
            continue
        record_name_func(name)
        item = CompanyItem()
        item["from_web"] = spider_name
        item["area"] = "guangdong"
        item["name"] = name
        url = sel.xpath("@href").extract_first("")
        url = urljoin(url)
        request = Request(url, callback=parse_company)
        request.meta["item"] = item
        yield request
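# is_search_name_exists / record_search_name deduplicate company names
# across pages so the same company is not emitted twice. They are defined
# elsewhere; a minimal in-memory sketch (the real versions presumably
# persist the seen set, e.g. to Redis or a file, so restarts keep history):

_seen_names = set()  # hypothetical backing store, class-level for the sketch

def is_search_name_exists(self, name):
    return name in self._seen_names

def record_search_name(self, name):
    self._seen_names.add(name)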
def parse_company_name(self, response):
    spider_name = self.name
    response_url = response.url
    name_exists_func = self.is_search_name_exists
    record_name_func = self.record_search_name
    text_strip = self.text_strip
    sel_list = response.xpath("//td[@class='tItem']")
    if not sel_list:
        self.notice_change("No data found!!!!! " + response_url)
    for sel in sel_list:
        try:
            name = text_strip(sel.xpath("a/text()").extract_first())
        except Exception:
            continue
        if name_exists_func(name):
            continue
        record_name_func(name)
        item = CompanyItem()
        item["from_web"] = spider_name
        item["from_url"] = response_url
        item["area"] = "guangdong"
        item["name"] = name
        infos = sel.xpath(".//text()").extract()
        # The cell's last text node is taken as the address.
        item["address"] = infos[-1] if infos else None
        yield item
    url = response.xpath("//a/b[text()='下一页']/../@href").extract_first()
    if url:
        yield Request(url, self.parse_company_name, dont_filter=True)
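# notice_change (defined elsewhere) is the alarm hook used whenever a page
# produces no matches, which usually means the site changed its markup.
# A plausible sketch -- purely hypothetical, the real hook may also push a
# notification somewhere:

def notice_change(self, msg):
    self.logger.critical("Page structure may have changed: %s", msg)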
def parse_company_name(self, response):
    try:
        text = response.text
        if '"state":"ok"' in text:  # search succeeded
            spider_name = self.name
            name_exists_func = self.is_search_name_exists
            record_name_func = self.record_search_name
            datas = json_loads(text)["data"]
            if "items" in datas:
                for data in datas["items"]:
                    name = data["name"]
                    if not name:
                        continue
                    if name_exists_func(name):
                        continue
                    record_name_func(name)
                    item = CompanyItem()
                    item["from_web"] = spider_name
                    item["from_url"] = ("http://www.tianyancha.com/company/"
                                        + data["id"])
                    item["area"] = "shenzhen"
                    item["name"] = name
                    yield item
        else:
            self.logger.warning(
                "Tianyancha - company search failed, URL(%s)" % response.url)
    except Exception:
        self.logger.exception(
            "Tianyancha - exception during company search, URL(%s)"
            % response.url)
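# json_loads used above is assumed to be a module-level alias of the
# standard-library parser, bound once at import time:

from json import loads as json_loads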
def parse_company(self, response): item = CompanyItem() item["from_web"] = self.name item["from_url"] = response.url item["area"] = "guangdong" text_join = self.text_join item["name"] = response.xpath("//h1[@class='cd_title']/text()").extract_first() item["mobile"] = response.xpath("//span[@class='cd_mob']/text()").extract_first() item["telephone"] = response.xpath("//span[@class='cd_tel']/text()").extract_first("").strip("-") item["main_products"] = text_join(response.xpath("//span[@class='cd_major_item']/a/text()").extract(), ",") item["summary"] = response.xpath("//div[@class='cl_about']/text()").extract_first() try: info_dict = dict(info.split(":", maxsplit=1) for info in response.xpath("//div[contains(@class,'cd_param')]//span/text()").extract()) item["name"] = info_dict.get("公司名称") or item["name"] item["legal_person"] = info_dict.get("法人代表") item["address"] = info_dict.get("公司地址") item["company_form"] = info_dict.get("公司类型") item["registered_capital"] = info_dict.get("注册资本", "").rstrip("万元") item["found_date"] = info_dict.get("成立时间") item["employee_scale"] = info_dict.get("员工人数") item["annual_turnover"] = info_dict.get("年营业额") item["annual_export_volume"] = info_dict.get("年出口额") item["main_area"] = info_dict.get("主要销售区域") except Exception: self.logger.exception("") yield item
def parse_company(self, response): item = CompanyItem() item["from_web"] = self.name item["from_url"] = response.url item["area"] = "shenzhen" text_join = self.text_join item["name"] = response.xpath( "//div[@class='coInfos']/h1/text()").extract_first() item["summary"] = response.xpath( "//div[@class='coInfos']/div[@id='ciTxt']/text()").extract_first() try: info_dict = dict((info[0], text_join(info[1:])) for info in ( sel.xpath(".//text()").extract() for sel in response.xpath("//div[@class='aiMain']/ul/li"))) item["name"] = info_dict.get("公司名称") or item["name"] item["address"] = info_dict.get("公司地址") item["main_products"] = info_dict.get("主营业务") item["mobile"] = info_dict.get("联系手机") item["telephone"] = info_dict.get("联系电话") except Exception: self.logger.exception("") yield item
def parse_company_contact(self, response):
    item = CompanyItem()
    item["from_web"] = self.name
    item["from_url"] = response.url
    item["area"] = "shenzhen"
    text_join = self.text_join
    item["name"] = response.xpath(
        "//div[@id='namelogo']/p/text()").extract_first()
    try:
        info_dict = dict(
            info.split(":", maxsplit=1)
            for info in (
                text_join(sel.xpath(".//text()").extract()).replace("\xa0", "")
                for sel in response.xpath("//div[@id='contact']/ul/li")))
        item["name"] = info_dict.get("公司") or item["name"]
        item["address"] = info_dict.get("地址")
        item["telephone"] = (info_dict.get("电话", "").strip("-")
                             or info_dict.get("传真", "").strip("-"))
        item["mobile"] = info_dict.get("手机")
    except Exception:
        self.logger.exception("")
    # Chain to the "about" page so the introduction can be filled in before
    # the item is finally yielded.
    request = Request(
        response.url.replace("/companycontact.htm", "/companyabout.htm"),
        self.parse_company_introduce)
    request.meta["item"] = item
    yield request
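# The request above carries the half-filled item to the "about" page via
# request.meta. parse_company_introduce is not part of this section; the
# receiving end of that handoff would look roughly like this (a sketch --
# the XPath for the introduction text is hypothetical):

def parse_company_introduce(self, response):
    item = response.meta["item"]  # started in parse_company_contact
    item["summary"] = self.text_join(
        response.xpath("//div[@class='about']//text()").extract(), "\n")
    yield item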
def parse_company(self, response): item = CompanyItem() item["from_web"] = self.name item["from_url"] = response.url item["area"] = "shenzhen" text_strip = self.text_strip text_join = self.text_join item["name"] = response.xpath( "//h1[@class='company_name']/text()").extract_first() item["summary"] = response.xpath( "//div[@class='qynr']/p/text()").extract_first() info_dict = {} try: info_dict.update( info.split(":", maxsplit=1) for info in (text_join( sel.xpath( "*[starts-with(@class,'xg_cd')]/text()").extract()) for sel in response.xpath( "//div[contains(@class,'dpbj')]/ul/li/dl")) if "资料不详" not in info) item["main_products"] = info_dict.get("主营产品") except Exception: self.logger.exception("") try: info_dict.update( info.split(":", maxsplit=1) for info in (text_join(sel.xpath("*/text()").extract()) for sel in response.xpath( "//ul[contains(@class,'jtxx')]/li/dl")) if "资料不详" not in info) item["mobile"] = info_dict.get("手机") item["telephone"] = info_dict.get("电话") item["address"] = info_dict.get("地址") except Exception: self.logger.exception("") for tr in response.xpath("//table[contains(@class,'xxtb')]/tbody/tr"): k1 = tr.xpath("td[1]/text()").extract_first("") v1 = tr.xpath("td[2]/text()").extract_first("") k2 = tr.xpath("td[3]/text()").extract_first("") v2 = tr.xpath("td[4]/text()").extract_first("") if v1 and '资料不详' not in v1: info_dict[text_strip(k1)] = v1 if v2 and '资料不详' not in v2: info_dict[text_strip(k2)] = v2 item["company_form"] = info_dict.get("企业类型") item["registered_capital"] = info_dict.get("注册资本") item["legal_person"] = info_dict.get("法定代表人/负责人") item["annual_turnover"] = info_dict.get("年营业额") item["employee_scale"] = info_dict.get("员工人数") yield item
def parse_company(self, response): item = CompanyItem() item["from_web"] = self.name item["from_url"] = response.url item["area"] = "shenzhen" name = response.xpath( "//p[@class='companyname']/span/text()").extract_first() if not name: return item["name"] = name text_join = self.text_join info_dict = {} try: info_dict.update( info.split(":", maxsplit=1) for info in ( text_join(sel.xpath(".//text()").extract()) for sel in response.xpath("//div[@class='comBorder']//p")) if ":" in info and "暂未填写" not in info) item["main_products"] = info_dict.get("主营业务") except Exception: self.logger.exception("") try: info_dict.update( info.split(":", maxsplit=1) for info in (text_join(sel.xpath( ".//text()").extract()).replace("\xa0", "") for sel in response.xpath( "//li[contains(@class,'addIntro')]")) if ":" in info and "暂未填写" not in info) item["address"] = info_dict.get("地址") except Exception: self.logger.exception("") try: info_dict.update( info.split(":", maxsplit=1) for info in ( text_join(sel.xpath(".//text()").extract()) for sel in response.xpath("//div[@class='companytxt']/p")) if "暂未填写" not in info) item["company_form"] = info_dict.get("企业类型") item["registered_capital"] = info_dict.get("注册资本") item["legal_person"] = info_dict.get("法定代表人") item["main_products"] = info_dict.get( "主要供应产品") or item["main_products"] item["main_area"] = info_dict.get("主要面向地区") item["employee_scale"] = info_dict.get("员工数量") item["annual_turnover"] = info_dict.get("年营业额") except Exception: self.logger.exception("") yield item
def parse(self, response):
    spider_name = self.name
    urljoin = response.urljoin
    name_exists_func = self.is_search_name_exists
    record_name_func = self.record_search_name
    parse_company_contact = self.parse_company_contact
    text_strip = self.text_strip
    text_join = self.text_join
    sel_list = response.xpath("//ul[@class='companyList']/li")
    if not sel_list:
        self.notice_change("No data found!!!!! " + response.url)
    for li in sel_list:
        name_a = li.xpath("div[@class='tit']/strong/a")
        try:
            name = text_strip(name_a.xpath("text()").extract_first())
        except Exception:
            continue
        if name_exists_func(name):
            continue
        record_name_func(name)
        item = CompanyItem()
        item["from_web"] = spider_name
        item["area"] = "guangdong"
        item["name"] = name
        try:
            info_dict = dict(
                info.split(":", maxsplit=1)
                for info in (
                    text_join(sel.xpath(".//text()").extract())
                    for sel in li.xpath("dl[1]/dd")))
            item["main_products"] = info_dict.get("主营产品")
            item["address"] = info_dict.get("企业地址")
        except Exception:
            self.logger.exception("")
        url = name_a.xpath("@href").extract_first("")
        url = urljoin(url) + "-contact"
        request = Request(url, callback=parse_company_contact)
        request.meta["item"] = item
        yield request
    url = response.xpath(
        "//div[@class='matpages']/a[text()='下一页']/@href").extract_first()
    if url:
        yield Request(url, self.parse, dont_filter=True)
    else:
        # No next page: start over from the first page to pick up new entries.
        yield Request(self.start_urls[0], self.parse, dont_filter=True)
def parse_company(self, response): item = CompanyItem() item["from_web"] = self.name item["from_url"] = response.url item["area"] = "shenzhen" text_join = self.text_join item["name"] = response.xpath("//h1/text()").extract_first() item["address"] = text_join(response.xpath("//p[@class='fp']/text()").extract(), " ") item["summary"] = text_join(response.xpath("//div[@class='con_msg']//p/text()").extract(), "\n") try: item["company_form"], item["employee_scale"], item["industry"] \ = response.xpath("//p[@class='ltype']/text()").extract_first("").split("|") except ValueError: item["company_form"], item["employee_scale"], item["industry"] = "", "", "" yield item
def parse_shop(self, response):
    text = response.text
    if "过于频繁" in text or "<p>验证码:<input" in text:
        # Throttled or challenged with a captcha; hand off to the handler.
        return self._too_often_handler(response)
    item = CompanyItem()
    item["from_web"] = self.name
    item["from_url"] = response.url
    item["area"] = "shenzhen"
    text_join = self.text_join
    try:
        item["summary"] = text_join(
            response.xpath("//label[@id='ctl00_lab_com_Content']"
                           "//text()").extract())
        info_dict = dict(
            info.split(":", maxsplit=1)
            for info in (
                text_join(sel.xpath(".//text()").extract())
                for sel in response.xpath(
                    "//div[contains(@class,'com_files')]/ul/li")))
        item["name"] = self.text_strip(
            info_dict.get("公司简称") or response.xpath(
                "//h1[@class='com_n']/text()[1]").extract_first(""))
        item["registered_capital"] = info_dict.get("注册资金")
        item["found_date"] = info_dict.get("建立时间")
        item["main_products"] = info_dict.get("主营产品")
        item["employee_scale"] = info_dict.get("员工人数")
        item["company_form"] = info_dict.get("经营模式")
        info_dict.update(
            info.split(":", maxsplit=1)
            for info in (
                text_join(sel.xpath(".//text()").extract())
                for sel in response.xpath("//ul[@class='c_l_contact']/li"))
            if ":" in info)
        item["address"] = info_dict.get("公司地址")
        item["telephone"] = info_dict.get("联系电话")
        item["mobile"] = info_dict.get("移动电话")
    except Exception:
        self.logger.exception("")
    yield item
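# _too_often_handler (not shown in this section) deals with the "request
# too frequent" / captcha interstitial. Note that the callbacks above are
# generators, so "return self._too_often_handler(response)" only runs the
# handler and ends the generator; any request the handler returned would be
# discarded. A minimal sketch consistent with that, assuming the handler is
# responsible only for logging/flagging:

def _too_often_handler(self, response):
    # Hypothetical: log the throttling; a real handler might rotate proxies,
    # slow down, or re-queue the URL through the scheduler.
    self.logger.warning("Rate-limited or captcha page at %s", response.url)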
def parse_company_name(self, response):
    urljoin = response.urljoin
    name_exists_func = self.is_search_name_exists
    record_name_func = self.record_search_name
    parse_company = self.parse_company
    text_strip = self.text_strip
    sel_list = response.xpath("//td[@class='f3']/a[1]")
    if not sel_list:
        self.notice_change("No data found!!!!! " + response.url)
    for sel in sel_list:
        try:
            name = text_strip(sel.xpath("strong/text()").extract_first())
        except Exception:
            continue
        if name_exists_func(name):
            continue
        record_name_func(name)
        url = sel.xpath("@href").extract_first("")
        url = urljoin(url)
        item = CompanyItem()
        item["from_web"] = self.name
        item["area"] = "shenzhen"
        item["name"] = name
        request = Request(url, callback=parse_company)
        request.meta["item"] = item
        yield request
    url = response.xpath("//a[text()='下一页']/@href").extract_first()
    if url:
        # The next page is fetched by re-posting the search form with an
        # incremented page number.
        form_data = parse_qs(response.request.body.decode())
        form_data = {k: v[0] for k, v in form_data.items()}
        form_data["page_change"] = "100"
        form_data["page_num"] = str(int(form_data.get("page_num", 1)) + 1)
        yield FormRequest(response.url, self.parse_company_name,
                          dont_filter=True, formdata=form_data)
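# The pagination above re-posts the search form with page_num bumped by
# one. The body round-trip can be exercised standalone; the field names
# match the code above, the keyword value is made up:

from urllib.parse import parse_qs, urlencode

body = "keyword=%E6%B7%B1%E5%9C%B3&page_num=3&page_change=100"
form = {k: v[0] for k, v in parse_qs(body).items()}  # flatten 1-element lists
form["page_num"] = str(int(form.get("page_num", 1)) + 1)
print(urlencode(form))  # keyword=%E6%B7%B1%E5%9C%B3&page_num=4&page_change=100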
def parse_company(self, response): item = CompanyItem() item["from_web"] = self.name item["from_url"] = response.url item["area"] = "shenzhen" item["name"] = response.xpath("//h1/text()").extract_first() item["company_form"] = response.xpath( "//span[text()='公司性质:']/../../td[2]/span/text()").extract_first() item["employee_scale"] = response.xpath( "//span[text()='公司规模:']/../../td[2]/span/text()").extract_first() item["industry"] = response.xpath( "//span[text()='公司行业:']/../../td[2]/span/text()").extract_first() item["address"] = response.xpath( "//span[text()='公司地址:']/../../td[2]/span/text()").extract_first() item["summary"] = self.text_join( response.xpath( "//div[@class='company-content']//text()").extract()) yield item
def parse(self, response): spider_name = self.name response_url = response.url name_exists_func = self.is_search_name_exists record_name_func = self.record_search_name parse_company = self.parse_company text_strip = self.text_strip sel_list = response.xpath("//div[@class='jobs-list-box']/div/a") if not sel_list: self.notice_change("No data found!!!!! " + response_url) for sel in sel_list: try: name = text_strip(sel.xpath("text()").extract_first()) except Exception: continue if name_exists_func(name): continue record_name_func(name) url = sel.xpath("@href").extract_first("") if "//special.zhaopin.com/" in url: item = CompanyItem() item["from_web"] = spider_name item["from_url"] = response_url item["area"] = "shenzhen" item["name"] = name yield item else: yield Request(url, callback=parse_company) url = response.xpath( "//div[contains(@class,'pageBar')]/span/a[@title='下一页']/@href" ).extract_first() if url: url = response.urljoin(url) yield Request(url, self.parse, dont_filter=True) else: yield Request(self.start_urls[0], self.parse, dont_filter=True)
def parse_company(self, response): item = CompanyItem() item["from_web"] = self.name item["from_url"] = response.url item["area"] = "shenzhen" text_join = self.text_join name = response.xpath( "//div[@class='companyname']/h1/text()").extract_first() if name: # 模板1 item["name"] = name item["summary"] = text_join( (text_join(sel.xpath(".//text()").extract()) for sel in response.xpath( "//div[contains(@class,'shopcontent')]/p")), "\n") try: info_dict = dict( info.split(":", maxsplit=1) for info in ( text_join(sel.xpath(".//text()").extract()) for sel in response.xpath( "//div[contains(@class,'contact')]/ul/li"))) item["address"] = info_dict.get("公司地址") item["telephone"] = info_dict.get("公司传真") except Exception: self.logger.exception("") yield item else: # 模板2 name = response.xpath( "//div[@id='companyname']/h1/a/text()").extract_first() if name: item["name"] = name request = Request(response.url.replace("/shop/", "/contact/"), self.parse_company_contact) request.meta["item"] = item yield request else: self.logger.error("Unknown template: " + response.url) return
def parse(self, response): spider_name = self.name response_url = response.url name_exists_func = self.is_search_name_exists record_name_func = self.record_search_name parse_company = self.parse_company text_strip = self.text_strip sel_list = response.xpath("//div[@class='com-list-2']/table/tr/td/a") if not sel_list: self.notice_change("No data found!!!!! " + response_url) for sel in sel_list: try: name = text_strip(sel.xpath("@title").extract_first()) except Exception: continue if name_exists_func(name): continue record_name_func(name) item = CompanyItem() item["from_web"] = spider_name item["area"] = "shenzhen" item["name"] = name url = sel.xpath("@href").extract_first("") request = Request(url, callback=parse_company) request.meta["item"] = item yield request url = response.xpath( "//ul[contains(@class,'pageLink')]//a[@class='next']/@href" ).extract_first() if url: url = response.urljoin(url) yield Request(url, self.parse, dont_filter=True) else: yield Request(self.start_urls[0], self.parse, dont_filter=True)
def parse_company_name(self, response): spider_name = self.name response_url = response.url name_exists_func = self.is_search_name_exists record_name_func = self.record_search_name parse_company = self.parse_company text_strip = self.text_strip sel_list = response.xpath("//form[@id='jubao']/dl/dt/h4/a") if not sel_list: self.notice_change("No data found!!!!! " + response_url) for sel in sel_list: try: name = text_strip(sel.xpath("text()").extract_first()) except Exception: continue if name_exists_func(name): continue record_name_func(name) item = CompanyItem() item["from_web"] = spider_name item["area"] = "shenzhen" item["name"] = name url = sel.xpath("@href").extract_first("") request = Request(url + "company_detail.html", callback=parse_company) request.meta["item"] = item yield request url = response.xpath( "//div[contains(@class,'page_tag')]/a[text()='下一页']/@href" ).extract_first() if url: yield Request(url, self.parse_company_name, dont_filter=True)
def parse_company(self, response):
    text = response.text
    if "过于频繁" in text or "<p>验证码:<input" in text:
        return self._too_often_handler(response)
    item = CompanyItem()
    item["from_web"] = self.name
    item["from_url"] = response.url
    item["area"] = "shenzhen"
    item["name"] = self.text_strip(
        response.xpath("//div[@class='headcont']//h1/text()").extract_first())
    text_join = self.text_join
    try:
        item["summary"] = text_join(
            response.xpath("//div[@class='hyinfo_detail_txt_files']"
                           "//text()").extract())
        info_dict = dict(
            info.split(":", maxsplit=1)
            for info in (
                text_join(sel.xpath(".//text()").extract())
                for sel in response.xpath(
                    "//li[@class='hyinfo_d_job_list_li']")))
        item["found_date"] = info_dict.get("成立日期")
        item["main_products"] = info_dict.get("主营产品")
        item["address"] = info_dict.get("公司注册地址")
        item["telephone"] = info_dict.get("电话") or info_dict.get("传真")
        item["mobile"] = info_dict.get("业务经理手机")
        item["registered_capital"] = info_dict.get("注册资金")
        item["employee_scale"] = info_dict.get("员工数量")
        item["legal_person"] = info_dict.get("法人")
        item["company_form"] = info_dict.get("公司类型")
    except Exception:
        self.logger.exception("")
    yield item
def parse(self, response): spider_name = self.name response_url = response.url name_exists_func = self.is_search_name_exists record_name_func = self.record_search_name text_strip = self.text_strip sel_list = response.xpath("//nobr/a/text()").extract() if not sel_list: self.notice_change("No data found!!!!! " + response_url) for name in sel_list: name = text_strip(name) if name_exists_func(name): continue record_name_func(name) item = CompanyItem() item["from_web"] = spider_name item["from_url"] = response_url item["area"] = "shenzhen" item["name"] = name yield item
def parse(self, response): spider_name = self.name response_url = response.url name_exists_func = self.is_search_name_exists record_name_func = self.record_search_name text_strip = self.text_strip sel_list = response.xpath("//div[contains(@class,'itemlist')]/h2/a") if not sel_list: self.notice_change("No data found!!!!! " + response_url) for sel in sel_list: try: name = text_strip(sel.xpath("text()").extract_first()) except Exception: continue if len(name) > 50: continue if name_exists_func(name): continue record_name_func(name) item = CompanyItem() item["from_web"] = spider_name item["from_url"] = response_url item["area"] = "shenzhen" item["name"] = name yield item url = response.xpath( "//div[@class='pagelist']/a[text()='后页']/@href").extract_first() if url: yield Request(url, self.parse, dont_filter=True) else: yield Request(self.start_urls[0], self.parse, dont_filter=True)