Exemplos de CleanWords em Python, exemplos de BigB2BSpider.data_tools.clean_worlds.CleanWords em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: bendiso.py Projeto: cogitozz/Big_Scrapy_Spider

class BenDiSouWangSpider(CrawlSpider):
    name = "bendiso"
    allowed_domains = ['www.bendiso.com', 'bendiso.com']
    start_urls = ['http://www.bendiso.com/gongsi/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"

    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
    }

    rules = (
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@id='main']//ul//li//div[@class='picture']//a")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@class='page']//a[@title='下一页']")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='nav']//a[contains(text(),'联系方式')]")),
             callback='parse_items',
             follow=True),
    )

    def parse_items(self, response):
        item = BenDiSouWangItem()
        pattern = re.compile(r'<meta name="keywords" content=".*?,(.*?)"/>',
                             re.S)
        item["company_Name"] = response.xpath(
            "//div[@id='header']//h1//a/text()").extract_first()
        item["company_address"] = response.xpath(
            "//div[@class='corp_boxinfo']//p[contains(text(),'地址：')]/text()"
        ).extract_first()
        item["linkman"] = response.xpath(
            "//p[contains(text(),'联系人：')]/text()").extract_first()
        item["telephone"] = response.xpath(
            "//p[contains(text(),'电话：')]/text()").extract_first()
        item["phone"] = response.xpath(
            "//p[contains(text(),'手机：')]/text()").extract_first()
        item["contact_Fax"] = response.xpath(
            "//p[contains(text(),'传真：')]/text()").extract_first()
        item["contact_QQ"] = response.xpath(
            "//p[contains(text(),'QQ：')]//img[@alt='点击这里给我发消息']/../@href"
        ).extract_first()
        item["E_Mail"] = response.xpath(
            "//p[contains(text(),'邮箱：')]/a/text()").extract_first()
        item["Source"] = response.url
        item["kind"] = ",".join(
            response.xpath(
                "//ul[@class='product_boxli']//li//div[@class='info']//a/@title"
            ).getall())
        city_infos = response.xpath(
            "//div[@class='corp_boxinfo']//p[contains(text(),'地址：')]/text()"
        ).get()

        if item["company_Name"] and item["company_Name"] != '':
            if "（" in item["company_Name"]:
                item["company_Name"] = item["company_Name"].split('（')[0]
            elif "(" in item["company_Name"]:
                item["company_Name"] = item["company_Name"].split('(')[0]
            elif "_" in item["company_Name"]:
                item["company_Name"] = item["company_Name"].split('_')[0]
            elif "-" in item["company_Name"]:
                item["company_Name"] = item["company_Name"].split('-')[0]
            else:
                item["company_Name"] = re.sub(r'\n|\s|\r|\t|公司名称：', '',
                                              item["company_Name"]).replace(
                                                  ' ', '').strip()
        else:
            return
        item["company_id"] = self.get_md5(item["company_Name"])

        if item["kind"]:
            item["kind"] = item["kind"].replace(" ", '|')
            item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营|主营项目：|', '', item["kind"]).replace('-', '|')\
                .replace('、', '|').replace(',', '|').replace('，', '|').replace(';','|').replace('.', '').strip()
        else:
            item["kind"] = ''

        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

        if item["linkman"]:
            item["linkman"] = item["linkman"].replace('未填写', '')
        else:
            item["linkman"] = ''
        item["linkman"] = self.cw.search_linkman(item["linkman"])

        if item["phone"]:
            item["phone"] = self.cw.search_phone_num(item["phone"])
        else:
            item["phone"] = ''

        if item["telephone"]:
            item["telephone"] = self.cw.search_telephone_num(item["telephone"])
        else:
            item["telephone"] = ''

        if item["contact_Fax"]:
            item["contact_Fax"] = self.cw.search_contact_Fax(
                item["contact_Fax"])
        else:
            item["contact_Fax"] = ''

        if item["E_Mail"]:
            item["E_Mail"] = self.cw.search_email(item["E_Mail"])
        else:
            item["E_Mail"] = ''

        if item["contact_QQ"]:
            item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
        else:
            item["contact_QQ"] = ''

        if item["company_address"]:
            item["company_address"] = self.cw.search_address(
                item["company_address"])
        else:
            item["company_address"] = ''

        if city_infos:
            if ' ' in city_infos:
                try:
                    city_infos = city_infos.replace("地址：", "")
                    item["province"] = city_infos.split(' ')[0]
                    item["city_name"] = city_infos.split(' ')[1]
                except:
                    item["province"] = ''
                    item["city_name"] = ''
            else:
                item["province"] = ''
                item["city_name"] = ''
        else:
            # pattern_p = re.compile(r'([\u4e00-\u9fa5]{2,5})省')
            # pattern_c = re.compile(r'[省]([\u4e00-\u9fa5]{2,5})市')

            item["province"] = ''
            item["city_name"] = ''

        yield item

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

Exemplo n.º 2

0

Exibir arquivo

class QiMaoWangSpider(CrawlSpider):
    name = 'jzjxqm'
    allowed_domains = ['www.jzjxqm.com']
    start_urls = ['http://www.jzjxqm.com/qiye/']
    cw = CleanWords()

    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'DEFAULT_REQUEST_HEADERS': {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control":
            "max-age=0",
            "Connection":
            "keep-alive",
            # "Cookie":"Hm_lvt_39b391b010992cf89654d83467db5db7=1564969344; Hm_lpvt_39b391b010992cf89654d83467db5db7=1564970833",
            # "Host":"www.mfqyw.com",
            # "Referer":"http://www.mfqyw.com/",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 543,
        }
    }

    def parse(self, response):
        a_list = response.xpath(
            "//div[@class='m']//div[@class='left_box']//div[@class='catalog']//td//a"
        )
        for a in a_list:
            kind_name = a.xpath("./text()").extract_first()
            kind_href = a.xpath("./@href").extract_first()
            if kind_name is None:
                kind_name = a.xpath("./strong/text()").extract_first()
            if kind_href:
                # print(kind_name,kind_href)
                yield scrapy.Request(url=kind_href,
                                     callback=self.parse_company_list,
                                     dont_filter=True)

    def parse_company_list(self, response):
        tr_list = response.xpath(
            "//div[@class='left_box']//div[@class='list']//table//tr")
        for tr in tr_list:
            item = QiMaoWangspiderItem()
            pattern = re.compile(r'\[(.*?)\/(.*?)\]', re.S)
            item["company_Name"] = tr.xpath(
                ".//li//a/strong/text()").extract_first()
            company_href = tr.xpath(".//li/a/@href").extract_first()
            item["kind"] = tr.xpath(
                ".//li[contains(text(),'主营：')]/text()").extract_first()
            city_infos = tr.xpath(
                ".//td[@class='f_orange']/text()").extract_first()
            if city_infos:
                # 广东/潮州市
                try:
                    item["province"] = re.findall(pattern, city_infos)[0][0]
                    item["city_name"] = re.findall(pattern, city_infos)[0][1]
                except:
                    item["province"] = ''
                    item["city_name"] = ''
            else:
                item["province"] = city_infos
                item["city_name"] = ''
            if company_href:
                # print(company_href)
                contact_href = company_href + "contact/"
                yield scrapy.Request(url=contact_href,
                                     callback=self.parse_company_contact,
                                     meta={"item": item},
                                     dont_filter=True)

        next_page_url = response.xpath(
            "//div[@class='pages']//a[contains(text(),'下一页»')]/@href"
        ).extract_first()
        if next_page_url:
            yield scrapy.Request(url=next_page_url,
                                 callback=self.parse_company_list)

    def parse_company_contact(self, response):
        headers = {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection":
            "keep-alive",
            "Cookie":
            "bdshare_firstime=1564976283795; BAIDU_SSP_lcr=https://www.baidu.com/link?url=6kZ0hzYVwwzEyL9fwlHs-4qX3qJG3iRU1NoWkSz4Thu&wd=&eqid=ae0f4b8300245afa000000065d50c031; Hm_lvt_c7894de9c1e0658a1d1ab0f838038a41=1565573377; UM_distinctid=16c8371ec762f7-08654ebee1a5d2-5a13331d-1fa400-16c8371ec77453; CNZZDATA4515303=cnzz_eid%3D439230478-1565569508-%26ntime%3D1565569508; Hm_lpvt_c7894de9c1e0658a1d1ab0f838038a41=1565573634",
            # "Host": "www.jzjxqm.com",
            "Referer":
            response.url,
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        }

        item = response.meta["item"]
        item["company_Name"] = response.xpath(
            "//td[contains(text(),'公司名称：')]/following-sibling::td/text()"
        ).extract_first()
        item["company_address"] = "".join(
            response.xpath(
                "//td[contains(text(),'公司地址：')]/following-sibling::td/text()").
            extract())
        item["linkman"] = response.xpath(
            "//td[contains(text(),'联 系 人：')]/following-sibling::td/text()"
        ).extract_first()
        item["telephone"] = response.xpath(
            "//td[contains(text(),'公司电话：')]/following-sibling::td/img/@src"
        ).extract_first()
        item["phone"] = response.xpath(
            "//td[contains(text(),'手机号码：')]/following-sibling::td/img/@src"
        ).extract_first()
        item["contact_Fax"] = response.xpath(
            "//td[contains(text(),'公司传真：')]/following-sibling::td/img/@src"
        ).extract_first()
        item["contact_QQ"] = response.xpath(
            "//img[@title='点击QQ交谈/留言']/../@href").extract_first()
        item["E_Mail"] = response.xpath(
            "//td[contains(text(),'电子邮件：')]/following-sibling::td/img/@src"
        ).extract_first()
        item["Source"] = response.url

        if item["company_Name"]:
            item["company_Name"] = re.sub(r'\n|\s|\r|\t|公司名称：', '',
                                          item["company_Name"]).replace(
                                              ' ', '').strip()
        item["company_id"] = self.get_md5(item["company_Name"])

        if item["kind"]:
            item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营', '', item["kind"]).replace('-', '|').replace('、', '|') \
                .replace('，', '|').replace('，', '|').replace('.', '').strip()
        else:
            item["kind"] = ''

        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

        if item["linkman"]:
            item["linkman"] = item["linkman"]
        else:
            item["linkman"] = ''
        item["linkman"] = self.cw.search_linkman(item["linkman"])

        if item["phone"]:
            item["phone"] = self.requests_href(item["phone"], headers)
            item["phone"] = self.cw.search_phone_num(item["phone"])
        else:
            item["phone"] = ''

        if item["telephone"]:
            item["telephone"] = self.requests_href(item["telephone"], headers)
            item["telephone"] = self.cw.search_telephone_num(item["telephone"])
        else:
            item["telephone"] = ''

        if item["contact_Fax"]:
            item["contact_Fax"] = self.requests_href(item["contact_Fax"],
                                                     headers)
            item["contact_Fax"] = self.cw.search_contact_Fax(
                item["contact_Fax"])
        else:
            item["contact_Fax"] = ''

        if item["E_Mail"]:
            item["E_Mail"] = self.requests_href(item["E_Mail"], headers)
            item["E_Mail"] = self.cw.search_email(item["E_Mail"])
        else:
            item["E_Mail"] = ''

        if item["contact_QQ"]:
            item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
        else:
            item["contact_QQ"] = ''

        if item["company_address"]:
            item["company_address"] = self.cw.search_address(
                item["company_address"])
        else:
            item["company_address"] = ''

        yield item

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

    def requests_href(self, url, headers):
        res = requests.get(url=url, headers=headers, timeout=20, verify=False)
        res.encoding = "utf-8"
        if res.status_code == requests.codes.ok:
            img = res.content
            something_img_file_path = r"F:\PythonProjects\venv\pythonProjects\BigB2BSpider\BigB2BSpider\img_src\something_img2\image.png"
            with open(something_img_file_path, "wb") as fp:
                fp.write(img)
            fp.close()
            if img:
                try:
                    something = recognition_image(something_img_file_path)
                    if something:
                        return something
                    else:
                        return ''
                except:
                    return ''
            else:
                return ''
        else:
            return ''

Exemplo n.º 3

0

Exibir arquivo

class ZhongGuoHuaDongHuaGongWangSpider(CrawlSpider):
    name = "nbchem"
    allowed_domains = ['www.nbchem.com','nbchem.com']
    start_urls = ['http://www.nbchem.com']
    cw = CleanWords()
    page_count = 0
    # redis_key = "ksb:start_urls"

    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'ITEM_PIPELINES': {'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302},
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
    }
    def parse(self, response):
        a_list = response.xpath("//div[@id='p2']//table[@align='center']//td[@valign='top']//div//a")
        for a in a_list:
            kind_name = a.xpath("./text()").get()
            kind_href = a.xpath("./@href").get()
            if kind_href:
                kind_href = "http://www.nbchem.com" + kind_href
                # print(kind_name,kind_href)
                # list2-306.html
                kind_num = kind_href.split("list2-")[-1].split(".html")[0]
                yield scrapy.Request(
                    url=kind_href,
                    callback=self.parse_company_list,
                    meta={"info":kind_num},
                    dont_filter=True
                )

    def parse_company_list(self, response):
        if "没有公司" in response.text:
            return

        pattern = re.compile(r'</span>\)主要产品：(.*?)\s*</div>',re.S)
        kinds = re.findall(pattern,response.text) if re.findall(pattern,response.text) else ''
        td_list = response.xpath("//div[@class='page']//table[3]//td[@valign='top']//div//table[@width='100%']//td")
        for td in td_list:
            item = ZhongGuoHuaDongHuaGongWangItem()
            item["kind"] = "".join(td.xpath(".//div[2]//text()").getall()).strip()
            contact_href = td.xpath(".//div[4]/a/@href").get()
            if contact_href and "dtcon-" in contact_href:
                # http://www.nbchem.com/bc/dtcon-15691.html
                contact_href = "http://www.nbchem.com/bc/" + contact_href
                # print(contact_href)
                yield scrapy.Request(
                    url=contact_href,
                    callback=self.parse_company_contact,
                    meta={"item": item},
                    dont_filter=True
                )
        kind_num = response.meta.get("info")
        if kind_num:
            total_page = response.xpath("//div[@class='nbchem']//span[@id='ctl00_ContentPlaceHolder1_PageNav1_labPage']/text()").get()
            if total_page:
                total_page_num = total_page.split('/')[-1]
                if self.page_count < int(total_page_num):
                    self.page_count += 1
                    next_page_url = "http://www.nbchem.com/bc/list2-{}-0-0-0-{}.html".format(kind_num,self.page_count)
                    print(next_page_url)
                    # http://www.nbchem.com/bc/list2-435-0-0-0-1.html
                    yield scrapy.Request(
                        url=next_page_url,
                        callback=self.parse_company_list
                    )


    def parse_company_contact(self, response):
        item = response.meta["item"]
        if "contact.aspx" in response.url:
            item["company_Name"] = response.xpath(
                "//td[contains(text(),'公司名称：')]/following-sibling::td/strong/text()").extract_first()
            item["company_address"] = response.xpath(
                "//td[contains(text(),'地址：')]/following-sibling::td/text()").extract_first()
            item["linkman"] = response.xpath(
                "//td[contains(text(),'联系人：')]/following-sibling::td/text()").extract_first()
            item["telephone"] = response.xpath(
                "//td[contains(text(),'电话：')]/following-sibling::td/text()").extract_first()
            item["phone"] = response.xpath(
                "//td[contains(text(),'手机：')]/following-sibling::td/text()").extract_first()
            item["contact_Fax"] = response.xpath(
                "//td[contains(text(),'传真：')]/following-sibling::td/text()").extract_first()
            item["contact_QQ"] = response.xpath(
                "//td[contains(text(),'QQ：')]/following-sibling::td/a/text()").extract_first()
            item["E_Mail"] = response.xpath(
                "//td[contains(text(),'Email：')]/following-sibling::td/a/text()").extract_first()
            item["Source"] = response.url
            item["kind"] = item["kind"]
            city_infos = response.xpath("//dt[contains(text(),'所在地区：')]/following-sibling::dd/text()").get()

            if item["company_Name"] and item["company_Name"] != '':
                if "（" in item["company_Name"]:
                    item["company_Name"] = item["company_Name"].split('（')[0]
                elif "(" in item["company_Name"]:
                    item["company_Name"] = item["company_Name"].split('(')[0]
                elif "_" in item["company_Name"]:
                    item["company_Name"] = item["company_Name"].split('_')[0]
                elif "-" in item["company_Name"]:
                    item["company_Name"] = item["company_Name"].split('-')[0]
                else:
                    item["company_Name"] = re.sub(r'\n|\s|\r|\t|公司名称：', '', item["company_Name"]).replace(' ',
                                                                                                          '').strip()
            else:
                return
            item["company_id"] = self.get_md5(item["company_Name"])

            if item["kind"]:
                item["kind"] = item["kind"].replace(" ", '|')
                item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营|主营项目：', '', item["kind"]).replace('-', '|') \
                    .replace('、', '|').replace(',', '|').replace('，', '|').replace(';', '|').replace('.', '').strip()
            else:
                item["kind"] = ''

            item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

            if item["linkman"]:
                item["linkman"] = item["linkman"].replace('未填写', '')
            else:
                item["linkman"] = ''
            item["linkman"] = self.cw.search_linkman(item["linkman"])

            if item["phone"]:
                item["phone"] = self.cw.search_phone_num(item["phone"])
            else:
                item["phone"] = ''

            if item["telephone"]:
                item["telephone"] = self.cw.search_telephone_num(item["telephone"])
            else:
                item["telephone"] = ''

            if item["contact_Fax"]:
                item["contact_Fax"] = self.cw.search_contact_Fax(item["contact_Fax"])
            else:
                item["contact_Fax"] = ''

            if item["E_Mail"]:
                item["E_Mail"] = self.cw.search_email(item["E_Mail"])
            else:
                item["E_Mail"] = ''

            if item["contact_QQ"]:
                item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
            else:
                item["contact_QQ"] = ''

            if item["company_address"]:
                item["company_address"] = self.cw.search_address(item["company_address"])
            else:
                item["company_address"] = ''

            if city_infos:
                if '/' in city_infos:
                    try:
                        item["province"] = city_infos.split('/')[0]
                        item["city_name"] = city_infos.split('/')[1]
                    except:
                        item["province"] = ''
                        item["city_name"] = ''
                else:
                    item["province"] = ''
                    item["city_name"] = ''
            else:
                item["province"] = ''
                item["city_name"] = ''

        else:
            pattern = re.compile(r'<meta name="keywords" content=".*?,(.*?)"/>',re.S)
            item["company_Name"] = response.xpath("//div[@class='ptitle']/span/text()").extract_first()
            item["company_address"] = response.xpath("//td[contains(text(),'地　　址：')]/following-sibling::td/span/text()").extract_first()
            item["linkman"] = response.xpath("//td[contains(text(),'联 系 人：')]/following-sibling::td/span/text()").extract_first()
            item["telephone"] = response.xpath("//td[contains(text(),'电　　话：')]/following-sibling::td/span/text()").extract_first()
            item["phone"] = response.xpath("//td[contains(text(),'移动电话：')]/following-sibling::td/span/text()").extract_first()
            item["contact_Fax"] = response.xpath("//td[contains(text(),'传　　真：')]/following-sibling::td/span/text()").extract_first()
            item["contact_QQ"] = response.xpath("//td[contains(text(),'QQ：')]/following-sibling::td/span/text()").extract_first()
            item["E_Mail"] = response.xpath("//td[contains(text(),'电子邮件：')]/following-sibling::td/span/text()").extract_first()
            item["Source"] = response.url
            item["kind"] = item["kind"]
            city_infos = response.xpath("//dt[contains(text(),'所在地区：')]/following-sibling::dd/text()").get()


            if item["company_Name"] and item["company_Name"] != '':
                if "（" in item["company_Name"]:
                    item["company_Name"] = item["company_Name"].split('（')[0]
                elif "(" in item["company_Name"]:
                    item["company_Name"] = item["company_Name"].split('(')[0]
                elif "_" in item["company_Name"]:
                    item["company_Name"] = item["company_Name"].split('_')[0]
                elif "-" in item["company_Name"]:
                    item["company_Name"] = item["company_Name"].split('-')[0]
                else:
                    item["company_Name"] = re.sub(r'\n|\s|\r|\t|公司名称：', '', item["company_Name"]).replace(' ', '').strip()
            else:
                return
            item["company_id"] = self.get_md5(item["company_Name"])

            if item["kind"]:
                item["kind"] = item["kind"].replace(" ", '|')
                item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营|主营项目：', '', item["kind"]).replace('-', '|')\
                    .replace('、', '|').replace(',', '|').replace('，', '|').replace(';','|').replace('.', '').strip()
            else:
                item["kind"] = ''

            item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

            if item["linkman"]:
                item["linkman"] = item["linkman"].replace('未填写','')
            else:
                item["linkman"] = ''
            item["linkman"] = self.cw.search_linkman(item["linkman"])

            if item["phone"]:
                item["phone"] = self.cw.search_phone_num(item["phone"])
            else:
                item["phone"] = ''

            if item["telephone"]:
                item["telephone"] = self.cw.search_telephone_num(item["telephone"])
            else:
                item["telephone"] = ''

            if item["contact_Fax"]:
                item["contact_Fax"] = self.cw.search_contact_Fax(item["contact_Fax"])
            else:
                item["contact_Fax"] = ''

            if item["E_Mail"]:
                item["E_Mail"] = self.cw.search_email(item["E_Mail"])
            else:
                item["E_Mail"] = ''

            if item["contact_QQ"]:
                item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
            else:
                item["contact_QQ"] = ''

            if item["company_address"]:
                item["company_address"] = self.cw.search_address(item["company_address"])
            else:
                item["company_address"] = ''

            if city_infos:
                if '/' in city_infos:
                    try:
                        item["province"] = city_infos.split('/')[0]
                        item["city_name"] = city_infos.split('/')[1]
                    except:
                        item["province"] = ''
                        item["city_name"] = ''
                else:
                    item["province"] = ''
                    item["city_name"] = ''
            else:
                item["province"] = ''
                item["city_name"] = ''

            yield item

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

Exemplo n.º 4

0

Exibir arquivo

Arquivo: qdw.py Projeto: cogitozz/Big_Scrapy_Spider

class QiDuoWangSpider(CrawlSpider):
    name = "qdw"
    allowed_domains = ['www.qiduowang.com']
    start_urls = ['http://www.qiduowang.com/company/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"

    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
    }
    # /c3847/p2/
    rules = (
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@class='left_box']//table//tr//a")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='list']//td[@align='left']//li//a")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='pages']//a[contains(text(),'下一页»')]")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='menu']//span[contains(text(),'联系方式')]/..")),
             callback='parse_items',
             follow=True),
    )

    def parse_items(self, response):
        headers = {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control":
            "max-age=0",
            "Connection":
            "keep-alive",
            "Cookie":
            "safedog-flow-item=; __51cke__=; __tins__915545=%7B%22sid%22%3A%201567480167459%2C%20%22vd%22%3A%201%2C%20%22expires%22%3A%201567481967459%7D; __51laig__=9",
            "Host":
            "www.qiduowang.com",
            "Referer":
            response.url,
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        }
        item = QiDuoWangItem()
        item["company_Name"] = response.xpath(
            "//td[contains(text(),'公司名称：')]/following-sibling::td/text()"
        ).extract_first()
        item["company_address"] = response.xpath(
            "//td[contains(text(),'公司地址：')]/following-sibling::td/text()"
        ).extract_first()
        item["linkman"] = response.xpath(
            "//td[contains(text(),'联 系 人：')]/following-sibling::td/text()"
        ).extract_first()
        item["telephone"] = response.xpath(
            "//td[contains(text(),'公司电话：')]/following-sibling::td/img/@src"
        ).extract_first()
        item["phone"] = response.xpath(
            "//td[contains(text(),'手机号码：')]/following-sibling::td/img/@src"
        ).extract_first()
        item["contact_Fax"] = response.xpath(
            "//td[contains(text(),'公司传真：')]/following-sibling::td/img/@src"
        ).extract_first()
        item["contact_QQ"] = response.xpath(
            "//img[@title='点击QQ交谈/留言']/../@href").extract_first()
        item["E_Mail"] = response.xpath(
            "//td[contains(text(),'电子邮件：')]/following-sibling::td/img/@src"
        ).extract_first()
        item["Source"] = response.url
        item["kind"] = ",".join(
            response.xpath("//div[@class='head']//h4/text()").getall())
        city_infos = response.xpath(
            "//td[contains(text(),'所在地区：')]/following-sibling::td/text()").get(
            )

        if item["company_Name"]:
            if "（" in item["company_Name"]:
                item["company_Name"] = item["company_Name"].split('（')[0]
            elif "(" in item["company_Name"]:
                item["company_Name"] = item["company_Name"].split('(')[0]
            elif "_" in item["company_Name"]:
                item["company_Name"] = item["company_Name"].split('_')[0]
            elif "-" in item["company_Name"]:
                item["company_Name"] = item["company_Name"].split('-')[0]
            else:
                item["company_Name"] = re.sub(r'\n|\s|\r|\t|公司名称：', '',
                                              item["company_Name"]).replace(
                                                  ' ', '').strip()
        item["company_id"] = self.get_md5(item["company_Name"])

        if item["kind"]:
            item["kind"] = item["kind"].replace(" ", '|')
            item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营|主营项目：', '', item["kind"]).replace('-', '|')\
                .replace('、', '|').replace(',', '|').replace('，', '|').replace(';','|').replace('.', '').strip()
        else:
            item["kind"] = ''

        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

        if item["linkman"]:
            item["linkman"] = item["linkman"]
        else:
            item["linkman"] = ''
        item["linkman"] = self.cw.search_linkman(item["linkman"])

        if item["phone"]:
            item["phone"] = self.requests_href(item["phone"], headers)
            item["phone"] = self.cw.search_phone_num(item["phone"])
        else:
            item["phone"] = ''

        if item["telephone"]:
            item["telephone"] = self.requests_href(item["telephone"], headers)
            item["telephone"] = self.cw.search_telephone_num(item["telephone"])
        else:
            item["telephone"] = ''

        if item["contact_Fax"]:
            item["contact_Fax"] = self.requests_href(item["contact_Fax"],
                                                     headers)
            item["contact_Fax"] = self.cw.search_contact_Fax(
                item["contact_Fax"])
        else:
            item["contact_Fax"] = ''

        if item["E_Mail"]:
            item["E_Mail"] = self.requests_href(item["E_Mail"], headers)
            item["E_Mail"] = self.cw.search_email(item["E_Mail"])
        else:
            item["E_Mail"] = ''

        if item["contact_QQ"]:
            item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
        else:
            item["contact_QQ"] = ''

        if item["company_address"]:
            item["company_address"] = self.cw.search_address(
                item["company_address"])
        else:
            item["company_address"] = ''

        if city_infos:
            if '/' in city_infos:
                try:
                    item["province"] = city_infos.split('/')[0]
                    item["city_name"] = city_infos.split('/')[1]
                except:
                    item["province"] = ''
                    item["city_name"] = ''
            else:
                item["province"] = ''
                item["city_name"] = ''
        else:
            item["province"] = ''
            item["city_name"] = ''

        yield item

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

    def requests_href(self, url, headers):
        res = requests.get(url=url, headers=headers, timeout=10, verify=False)
        res.encoding = "utf-8"
        if res.status_code == requests.codes.ok:
            img = res.content
            something_img_file_path = r"F:\PythonProjects\venv\pythonProjects\BigB2BSpider\BigB2BSpider\img_src\something_img3\image.png"
            with open(something_img_file_path, "wb") as fp:
                fp.write(img)
            fp.close()
            if img:
                try:
                    something = recognition_image(something_img_file_path)
                    if something:
                        return something
                    else:
                        return ''
                except:
                    return ''
            else:
                return ''
        else:
            return ''

Exemplo n.º 5

0

Exibir arquivo

class WuJinShangJiWangSpider(CrawlSpider):
    name = "chinawj"
    allowed_domains = ['www.chinawj.com.cn', 'chinawj.com.cn']
    start_urls = ['http://www.chinawj.com.cn/qiye/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"

    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
    }
    # /c3847/p2/
    rules = (
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@class='proTypeList']//li//a")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=
            ("//div[@class='leftbox']//div[@class='pr0']//div[@class='pr2']//li[1]//a"
             )),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//p[@id='page']//a[contains(text(),'下一页')]")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='menu']//a[contains(text(),'联系方式')]")),
             callback='parse_items',
             follow=True),
    )

    def parse_items(self, response):
        item = WuJinShangJiWangItem()
        # pattern = re.compile(r'<meta name="keywords" content="(.*?),.*?" >', re.S)
        pattern1 = re.compile(r'联系人：(.*?) <', re.S)
        pattern2 = re.compile(r'>\s*地址：(.*?)<', re.S)
        pattern3 = re.compile(r'>\s*电话：(.*?)<', re.S)
        pattern4 = re.compile(r'>\s*手机：(.*?)<', re.S)
        pattern5 = re.compile(r'>\s*传真：(.*?)<', re.S)
        pattern6 = re.compile(r'>\s*邮箱：(.*?)<', re.S)
        # pattern7 = re.compile(r'<em>QQ：</em>(.*?)<br />', re.S)
        pattern8 = re.compile(r'<li>所在地区：(.*?)</li>', re.S)
        pattern9 = re.compile(r'>主营：(.*?)<', re.S)
        item["company_Name"] = response.xpath(
            "//div[@class='head']//h1/text()").extract_first()
        item["company_address"] = "".join(
            re.findall(pattern2, response.text)) if re.findall(
                pattern2, response.text) else ''
        item["linkman"] = "".join(
            re.findall(pattern1, response.text)) if re.findall(
                pattern1, response.text) else ''
        item["telephone"] = "".join(
            re.findall(pattern3, response.text)[0]) if re.findall(
                pattern3, response.text) else ''
        item["phone"] = "".join(re.findall(pattern4,
                                           response.text)[0]) if re.findall(
                                               pattern4, response.text) else ''
        item["contact_Fax"] = "".join(
            re.findall(pattern5, response.text)[0]) if re.findall(
                pattern5, response.text) else ''
        item["contact_QQ"] = response.xpath(
            "//img[@alt='跟我QQ洽谈']/../@href").get()
        item["E_Mail"] = "".join(
            re.findall(pattern6, response.text)) if re.findall(
                pattern6, response.text) else ''
        item["Source"] = response.url
        item["kind"] = "|".join(
            response.xpath("//div[@class='head']//h4//a//text()").extract())
        city_infos = ",".join(re.findall(pattern8,
                                         response.text)) if re.findall(
                                             pattern8, response.text) else ''

        if item["company_Name"] and item["company_Name"] != '':
            if "（" in item["company_Name"]:
                item["company_Name"] = item["company_Name"].split('（')[0]
            elif "(" in item["company_Name"]:
                item["company_Name"] = item["company_Name"].split('(')[0]
            elif "_" in item["company_Name"]:
                item["company_Name"] = item["company_Name"].split('_')[0]
            elif "-" in item["company_Name"]:
                item["company_Name"] = item["company_Name"].split('-')[0]
            else:
                item["company_Name"] = re.sub(r'\n|\s|\r|\t|公司名称：', '',
                                              item["company_Name"]).replace(
                                                  ' ', '').strip()
        else:
            return
        item["company_id"] = self.get_md5(item["company_Name"])

        if item["kind"]:
            item["kind"] = item["kind"].replace(" ", '|')
            item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营|主营项目：', '', item["kind"]).replace('-', '|')\
                .replace('、', '|').replace(',', '|').replace('，', '|').replace(';','|').replace('.', '').strip()
        else:
            item["kind"] = ''

        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

        if item["linkman"]:
            item["linkman"] = item["linkman"].replace('未填写', '')
        else:
            item["linkman"] = ''
        item["linkman"] = self.cw.search_linkman(item["linkman"])

        if item["phone"]:
            item["phone"] = self.cw.search_phone_num(item["phone"])
        else:
            item["phone"] = ''

        if item["telephone"]:
            item["telephone"] = self.cw.search_telephone_num(item["telephone"])
        else:
            item["telephone"] = ''

        if item["contact_Fax"]:
            item["contact_Fax"] = self.cw.search_contact_Fax(
                item["contact_Fax"])
        else:
            item["contact_Fax"] = ''

        if item["E_Mail"]:
            item["E_Mail"] = self.cw.search_email(item["E_Mail"])
        else:
            item["E_Mail"] = ''

        if item["contact_QQ"]:
            item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
        else:
            item["contact_QQ"] = ''

        if item["company_address"]:
            item["company_address"] = self.cw.search_address(
                item["company_address"])
        else:
            item["company_address"] = ''

        if city_infos:
            if '/' in city_infos:
                try:
                    item["province"] = city_infos.split('/')[0]
                    item["city_name"] = city_infos.split('/')[1]
                except:
                    item["province"] = ''
                    item["city_name"] = ''
            else:
                item["province"] = ''
                item["city_name"] = ''
        else:
            item["province"] = ''
            item["city_name"] = ''

        yield item

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

Exemplo n.º 6

0

Exibir arquivo

Arquivo: skjcsc.py Projeto: cogitozz/Big_Scrapy_Spider

class ShuKongJiChuanShiChangWangSpider(CrawlSpider):
    name = "skjcsc"
    allowed_domains = ['www.skjcsc.com','skjcsc.com']
    start_urls = ['http://www.skjcsc.com/enterprisefront/enterpriseFrontAction.action']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"

    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
        'ITEM_PIPELINES': {'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302},
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
    }
    # /c3847/p2/
    rules = (
        Rule(LinkExtractor(
            allow=r".*",restrict_xpaths=("///div[@id='newproductlist']//tr//div[@id='newp-name']//a")),callback="parse_items",follow=True),

        Rule(LinkExtractor(
            allow=r".*", restrict_xpaths=("//a[contains(text(),'下一页')]")), follow=True),
    )

    def parse_items(self, response):
        item = ShuKongJiChuanShiChangWangItem()
        pattern = re.compile(r'<meta name="keywords" content=".*?,(.*?)"/>',re.S)
        pattern_qq  = re.compile(r'(\d+)@qq.com',re.S)
        item["company_Name"] = response.xpath("//td[contains(text(),'公司名称：')]/following-sibling::td/text()").extract_first()
        item["company_address"] = response.xpath("//td[contains(text(),'详细地址：')]/following-sibling::td/text()").extract_first()
        item["linkman"] = response.xpath("//td[contains(text(),'联 系 人：')]/following-sibling::td/text()").extract_first()
        item["telephone"] = response.xpath("///td[contains(text(),'电　　话：')]/following-sibling::td/text()").extract_first()
        item["phone"] = response.xpath("//td[contains(text(),'手　　机：')]/following-sibling::td/text()").extract_first()
        item["contact_Fax"] = response.xpath("//td[contains(text(),'传　　真：')]/following-sibling::td/text()").extract_first()
        item["contact_QQ"] = "".join(re.findall(pattern_qq,response.text)) if re.findall(pattern_qq,response.text) else ''
        item["E_Mail"] = response.xpath("//td[contains(text(),'邮　　箱：')]/following-sibling::td/text()").extract_first()
        item["Source"] = response.url
        item["kind"] = ",".join(response.xpath("//div[@class='box_bg']//ul//li//a//text()").getall())
        city_infos = response.xpath("//td[contains(text(),'详细地址：')]/following-sibling::td/text()").extract_first()


        if item["company_Name"]:
            item["company_Name"] = self.cw.search_company(item["company_Name"])
        else:
            item["company_Name"] = ''
            return
        item["company_id"] = self.get_md5(item["company_Name"])

        if item["kind"]:
            item["kind"] = item["kind"].replace(" ", '|')
            item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营|主营项目：', '', item["kind"]).replace('-', '|')\
                .replace('、', '|').replace(',', '|').replace('，', '|').replace(';','|').replace('.', '').strip()
        else:
            item["kind"] = ''

        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

        if item["linkman"]:
            item["linkman"] = self.cw.search_linkman(item["linkman"])
        else:
            item["linkman"] = ''
        item["linkman"] = self.cw.search_linkman(item["linkman"])

        if item["phone"]:
            item["phone"] = self.cw.search_phone_num(item["phone"])
        else:
            item["phone"] = ''

        if item["telephone"]:
            item["telephone"] = self.cw.search_telephone_num(item["telephone"])
        else:
            item["telephone"] = ''

        if item["contact_Fax"]:
            item["contact_Fax"] = self.cw.search_contact_Fax(item["contact_Fax"])
        else:
            item["contact_Fax"] = ''

        if item["E_Mail"]:
            item["E_Mail"] = self.cw.search_email(item["E_Mail"])
        else:
            item["E_Mail"] = ''

        if item["contact_QQ"]:
            item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
        else:
            item["contact_QQ"] = ''

        if item["company_address"]:
            item["company_address"] = item["company_address"].replace("联系地址：","")
            item["company_address"] = self.cw.search_address(item["company_address"])
        else:
            item["company_address"] = ''

        if city_infos:
            if '/' in city_infos:
                try:
                    item["province"] = city_infos.split('/')[0]
                    item["city_name"] = city_infos.split('/')[1]
                except:
                    item["province"] = ''
                    item["city_name"] = ''
            else:
                item["province"] = ''
                item["city_name"] = ''
        else:
            item["province"] = ''
            item["city_name"] = ''

        yield item

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

Exemplo n.º 7

0

Exibir arquivo

Arquivo: rdgc.py Projeto: cogitozz/Big_Scrapy_Spider

class RouDianGongChengSpider(CrawlSpider):
    name = 'rdgc'
    allowed_domains = ['rdzjw.com', 'www.rdzjw.com']
    start_urls = ['http://www.rdzjw.com/company/']
    cw = CleanWords()

    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'DEFAULT_REQUEST_HEADERS': {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate, br",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control":
            "max-age=0",
            "Connection":
            "keep-alive",
            # "Cookie": "Hm_lvt_081b84205c0f16480d7a8964a70f6b6b=1565055009; BAIDU_SSP_lcr=http://hao.huangye88.com/b2b_42621.html; Hm_lpvt_081b84205c0f16480d7a8964a70f6b6b=1565055034",
            # "Host": "www.fashangji.com",
            # "Referer": "https://www.fashangji.com/",
            # "Sec-Fetch-Mode": "navigate",
            # "Sec-Fetch-Site": "none",
            # "Sec-Fetch-User": "******",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 543,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 543,
        }
    }

    def parse(self, response):
        a_list = response.xpath(
            "//div[@class='m']//div[@class='left_box']//td//a")
        for a in a_list:
            kind_name = a.xpath("./text()").extract_first()
            kind_href = a.xpath("./@href").extract_first()
            if kind_name is None:
                kind_name = a.xpath("./strong/text()").extract_first()
            if kind_href:
                print(kind_name, kind_href)
                yield scrapy.Request(url=kind_href,
                                     callback=self.parse_company_list,
                                     dont_filter=True)

    def parse_company_list(self, response):
        div_list = response.xpath(
            "//div[@class='m']//div[@class='list']//table//tr")
        for div in div_list:
            item = RouDianGongChengspiderItem()
            # pattern = re.compile(r'\[\(.*?\)\/\(.*?\)\]', re.S)
            item["company_Name"] = div.xpath(
                ".//td[@align='left']//li/a/strong/text()").extract_first()
            company_href = div.xpath(
                ".//td[@align='left']//li/a/@href").extract_first()
            item["kind"] = div.xpath(
                ".//td[@align='left']//li[contains(text(),'主营：')]/text()"
            ).extract_first()
            city_infos = "".join(
                div.xpath("//td[@align='left']/following-sibling::td/text()").
                extract_first())

            if city_infos and "/" in city_infos:
                # 广东/潮州市
                try:
                    item["province"] = city_infos.replace("[", '').replace(
                        "]", '').split('/')[0]
                    item["city_name"] = city_infos.replace("[", '').replace(
                        "]", '').split('/')[-1]
                except:
                    item["province"] = ''
                    item["city_name"] = ''
            else:
                item["province"] = city_infos.replace("[", '').replace("]", '')
                item["city_name"] = ''
            if company_href:
                # print(company_href)
                contact_href = company_href + "contact/"
                yield scrapy.Request(url=contact_href,
                                     callback=self.parse_company_contact,
                                     meta={"item": item},
                                     dont_filter=True)

        next_page_url = response.xpath(
            "//div[@class='pages']//a[contains(text(),'下一页»')]/@href"
        ).extract_first()
        if next_page_url:
            yield scrapy.Request(url=next_page_url,
                                 callback=self.parse_company_list)

    def parse_company_contact(self, response):
        headers = {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control":
            "max-age=0",
            "Connection":
            "keep-alive",
            "Cookie":
            "URnU_6599_saltkey=ak6gp656; URnU_6599_lastvisit=1565058750; BAIDU_SSP_lcr=https://www.baidu.com/link?url=cb-lCyEOCCLgFwHPL2dAanIBMLf8DvDyItiG2Ov4tBa&wd=&eqid=9f23e6d6000403b3000000065d48f4ac; UM_distinctid=16c64fc50e79b-0e9495f2b9b774-5a13331d-1fa400-16c64fc50e868; CNZZDATA1254919241=1514860467-1565058564-null%7C1565058564; URnU_6599_lastact=1565062639%09api.php%09js",
            "Host":
            "www.rdzjw.com",
            "Referer":
            response.url,
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        }
        item = response.meta["item"]
        item["company_Name"] = response.xpath(
            "//td[contains(text(),'公司名称：')]/following-sibling::td/text()"
        ).extract_first()
        # item["company_id"] = md5(item["company_Name"].encode()).hexdigest()
        # item["kind"] = response.xpath("//div[@class='head']/h4/text()").extract_first()
        item["company_address"] = "".join(
            response.xpath(
                "//td[contains(text(),'公司地址：')]/following-sibling::td/text()").
            extract())
        item["linkman"] = "".join(
            response.xpath(
                "//td[contains(text(),'联 系 人：')]/following-sibling::td/text()"
            ).extract())
        item["telephone"] = "".join(
            response.xpath(
                "//td[contains(text(),'公司电话：')]/following-sibling::td/img/@src"
            ).extract())
        item["phone"] = "".join(
            response.xpath(
                "//td[contains(text(),'手机号码：')]/following-sibling::td/img/@src"
            ).extract())
        item["contact_Fax"] = "".join(
            response.xpath(
                "//td[contains(text(),'公司传真：')]/following-sibling::td/text()").
            extract())
        item["contact_QQ"] = "".join(
            response.xpath("//img[@title='点击QQ交谈/留言']/../@href").extract())
        item["E_Mail"] = "".join(
            response.xpath(
                "//td[contains(text(),'电子邮件：')]/following-sibling::td/img/@src"
            ).extract())
        item["Source"] = response.url

        if item["company_Name"]:
            item["company_Name"] = re.sub(r'\n|\s|\r|\t|公司名称：', '',
                                          item["company_Name"]).replace(
                                              ' ', '').strip()
        item["company_id"] = self.get_md5(item["company_Name"])

        if item["kind"]:
            item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营', '', item["kind"]).replace('-', '|').replace('、', '|') \
                .replace('，', '|').replace('，', '|').replace('.', '').strip()
        else:
            item["kind"] = ''

        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

        if item["linkman"]:
            item["linkman"] = item["linkman"]
        else:
            item["linkman"] = ''
        item["linkman"] = self.cw.search_linkman(item["linkman"])

        if item["phone"]:
            item["phone"] = self.requests_href(item["phone"], headers)
            item["phone"] = self.cw.search_phone_num(item["phone"])
        else:
            item["phone"] = ''

        if item["telephone"]:
            item["telephone"] = self.requests_href(item["telephone"], headers)
            item["telephone"] = self.cw.search_telephone_num(item["telephone"])
        else:
            item["telephone"] = ''

        if item["contact_Fax"]:
            item["contact_Fax"] = self.requests_href(item["contact_Fax"],
                                                     headers)
            item["contact_Fax"] = self.cw.search_contact_Fax(
                item["contact_Fax"])
        else:
            item["contact_Fax"] = ''

        if item["E_Mail"]:
            item["E_Mail"] = self.requests_href(item["E_Mail"], headers)
            if item["E_Mail"]:
                item["E_Mail"] = item["E_Mail"].replace("e", "@").replace(
                    "8126", "@126").replace("8163", "@163").strip()
            # item["E_Mail"] = self.cw.search_email(item["E_Mail"])
        else:
            item["E_Mail"] = ''

        if item["contact_QQ"]:
            item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
        else:
            item["contact_QQ"] = ''

        if item["company_address"]:
            item["company_address"] = self.cw.search_address(
                item["company_address"])
        else:
            item["company_address"] = ''

        yield item

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

    def requests_href(self, url, headers):
        res = requests.get(url=url, headers=headers, timeout=20, verify=False)
        res.encoding = "utf-8"
        if res.status_code == requests.codes.ok:
            img = res.content
            something_img_file_path = r"F:\PythonProjects\venv\pythonProjects\BigB2BSpider\BigB2BSpider\img_src\something_img2\image.png"
            with open(something_img_file_path, "wb") as fp:
                fp.write(img)
            fp.close()
            if img:
                try:
                    something = recognition_image(something_img_file_path)
                    if something:
                        return something
                    else:
                        return ''
                except:
                    return ''
            else:
                return ''
        else:
            return ''

Exemplo n.º 8

0

Exibir arquivo

class GouLianZiYuanWangSpider(CrawlSpider):
    name = "ibicn"
    # allowed_domains = ['www.53info.com','qineng1688.53info.com']
    start_urls = ['https://shangji.ibicn.com/gongsi/s_i_d_t_l_k/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"

    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            # "Connection": "keep-alive",
            # "Cookie": "Hm_lvt_dd0c9f5bb6bab19ccc2b13c4ec58552a=1566177749; CCKF_visitor_id_92126=1219631166; yunsuo_session_verify=db1a03528b7dfe197918cf533946c447; bdshare_firstime=1566178685689; Hm_lpvt_dd0c9f5bb6bab19ccc2b13c4ec58552a=1566178686",
            # "Host": "jamesni139.tybaba.com",
            # "Referer": "http://jamesni139.tybaba.com/",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
        # 不验证SSL证书
        "DOWNLOAD_HANDLERS_BASE": {
            'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
            'http': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
            'https':
            'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
            's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
        },
        # "DOWNLOAD_HANDLERS": {
        #     'https': 'BigB2BSpider.custom.downloader.handler.https.HttpsDownloaderIgnoreCNError'},
    }
    # /c3847/p2/
    rules = (
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@class='pull-left right']//li//a")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=
            ("//div[@id='main']//li[@class='item ']//div[@class='pull-left left text-ellipsis']//a"
             )),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@id='pages']//a[contains(text(),'下一页')]")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@id='nav']//a[contains(text(),'联系我们')]")),
             callback='parse_items',
             follow=False),
    )

    def parse_items(self, response):
        pattern = re.compile(r'主要经营(.*?)\。', re.S)
        pattern1 = re.compile(r'是[一家](.*?)的[高新|技术|企业|公司]\。', re.S)
        pattern2 = re.compile(r'>地址：(.*?)<', re.S)
        pattern3 = re.compile(r'<span>电 话：</span><b>(.*?)\s*</b>', re.S)
        pattern4 = re.compile(r'<span>手 机：</span><b>(.*?)\s*</b>', re.S)
        pattern5 = re.compile(r'<span>联系人：</span> <b>(.*?)\s*</b>', re.S)

        item = GouLianZiYuanWangItem()
        contact_infos = ",".join(
            response.xpath(
                "//div[@class='contact']//div[@class='item']//span").getall())
        item["company_Name"] = response.xpath(
            "//div[@class='copany_name']/a/@title").extract_first()
        item["kind"] = "".join(re.findall(pattern,
                                          response.text)) if re.findall(
                                              pattern, response.text) else ''
        item["company_address"] = "".join(
            re.findall(pattern2, response.text)) if re.findall(
                pattern2, response.text) else ''
        item["linkman"] = "".join(
            re.findall(pattern5, response.text)) if re.findall(
                pattern5, response.text) else ''
        item["telephone"] = "".join(
            re.findall(pattern3, response.text)) if re.findall(
                pattern3, response.text) else ''
        item["phone"] = "".join(re.findall(pattern4,
                                           response.text)) if re.findall(
                                               pattern4, response.text) else ''
        item["contact_Fax"] = item["telephone"] if item["telephone"] else ''
        item["contact_QQ"] = ''
        item["E_Mail"] = ''
        item["Source"] = response.url
        item["province"] = ''
        item["city_name"] = ''

        if item["company_Name"]:
            item["company_Name"] = re.sub(r'\n|\s|\r|\t|公司名称：', '',
                                          item["company_Name"]).replace(
                                              ' ', '').strip()
        item["company_id"] = self.get_md5(item["company_Name"])

        if item["kind"]:
            item["kind"] = item["kind"].replace(' ', '|')
            item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营', '', item["kind"]).replace('-', '|').replace('、', '|') \
                .replace(',', '|').replace('，', '|').replace('.', '').strip()
        else:
            try:
                item["kind"] = "".join(re.findall(
                    pattern1, response.text)) if re.findall(
                        pattern1, response.text) else ''
                item["kind"] = item["kind"].replace(' ', '|')
                item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营', '', item["kind"]).replace('-', '|').replace('、', '|') \
                    .replace(',', '|').replace('，', '|').replace('.', '').strip()
            except:
                item["kind"] = ''

        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

        if item["linkman"]:
            item["linkman"] = item["linkman"]
        else:
            item["linkman"] = ''
        item["linkman"] = self.cw.search_linkman(item["linkman"])

        if item["phone"]:
            item["phone"] = self.cw.search_phone_num(item["phone"])
        else:
            try:
                item["phone"] = self.cw.search_phone_num(contact_infos)
            except:
                item["phone"] = ''

        if item["telephone"]:
            item["telephone"] = self.cw.search_telephone_num(item["telephone"])
        else:
            try:
                item["telephone"] = self.cw.search_telephone_num(contact_infos)
            except:
                item["telephone"] = ''

        if item["contact_Fax"]:
            item["contact_Fax"] = self.cw.search_contact_Fax(
                item["contact_Fax"])
        else:
            item["contact_Fax"] = ''

        if item["E_Mail"]:
            item["E_Mail"] = self.cw.search_email(item["E_Mail"])
        else:
            item["E_Mail"] = ''

        if item["contact_QQ"]:
            item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
        else:
            item["contact_QQ"] = ''

        if item["company_address"]:
            item["company_address"] = self.cw.search_address(
                item["company_address"])
        else:
            item["company_address"] = ''

        yield item

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

Exemplo n.º 9

0

Exibir arquivo

class BaFangZiYuanWangSpider(CrawlSpider):
    name = "b2b168"
    allowed_domains = ['b2b168.com','www.b2b168.com']
    start_urls = ['https://www.b2b168.com/page-company.html']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"

    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
        'ITEM_PIPELINES': {'BigB2BSpider.pipelines.MysqlTwistedPiplines': 302},
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            # "Cookie": "Hm_lvt_dd0c9f5bb6bab19ccc2b13c4ec58552a=1566177749; CCKF_visitor_id_92126=1219631166; yunsuo_session_verify=db1a03528b7dfe197918cf533946c447; bdshare_firstime=1566178685689; Hm_lpvt_dd0c9f5bb6bab19ccc2b13c4ec58552a=1566178686",
            # "Host": "jamesni139.tybaba.com",
            # "Referer": "http://jamesni139.tybaba.com/",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 543,
        },
        # 不验证SSL证书
        "DOWNLOAD_HANDLERS_BASE": {
            'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
            'http': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
            'https': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
            's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
        },
        # "DOWNLOAD_HANDLERS": {
        #     'http': 'BigB2BSpider.custom.downloader.handler.https.HttpsDownloaderIgnoreCNError'},
    }

    rules = (
        Rule(LinkExtractor(
            allow=r".*",restrict_xpaths=("//div[@class='map']//ul[contains(@class,'c-hangye')]//li//a")), follow=True),

        Rule(LinkExtractor(
            allow=r".*",restrict_xpaths=("//div[@class='mach_list clearfix']//dd//a")), follow=True),

        Rule(LinkExtractor(
            allow=r".*", restrict_xpaths=("//div[@class='list-right']//ul[@class='list']//li//div[1]//a")), follow=True),

        Rule(LinkExtractor(
            allow=r".*",restrict_xpaths=("//div[@class='pages']//a[contains(text(),'下页')]")), follow=True),

        Rule(LinkExtractor(
            allow=r".*",restrict_xpaths=("//a[contains(text(),'联系方式')]")),callback='parse_items', follow=False),
    )

    def parse_items(self, response):
        pattern = re.compile(r'<meta name="description" content="(.*?)"/>',re.S)
        pattern1 = re.compile(r'<span>主营：</span>(.*?)</p>',re.S)
        pattern2 = re.compile(r'<div class="com-name">(.*?)</div>',re.S)
        pattern3 = re.compile(r'联 系 人： <a class=b2>(.*?)</a>',re.S)
        pattern4 = re.compile(r'电　　话： (.*?)<br />',re.S)
        pattern5 = re.compile(r'传　　真： (.*?)<br />', re.S)
        pattern6 = re.compile(r'移动电话： (.*?)<br />', re.S)
        pattern7 = re.compile(r'地　　址： (.*?)<br />', re.S)
        pattern8 = re.compile(r'主要经营(.*?)<br />', re.S)

        pattern9 = re.compile(r'<ul class="company">(.*?)</ul>', re.S)
        pattern10 = re.compile(r'主要经营(.*?)<br />', re.S)
        pattern11 = re.compile(r'ShowMap\("divMap","(.*?)","(.*?)", "(.*?)"\);', re.S)
        pattern12 = re.compile(r'>地址：(.*?) <a', re.S)
        pattern13 = re.compile(r'<dt>固定电话：</dt><dd>(.*?)</dd>', re.S)
        pattern14 = re.compile(r'<dt>联系人：</dt><dd>(.*?)</dd>', re.S)
        pattern15 = re.compile(r'<dt>移动电话：</dt><dd>(.*?)</dd>', re.S)
        pattern16 = re.compile(r'<dt>传真号码：</dt><dd>(.*?)</dd>', re.S)

        if response.status == 200:
            try:
                item = BaFangZiYuanWangspiderItem()
                item["company_Name"] = re.findall(pattern11,response.text)[0][0] if re.findall(pattern11,response.text) else ''
                item["company_address"] = re.findall(pattern11,response.text)[0][1] if re.findall(pattern11,response.text) else ''
                item["linkman"] = "".join(re.findall(pattern3,response.text)) if re.findall(pattern3,response.text) else ''
                item["telephone"] = "".join(re.findall(pattern4,response.text)) if re.findall(pattern4,response.text) else ''
                item["phone"] = "".join(re.findall(pattern6,response.text)) if re.findall(pattern6,response.text) else ''
                item["contact_Fax"] = "".join(re.findall(pattern5,response.text)) if re.findall(pattern5,response.text) else ''
                item["contact_QQ"] = ''
                item["E_Mail"] = ''
                item["kind"] = ",".join(re.findall(pattern1,response.text) if re.findall(pattern1,response.text) else '')
                item["Source"] = response.url
                item["province"] = re.findall(pattern11,response.text)[0][2].split(' ')[0] if re.findall(pattern11,response.text) else ''
                item["city_name"] = re.findall(pattern11,response.text)[0][2].split(' ')[1] if re.findall(pattern11,response.text) else ''

                if item["company_Name"]:
                    item["company_Name"] = re.sub(r'\n|\s|\r|\t|公司名称：', '', item["company_Name"]).replace(' ', '').strip()
                item["company_id"] = get_md5(item["company_Name"])

                if item["kind"]:
                    item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营', '', item["kind"]).replace('-', '|').replace('、', '|') \
                        .replace(',', '|').replace('，', '|').replace('.', '').strip()
                else:
                    try:
                        item["kind"] = "".join(re.findall(pattern10,response.text)) if re.findall(pattern10,response.text) else ''
                    except:
                        item["kind"] = ''

                item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

                if item["linkman"]:
                    item["linkman"] = item["linkman"]
                else:
                    try:
                        item["linkman"] = "".join(re.findall(pattern14,response.text)) if re.findall(pattern14,response.text) else ''
                    except:
                        item["linkman"] = ''
                item["linkman"] = self.cw.search_linkman(item["linkman"])

                if item["phone"]:
                    item["phone"] = self.cw.search_phone_num(item["phone"])
                else:
                    try:
                        item["phone"] = "".join(re.findall(pattern15,response.text)) if re.findall(pattern15,response.text) else ''
                    except:
                        item["phone"] = ''

                item["phone"] = self.cw.search_phone_num(item["phone"])

                if item["telephone"]:
                    item["telephone"] = self.cw.search_telephone_num(item["telephone"])
                else:
                    try:
                        item["telephone"] = "".join(re.findall(pattern13, response.text)) if re.findall(pattern13, response.text) else ''
                    except:
                        item["telephone"] = ''

                item["telephone"] = self.cw.search_telephone_num(item["telephone"])

                if item["contact_Fax"]:
                    item["contact_Fax"] = self.cw.search_contact_Fax(item["contact_Fax"])
                else:
                    try:
                        item["contact_Fax"] = "".join(re.findall(pattern16, response.text)) if re.findall(pattern16, response.text) else ''
                    except:
                        item["contact_Fax"] = ''

                item["contact_Fax"] = self.cw.search_contact_Fax(item["contact_Fax"])

                if item["E_Mail"]:
                    item["E_Mail"] = self.cw.search_email(item["E_Mail"])
                else:
                    item["E_Mail"] = ''

                if item["E_Mail"]:
                    try:
                        item["contact_QQ"] = self.cw.search_QQ(item["E_Mail"])
                    except:
                        item["contact_QQ"] = ''
                else:
                    item["contact_QQ"] = ''

                if item["company_address"]:
                    item["company_address"] = self.cw.search_address(item["company_address"])
                else:
                    item["company_address"] = ''

                yield item
            except:
                return

Exemplo n.º 10

0

Exibir arquivo

class BaiChuangHuangYeWangSpider(CrawlSpider):
    name = "ayijx"
    allowed_domains = ['www.ayijx.com']
    start_urls = ['http://www.ayijx.com/area/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"

    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
    }

    rules = (
        Rule(LinkExtractor(allow=r".*",
                           restrict_xpaths=("//div[@class='listsum']//dl//a")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=
            ("//div[@class='dqmqsumxdtb margintop']//li//div[@class='dqmqlefts']//a"
             )),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='navbox']//a[contains(text(),'联系我们')]")),
             callback='parse_items',
             follow=True),
        Rule(LinkExtractor(allow=r".*",
                           restrict_xpaths=("//div[@class='fanye']//p//a")),
             follow=True),
    )

    def parse_items(self, response):
        item = BaiChuangHuangYeWangItem()
        pattern = re.compile(r'<title>联系我们_(.*?)</title>', re.S)
        pattern1 = re.compile(r'<p>电话： (.*?)</p>', re.S)
        pattern2 = re.compile(r'<p>手机： (.*?)</p>', re.S)
        pattern3 = re.compile(r'<p>Q Q： (.*?)</p>', re.S)
        pattern4 = re.compile(r'<p>联系人：(.*?)</p>', re.S)
        pattern5 = re.compile(r'<div class="LOGO_lfont">\s*<p>(.*?)</p>', re.S)

        pattern6 = re.compile(r'>联 系 人：(.*?)<', re.S)
        pattern7 = re.compile(r'> 电    话：(.*?)<', re.S)
        pattern8 = re.compile(r'>手    机： (.*?)<', re.S)
        pattern9 = re.compile(r'> Q    Q：(.*?)<', re.S)
        pattern10 = re.compile(r'>传    真：(.*?)<', re.S)
        pattern11 = re.compile(r'>邮    箱：(.*?)<', re.S)
        pattern12 = re.compile(r'>地    址：(.*?)<', re.S)
        pattern13 = re.compile(r'>企业官网：(.*?)<', re.S)
        pattern14 = re.compile(r'>\s*主营产品：(.*?)<', re.S)

        if response.text is not None:
            try:
                item["company_Name"] = "".join(
                    re.findall(pattern, response.text)) if re.findall(
                        pattern, response.text) else ''
                item["kind"] = "".join(re.findall(
                    pattern14, response.text)) if re.findall(
                        pattern14, response.text) else ''
                item["company_address"] = "".join(
                    re.findall(pattern12, response.text)) if re.findall(
                        pattern12, response.text) else ''
                item["linkman"] = "".join(re.findall(
                    pattern6, response.text)) if re.findall(
                        pattern6, response.text) else ''
                item["telephone"] = "".join(re.findall(
                    pattern7, response.text)) if re.findall(
                        pattern7, response.text) else ''
                item["phone"] = "".join(re.findall(
                    pattern8, response.text)) if re.findall(
                        pattern8, response.text) else ''
                item["contact_Fax"] = "".join(
                    re.findall(pattern10, response.text)) if re.findall(
                        pattern10, response.text) else ''
                item["contact_QQ"] = "".join(
                    re.findall(pattern9, response.text)) if re.findall(
                        pattern9, response.text) else ''
                item["E_Mail"] = "".join(re.findall(
                    pattern11, response.text)) if re.findall(
                        pattern11, response.text) else ''
                item["Source"] = response.url
                item["province"] = ""
                item["city_name"] = ""

                if item["company_Name"]:
                    if "（" in item["company_Name"]:
                        item["company_Name"] = item["company_Name"].split(
                            '（')[0]
                    elif "(" in item["company_Name"]:
                        item["company_Name"] = item["company_Name"].split(
                            '(')[0]
                    elif "_" in item["company_Name"]:
                        item["company_Name"] = item["company_Name"].split(
                            '_')[0]
                    elif "-" in item["company_Name"]:
                        item["company_Name"] = item["company_Name"].split(
                            '-')[0]
                    else:
                        item["company_Name"] = re.sub(
                            r'\n|\s|\r|\t|公司名称：', '',
                            item["company_Name"]).replace(' ', '').strip()
                item["company_id"] = self.get_md5(item["company_Name"])

                if item["kind"]:
                    item["kind"] = item["kind"].replace(' ', '|')
                    item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营|主营项目：', '', item["kind"]).replace('-', '|').replace('、', '|')\
                        .replace(',', '|').replace('，', '|').replace(';','|').replace('.', '').strip()
                else:
                    try:
                        item["kind"] = "|".join(
                            response.xpath(
                                "//div[@class='hotico']//ul//li//a//text()").
                            getall())
                        item["kind"] = item["kind"].replace(' ', '|')
                        item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营|主营项目：', '', item["kind"]).replace('-','|')\
                            .replace('、','|').replace(',', '|').replace('，', '|')\
                            .replace(';', '|').replace('.', '').strip()
                    except:
                        item["kind"] = ''

                item["kind"] = self.cw.rinse_keywords(
                    self.cw.replace_ss(item["kind"]))

                if item["linkman"]:
                    item["linkman"] = item["linkman"]
                else:
                    try:
                        item["linkman"] = "".join(
                            re.findall(pattern4, response.text)) if re.findall(
                                pattern4, response.text) else ''
                    except:
                        item["linkman"] = ''

                if item["linkman"]:
                    if '<' in item["linkman"]:
                        item["linkman"] = item["linkman"].split('<')[0]
                item["linkman"] = self.cw.search_linkman(item["linkman"])

                if item["telephone"]:
                    item["telephone"] = item["telephone"]
                else:
                    try:
                        item["telephone"] = "".join(
                            re.findall(pattern1, response.text)) if re.findall(
                                pattern1, response.text) else ''
                    except:
                        item["telephone"] = ''
                item["telephone"] = self.cw.search_telephone_num(
                    item["telephone"])

                if item["phone"]:
                    item["phone"] = item["phone"]
                else:
                    try:
                        item["phone"] = "".join(
                            re.findall(pattern1, response.text)) if re.findall(
                                pattern1, response.text) else ''
                    except:
                        item["phone"] = ''
                item["phone"] = self.cw.search_phone_num(item["phone"])

                if item["contact_Fax"]:
                    item["contact_Fax"] = self.cw.search_contact_Fax(
                        item["contact_Fax"])
                else:
                    item["contact_Fax"] = ''

                if item["E_Mail"]:
                    item["E_Mail"] = self.cw.search_email(item["E_Mail"])
                else:
                    item["E_Mail"] = ''

                if item["contact_QQ"]:
                    item["contact_QQ"] = item["contact_QQ"].replace("Q Q：", '')
                else:
                    try:
                        item["contact_QQ"] = "".join(
                            re.findall(pattern3, response.text)) if re.findall(
                                pattern3, response.text) else ''
                    except:
                        item["contact_QQ"] = ''
                item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])

                if item["company_address"]:
                    item["company_address"] = self.cw.search_address(
                        item["company_address"])
                else:
                    item["company_address"] = ''

                yield item

            except:
                return

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

Exemplo n.º 11

0

Exibir arquivo

Arquivo: ccen_v1.py Projeto: cogitozz/Big_Scrapy_Spider

class ZhongGuoHuaGongSheBeiWangSpider(CrawlSpider):
    name = "ccen_v1"
    allowed_domains = ['www.ccen.net', 'ccen.net']
    start_urls = ['http://www.ccen.net/company/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"

    custom_settings = {
        'DOWNLOAD_DELAY': 0.4,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
    }

    def parse(self, response):
        a_list = response.xpath(
            "//table[@class='martop']//table[@class='ccen_blueborder']//table[@style='margin-bottom:5px']//tr//a"
        )
        for a in a_list:
            kind_name = a.xpath("./text()").get()
            kind_href = a.xpath("./@href").get()
            if kind_href:
                kind_href = "http://www.ccen.net/company/" + kind_href
                # print(kind_name,kind_href)
                yield scrapy.Request(url=kind_href,
                                     callback=self.parse_company_list,
                                     dont_filter=True)

    def parse_company_list(self, response):
        t_list = response.xpath(
            "//table[@style='margin-top:10px; border-bottom:1px solid #CCCCCC; padding:5px;']//td[@valign='top']"
        )
        for t in t_list:
            item = ZhongGuoHuaGongSheBeiWangItem()
            # ',,,联系人：,树伟,电话：,025-57467888,手机：,15806100000,更多联系方式>>,,,'
            pattern = re.compile(r'>联系人：</span>(.*?)				　<', re.S)
            pattern1 = re.compile(r'>电话：</span>(.*?)                　<', re.S)
            pattern2 = re.compile(r'>手机：</span>(.*?)                　<', re.S)
            pattern3 = re.compile(r'>主营：</span>(.*?)</td>', re.S)
            pattern4 = re.compile(
                r'<a href="(.*?)" target="_blank">更多联系方式\&gt;&gt;</a>', re.S)
            item["company_Name"] = t.xpath(
                ".//table[1]//a[@class='blue f14']/@title").get()
            city_infos = t.xpath(
                ".//table[1]//a[@class='blue f14']/../text()").get()
            linkinfos = "".join(t.xpath(".//table[3]").getall())
            item["kind"] = "".join(re.findall(
                pattern3, response.text)) if re.findall(
                    pattern3, response.text) else ''
            item["linkman"] = "".join(re.findall(
                pattern, linkinfos)) if re.findall(pattern, linkinfos) else ''
            item["telephone"] = "".join(
                re.findall(pattern1, linkinfos)) if re.findall(
                    pattern1, linkinfos) else ''
            item["phone"] = "".join(re.findall(pattern2,
                                               linkinfos)) if re.findall(
                                                   pattern2, linkinfos) else ''
            if city_infos:
                #  [湖北省-武汉]
                pattern_p = re.compile(r'\[(.*?)-.*?\]', re.S)
                pattern_c = re.compile(r'\[.*?-(.*?)\]', re.S)
                if "[" and "-" and "]" in city_infos:
                    try:
                        item["province"] = "".join(
                            re.findall(pattern_p, city_infos)) if re.findall(
                                pattern_p, city_infos) else ''
                        item["city_name"] = "".join(
                            re.findall(pattern_c, city_infos)) if re.findall(
                                pattern_c, city_infos) else ''
                    except:
                        item["province"] = ''
                        item["city_name"] = ''
                else:
                    item["province"] = ''
                    item["city_name"] = ''
            else:
                item["province"] = ''
                item["city_name"] = ''

            company_href = "".join(re.findall(pattern4,
                                              linkinfos)[0]) if re.findall(
                                                  pattern4, linkinfos) else ''
            if company_href:
                yield scrapy.Request(url=company_href,
                                     callback=self.parse_company_detail,
                                     meta={"item": item},
                                     dont_filter=True)

        next_page_url = response.xpath(
            "//table[@class='membertable_page'][1]//a[contains(text(),'下一页')]/@href"
        ).get()
        if next_page_url:
            next_page_url = "http://www.ccen.net" + next_page_url
            yield scrapy.Request(url=next_page_url,
                                 callback=self.parse_company_list)

    def parse_company_detail(self, response):
        item = response.meta["item"]
        contact_href = response.xpath(
            "//font[contains(text(),'联系我们')]/../../@href").get()
        if contact_href:
            contact_href = response.url + contact_href
            yield scrapy.Request(url=contact_href,
                                 callback=self.parse_company_contact,
                                 dont_filter=True)
        else:
            pattern_add = re.compile(r'>\s*详细地址：(.*?)<', re.S)
            pattern_em = re.compile(r'>\s*电子邮件：(.*?)<', re.S)
            pattern_fa = re.compile(r'>\s*传真：(.*?)<', re.S)
            pattern_k = re.compile(r'<p>主营产品： (.*?)<br />', re.S)
            item["company_Name"] = item["company_Name"]
            item["company_address"] = "".join(
                re.findall(pattern_add, response.text)) if re.findall(
                    pattern_add, response.text) else ''
            item["linkman"] = item["linkman"]
            item["telephone"] = item["telephone"]
            item["phone"] = item["phone"]
            item["contact_Fax"] = "".join(re.findall(
                pattern_fa, response.text)) if re.findall(
                    pattern_fa, response.text) else ''
            item["contact_QQ"] = response.xpath(
                "//a[contains(@title,'点击QQ图标在线联系')]/@href").extract_first()
            item["E_Mail"] = "".join(re.findall(
                pattern_em, response.text)) if re.findall(
                    pattern_em, response.text) else ''
            item["Source"] = response.url
            item["kind"] = ",".join(re.findall(
                pattern_k, response.text)) if re.findall(
                    pattern_k, response.text) else ''
            # city_infos = response.xpath("//dt[contains(text(),'所在地区：')]/following-sibling::dd/text()").get()

            if item["company_Name"] and item["company_Name"] != '':
                if "（" in item["company_Name"]:
                    item["company_Name"] = item["company_Name"].split('（')[0]
                elif "(" in item["company_Name"]:
                    item["company_Name"] = item["company_Name"].split('(')[0]
                elif "_" in item["company_Name"]:
                    item["company_Name"] = item["company_Name"].split('_')[0]
                elif "-" in item["company_Name"]:
                    item["company_Name"] = item["company_Name"].split('-')[0]
                else:
                    item["company_Name"] = re.sub(
                        r'\n|\s|\r|\t|公司名称：', '',
                        item["company_Name"]).replace(' ', '').strip()
            else:
                return
            item["company_id"] = self.get_md5(item["company_Name"])

            if item["kind"]:
                item["kind"] = item["kind"].replace(" ", '|')
                item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营|主营项目：', '', item["kind"]).replace('-', '|')\
                    .replace('、', '|').replace(',', '|').replace('，', '|').replace(';','|').replace('.', '').strip()
            else:
                item["kind"] = ''

            item["kind"] = self.cw.rinse_keywords(
                self.cw.replace_ss(item["kind"]))

            if item["linkman"]:
                item["linkman"] = item["linkman"].replace('未填写', '')
            else:
                item["linkman"] = ''
            item["linkman"] = self.cw.search_linkman(item["linkman"])

            if item["phone"]:
                item["phone"] = self.cw.search_phone_num(item["phone"])
            else:
                item["phone"] = ''

            if item["telephone"]:
                item["telephone"] = self.cw.search_telephone_num(
                    item["telephone"])
            else:
                item["telephone"] = ''

            if item["contact_Fax"]:
                item["contact_Fax"] = self.cw.search_contact_Fax(
                    item["contact_Fax"])
            else:
                item["contact_Fax"] = ''

            if item["E_Mail"]:
                item["E_Mail"] = self.cw.search_email(item["E_Mail"])
            else:
                item["E_Mail"] = ''

            if item["contact_QQ"]:
                item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
            else:
                try:
                    item["contact_QQ"] = self.cw.search_QQ(item["E_Mail"])
                except:
                    item["contact_QQ"] = ''

            if item["company_address"]:
                item["company_address"] = self.cw.search_address(
                    item["company_address"])
            else:
                item["company_address"] = ''

            # if city_infos:
            #     if '/' in city_infos:
            #         try:
            #             item["province"] = city_infos.split('/')[0]
            #             item["city_name"] = city_infos.split('/')[1]
            #         except:
            #             item["province"] = ''
            #             item["city_name"] = ''
            #     else:
            #         item["province"] = ''
            #         item["city_name"] = ''
            # else:
            #     item["province"] = ''
            #     item["city_name"] = ''

            yield item

    def parse_company_contact(self, response):
        item = response.meta["item"]
        # pattern = re.compile(r'<title>(.*?) - .*?</title>',re.S)
        # pattern1 = re.compile(r'<p>主营产品： (.*?)<br />',re.S)
        # pattern2 = re.compile(r'>\s*通信地址：(.*?)\&nbsp;',re.S)
        # pattern3 = re.compile(r';\s*电话：(.*?)\&nbsp;',re.S)
        # pattern4 = re.compile(r';\s*传真：(.*?)\s*<', re.S)
        # pattern5 = re.compile(r'>\s*E-mail：(.*?)\&nbsp;', re.S)
        # pattern6 = re.compile(r'>\s*联系人：(.*?)<br />', re.S)
        pattern_k = re.compile(r'>\s*主营产品： (.*?)<', re.S)
        item["company_Name"] = response.xpath(
            "//td[contains(text(),'公司名称：')]/following-sibling::td[2]/text()"
        ).get()
        item["company_address"] = response.xpath(
            "//td[contains(text(),'详细地址：')]/following-sibling::td[2]/text()"
        ).get()
        item["linkman"] = response.xpath(
            "//td[contains(text(),'联 系 人：')]/following-sibling::td[2]/text()"
        ).get()
        item["telephone"] = response.xpath(
            "//td[contains(text(),'公司电话：')]/following-sibling::td[2]/text()"
        ).get()
        item["phone"] = response.xpath(
            "//td[contains(text(),'手　　机：')]/following-sibling::td[2]/text()"
        ).get()
        item["contact_Fax"] = response.xpath(
            "//td[contains(text(),'传　　真：')]/following-sibling::td[2]/text()"
        ).get()
        item["contact_QQ"] = response.xpath(
            "//a[contains(@title,'点击QQ图标在线联系')]/@href").extract_first()
        item["E_Mail"] = response.xpath(
            "//td[contains(text(),'电子邮件：')]/following-sibling::td[2]/text()"
        ).extract_first()
        item["Source"] = response.url
        item["kind"] = ",".join(
            re.findall(pattern_k, response.text)) if re.findall(
                pattern_k, response.text) else ''
        # city_infos = response.xpath("//dt[contains(text(),'所在地区：')]/following-sibling::dd/text()").get()

        if item["company_Name"] and item["company_Name"] != '':
            if "（" in item["company_Name"]:
                item["company_Name"] = item["company_Name"].split('（')[0]
            elif "(" in item["company_Name"]:
                item["company_Name"] = item["company_Name"].split('(')[0]
            elif "_" in item["company_Name"]:
                item["company_Name"] = item["company_Name"].split('_')[0]
            elif "-" in item["company_Name"]:
                item["company_Name"] = item["company_Name"].split('-')[0]
            else:
                item["company_Name"] = re.sub(r'\n|\s|\r|\t|公司名称：', '',
                                              item["company_Name"]).replace(
                                                  ' ', '').strip()
        else:
            return
        item["company_id"] = self.get_md5(item["company_Name"])

        if item["kind"]:
            item["kind"] = item["kind"].replace(" ", '|')
            item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营|主营项目：', '', item["kind"]).replace('-', '|')\
                .replace('、', '|').replace(',', '|').replace('，', '|').replace(';','|').replace('.', '').strip()
        else:
            item["kind"] = ''

        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

        if item["linkman"]:
            item["linkman"] = item["linkman"].replace('未填写', '')
        else:
            item["linkman"] = ''
        item["linkman"] = self.cw.search_linkman(item["linkman"])

        if item["phone"]:
            item["phone"] = self.cw.search_phone_num(item["phone"])
        else:
            item["phone"] = ''

        if item["telephone"]:
            item["telephone"] = self.cw.search_telephone_num(item["telephone"])
        else:
            item["telephone"] = ''

        if item["contact_Fax"]:
            item["contact_Fax"] = self.cw.search_contact_Fax(
                item["contact_Fax"])
        else:
            item["contact_Fax"] = ''

        if item["E_Mail"]:
            item["E_Mail"] = self.cw.search_email(item["E_Mail"])
        else:
            item["E_Mail"] = ''

        if item["contact_QQ"]:
            item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
        else:
            try:
                item["contact_QQ"] = self.cw.search_QQ(item["E_Mail"])
            except:
                item["contact_QQ"] = ''

        if item["company_address"]:
            item["company_address"] = self.cw.search_address(
                item["company_address"])
        else:
            item["company_address"] = ''

        # if city_infos:
        #     if '/' in city_infos:
        #         try:
        #             item["province"] = city_infos.split('/')[0]
        #             item["city_name"] = city_infos.split('/')[1]
        #         except:
        #             item["province"] = ''
        #             item["city_name"] = ''
        #     else:
        #         item["province"] = ''
        #         item["city_name"] = ''
        # else:
        item["province"] = item["province"]
        item["city_name"] = item["city_name"]

        yield item

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

Exemplo n.º 12

0

Exibir arquivo

class CebnDianZiShangWuWangSpider(CrawlSpider):
    name = "cebn"
    allowed_domains = ['www.cebn.cn', 'cebn.cn']
    start_urls = ['http://www.cebn.cn/company/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"

    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control":
            "max-age=0",
            "Connection":
            "keep-alive",
            # "Host": "www.kusoba.com",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 543,
        }
    }
    # /c3847/p2/
    rules = (
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@class='fl J-mainNav']//ul//li//a")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='list-content']//div[@class='proname']//a")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='pages']//a[contains(text(),'下一页»')]")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//span[contains(text(),'联系方式')]/..")),
             callback='parse_items',
             follow=False),
    )

    def parse_items(self, response):
        headers = {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control":
            "max-age=0",
            "Connection":
            "keep-alive",
            "Cookie":
            "UM_distinctid=16cb2979f862a5-0dfda36ee0202d-5a13331d-1fa400-16cb2979f8725e; Hm_lvt_aa1b57052b9004f48376724837cc9b69=1566364377; yunsuo_session_verify=ded7c3ded7b4429e61379b82e2e37d8e; Hm_lpvt_aa1b57052b9004f48376724837cc9b69=1566365005",
            # "Host": "fshjbxg.cn.cebn.cn",
            "Referer":
            response.url,
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        }
        item = CebnDianZiShangWuWangItem()
        if response.text:
            try:
                item["company_Name"] = response.xpath(
                    "//td[contains(text(),'公司名称：')]/following-sibling::td/text()"
                ).extract_first()
                item["kind"] = response.xpath(
                    "//div[@class='head']//h4/text()").get()
                item["company_address"] = "".join(
                    response.xpath(
                        "//td[contains(text(),'公司地址：')]/following-sibling::td/text()"
                    ).extract())
                item["linkman"] = response.xpath(
                    "//td[contains(text(),'联 系 人：')]/following-sibling::td/text()"
                ).extract_first()
                item["telephone"] = response.xpath(
                    "//td[contains(text(),'公司电话：')]/following-sibling::td/img/@src"
                ).extract_first()
                item["phone"] = response.xpath(
                    "//td[contains(text(),'手机号码：')]/following-sibling::td/img/@src"
                ).extract_first()
                item["contact_Fax"] = response.xpath(
                    "//td[contains(text(),'公司传真：')]/following-sibling::td/img/@src"
                ).extract_first()
                item["contact_QQ"] = response.xpath(
                    "//img[@title='点击QQ交谈/留言']/../@href").extract_first()
                item["E_Mail"] = response.xpath(
                    "//td[contains(text(),'电子邮件：')]/following-sibling::td/img/@src"
                ).extract_first()
                item["Source"] = response.url
                item["province"] = ""
                item["city_name"] = ""

                if item["company_Name"]:
                    item["company_Name"] = re.sub(
                        r'\n|\s|\r|\t|公司名称：|全称：', '',
                        item["company_Name"]).replace(' ', '').strip()
                item["company_id"] = self.get_md5(item["company_Name"])

                if item["kind"]:
                    item["kind"] = item["kind"].replace(" ", "|")
                    item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营|主营产品：', '', item["kind"]).replace('-', '|').replace('、', '|') \
                        .replace('，', '|').replace('，', '|').replace('.', '').strip()
                else:
                    item["kind"] = ''

                item["kind"] = self.cw.rinse_keywords(
                    self.cw.replace_ss(item["kind"]))

                if item["linkman"]:
                    if "（" in item["linkman"]:
                        item["linkman"] = item["linkman"].split(
                            "（")[0].replace('法定代表人：', '').replace('暂未公布', '')
                    else:
                        item["linkman"] = item["linkman"].replace(
                            '法定代表人：', '').replace('暂未公布', '')
                else:
                    item["linkman"] = ''
                item["linkman"] = self.cw.search_linkman(item["linkman"])

                if item["phone"]:
                    item["phone"] = self.requests_href(item["phone"], headers)
                    item["phone"] = self.cw.search_phone_num(item["phone"])
                else:
                    item["phone"] = ''

                if item["telephone"]:
                    item["telephone"] = self.requests_href(
                        item["telephone"], headers)
                    item["telephone"] = self.cw.search_telephone_num(
                        item["telephone"])
                else:
                    item["telephone"] = ''

                if item["contact_Fax"]:
                    item["contact_Fax"] = self.requests_href(
                        item["contact_Fax"], headers)
                    item["contact_Fax"] = self.cw.search_contact_Fax(
                        item["contact_Fax"])
                else:
                    item["contact_Fax"] = ''

                if item["E_Mail"]:
                    item["E_Mail"] = self.requests_href(
                        item["E_Mail"], headers)
                    item["E_Mail"] = self.cw.search_email(item["E_Mail"])
                else:
                    item["E_Mail"] = ''

                if item["contact_QQ"]:
                    # item["contact_QQ"] = self.requests_href(item["contact_QQ"], headers)
                    item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
                else:
                    item["contact_QQ"] = ''

                if item["company_address"]:
                    # if "\"" in item["company_address"]:
                    item["company_address"] = item["company_address"]
                    item["company_address"] = self.cw.search_address(
                        item["company_address"])
                else:
                    item["company_address"] = ''

                yield item
            except:
                return

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

    def requests_href(self, url, headers):
        res = requests.get(url=url, headers=headers, timeout=10, verify=False)
        res.encoding = "utf-8"
        if res.status_code == requests.codes.ok:
            img = res.content
            something_img_file_path = r"F:\PythonProjects\venv\pythonProjects\BigB2BSpider\BigB2BSpider\img_src\something_img2\image.png"
            with open(something_img_file_path, "wb") as fp:
                fp.write(img)
            fp.close()
            if img:
                try:
                    something = recognition_image(something_img_file_path)
                    if something:
                        return something
                    else:
                        return ''
                except:
                    return ''
            else:
                return ''
        else:
            return ''

Exemplo n.º 13

0

Exibir arquivo

class YiLingLingYiSanWuShangWuWangSpider(CrawlSpider):
    name = "yllsw"
    allowed_domains = ['100135.com', 'www.100135.com']
    start_urls = ['http://www.100135.com/company.html']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"

    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            # "Connection": "keep-alive",
            # "Cookie": "Hm_lvt_dd0c9f5bb6bab19ccc2b13c4ec58552a=1566177749; CCKF_visitor_id_92126=1219631166; yunsuo_session_verify=db1a03528b7dfe197918cf533946c447; bdshare_firstime=1566178685689; Hm_lpvt_dd0c9f5bb6bab19ccc2b13c4ec58552a=1566178686",
            # "Host": "jamesni139.tybaba.com",
            # "Referer": "http://jamesni139.tybaba.com/",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
        # 不验证SSL证书
        "DOWNLOAD_HANDLERS_BASE": {
            'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
            'http': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
            'https':
            'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
            's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
        },
        # "DOWNLOAD_HANDLERS": {
        #     'https': 'BigB2BSpider.custom.downloader.handler.https.HttpsDownloaderIgnoreCNError'},
    }
    # /c3847/p2/
    rules = (
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='left sort']//div[@class='SortTitle']//a")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@id='main_left']//dd//div[@class='info_title']//a")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@id='pager']//a[contains(text(),'>')]")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@id='leftmenu']//a[contains(text(),'联系方式')]")),
             callback='parse_items',
             follow=False),
    )

    def parse_items(self, response):
        item = YiLingLingYiSanWuShangWuWangItem()
        item["company_Name"] = response.xpath(
            "//td[contains(text(),'公司名称：')]/following-sibling::td/text()"
        ).extract_first()
        item["kind"] = "".join(
            response.xpath("//p[contains(text(),'主营产品：')]//text()").getall())
        item["company_address"] = "".join(
            response.xpath(
                "//td[contains(text(),'地址：')]/following-sibling::td/text()").
            extract())
        item["linkman"] = response.xpath(
            "//td[contains(text(),'联系人：')]/following-sibling::td/text()"
        ).extract_first()
        item["telephone"] = response.xpath(
            "//td[contains(text(),'固定电话：')]/following-sibling::td/text()"
        ).extract_first()
        item["phone"] = response.xpath(
            "//td[contains(text(),'手机：')]/following-sibling::td/text()"
        ).extract_first()
        item["contact_Fax"] = response.xpath(
            "//td[contains(text(),'传真：')]/following-sibling::td/text()"
        ).extract_first()
        item["contact_QQ"] = response.xpath(
            "//img[@title='点击QQ交谈/留言']/../@href").extract_first()
        item["E_Mail"] = response.xpath(
            "//td[contains(text(),'E-mail：')]/following-sibling::td/text()"
        ).extract_first()
        item["Source"] = response.url
        item["province"] = ""
        item["city_name"] = ""

        if item["company_Name"]:
            item["company_Name"] = re.sub(r'\n|\s|\r|\t|公司名称：', '',
                                          item["company_Name"]).replace(
                                              ' ', '').strip()
        item["company_id"] = self.get_md5(item["company_Name"])

        if item["kind"]:
            try:
                item["kind"] = item["kind"].split('主营产品：')[-1]
                if item["kind"]:
                    item["kind"] = item["kind"].replace(' ', '|')
                    item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营', '', item["kind"]).replace('-', '|').replace('、', '|') \
                        .replace(',', '|').replace('，', '|').replace(';','|').replace('.', '').strip()

            except:
                item["kind"] = ''
        else:
            item["kind"] = ''

        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

        if item["linkman"]:
            item["linkman"] = item["linkman"]
        else:
            item["linkman"] = ''
        item["linkman"] = self.cw.search_linkman(item["linkman"])

        if item["phone"]:
            item["phone"] = self.cw.search_phone_num(item["phone"])
        else:
            item["phone"] = ''

        if item["telephone"]:
            item["telephone"] = self.cw.search_telephone_num(item["telephone"])
        else:
            item["telephone"] = ''

        if item["contact_Fax"]:
            item["contact_Fax"] = self.cw.search_contact_Fax(
                item["contact_Fax"])
        else:
            item["contact_Fax"] = ''

        if item["E_Mail"]:
            item["E_Mail"] = self.cw.search_email(item["E_Mail"])
        else:
            item["E_Mail"] = ''

        if item["contact_QQ"]:
            item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
        else:
            try:
                item["contact_QQ"] = self.cw.search_QQ(item["E_Mail"])
            except:
                item["contact_QQ"] = ''

        if item["company_address"]:
            item["company_address"] = self.cw.search_address(
                item["company_address"])
        else:
            item["company_address"] = ''

        yield item

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

Exemplo n.º 14

0

Exibir arquivo

Arquivo: kids61.py Projeto: cogitozz/Big_Scrapy_Spider

class TongZhuangQiYeWangSpider(CrawlSpider):
    name = "61kids"
    allowed_domains = ['www.61kids.com.cn']
    start_urls = ['http://www.61kids.com.cn/dressunion/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"

    custom_settings = {
        'DOWNLOAD_DELAY': 0,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            # "Cache-Control": "no-cache",
            "Connection":
            "keep-alive",
            # "Cookie": "Hm_lvt_7b0dee6397672d912c9cfc5ce2c321d2=1568272622; Hm_lpvt_7b0dee6397672d912c9cfc5ce2c321d2=1568272862; KiDs_member_htc_86506_86506=fc96DkuXLEkLAe6X6-yladuhZt1sZWrWjZhvydEQUw; KiDs_member_htc_416770_416770=013fQYlCVAMyQsG7kIiH6sHxGqtSOAgPu0IAU06W4Q",
            "Host":
            "www.61kids.com.cn",
            # "Pragma": "no-cache",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
    }
    # # /c3847/p2/
    rules = (
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@class='zy1 t10']//ul//li//a")),
             callback="parse_items",
             follow=True),
        Rule(LinkExtractor(allow=r".*",
                           restrict_xpaths=("//a[contains(text(),'下一页')]")),
             follow=True),
    )

    # def parse(self, response):
    #     a_list = response.xpath("//div[@class='main_content']//ul//li//h4/../@href").getall()
    #     for a in a_list:
    #         print(a)
    #         yield scrapy.Request(
    #             url=a,
    #             callback=self.parse_items,
    #             dont_filter=True
    #         )

    def parse_items(self, response):
        item = TongZhuangQiYeWangItem()
        pattern_c = re.compile(r'<meta name="keywords" content="(.*?)" />',
                               re.S)
        pattern_k = re.compile(r'>主营：(.*?)\s*<', re.S)
        pattern_area = re.compile(r'>\s*公司所在地：(.*?)  (.*?)  <br/>', re.S)
        pattern_add = re.compile(r'>\s*地址：(.*?)<br />', re.S)
        pattern_tp = re.compile(r'>\s*电话：(.*?)<br />', re.S)
        pattern_ph = re.compile(r'>\s*手机：(.*?)<br />', re.S)
        pattern_fx = re.compile(r'>\s*传真：(.*?)<br />', re.S)
        pattern_ad = re.compile(r'>\s*地址：(.*?)<', re.S)
        pattern_add = re.compile(r'址：(.*?)<', re.S)
        # pattern_tp = re.compile(r'\(?0\d{2,3}[)-]?\d{7,8}', re.S)
        pattern_ph = re.compile(r'\(?0\d{2,3}[)-]?\d{7,8}', re.S)
        pattern_fx = re.compile(r'真：(.*?)<', re.S)
        pattern_e = re.compile(r'箱：(.*?)<')

        pattern_em = re.compile(r'(.*?)@163.com', re.S)
        pattern_qq = re.compile(r'(\d+)@qq.com', re.S)
        item["company_Name"] = response.xpath(
            "//div[@class='lb2_12']/a/span/text()").get()
        item["company_address"] = response.xpath(
            "//span[contains(text(),'地　址：')]/following-sibling::p/text()").get(
            )
        item["linkman"] = "".join(
            response.xpath("//li[contains(text(),'联系人：')]/text()").extract())
        item["telephone"] = "".join(
            response.xpath("//span[@class='brand_phone']//text()").getall())
        item["phone"] = "".join(
            re.findall(pattern_ph, response.text)) if re.findall(
                pattern_ph, response.text) else ''
        item["contact_Fax"] = "".join(
            re.findall(pattern_fx, response.text)) if re.findall(
                pattern_fx, response.text) else ''
        item["contact_QQ"] = "".join(
            re.findall(pattern_qq, response.text)) if re.findall(
                pattern_qq, response.text) else ''
        item["E_Mail"] = "".join(
            re.findall(pattern_em, response.text)) if re.findall(
                pattern_em, response.text) else ''
        item["Source"] = response.url
        item["kind"] = response.xpath(
            "//span[contains(text(),'主　营：')]/following-sibling::p/text()").get(
            )
        # city_infos = response.xpath("//td[contains(text(),'所在地区：')]/following-sibling::td/text()").get()
        item["province"] = "".join(
            re.findall(pattern_area, response.text)[0][0]) if re.findall(
                pattern_area, response.text) else ''
        item["city_name"] = "".join(
            re.findall(pattern_area, response.text)[0][1]) if re.findall(
                pattern_area, response.text) else ''

        if item["company_Name"]:
            item["company_Name"] = self.cw.search_company(item["company_Name"])
        else:
            item["company_Name"] = ''
            return
        item["company_id"] = self.get_md5(item["company_Name"])

        if item["kind"] and item["kind"] != '':
            item["kind"] = item["kind"].replace(" ", '|')
            item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营|主营项目：', '', item["kind"]).replace('-', '|')\
                .replace('、', '|').replace(',', '|').replace('，', '|').replace(';','|').replace('.', '').strip()
        else:
            try:
                item["kind"] = "".join(re.findall(
                    pattern_k, response.text)) if re.findall(
                        pattern_k, response.text) else ''
                item["kind"] = item["kind"].replace(" ", '|')
                item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营|主营项目：', '', item["kind"]).replace('-', '|') \
                    .replace('、', '|').replace(',', '|').replace('，', '|').replace(';', '|').replace('.', '').strip()
            except:
                item["kind"] = ''

        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

        if item["linkman"]:
            item["linkman"] = self.cw.search_linkman(item["linkman"])
        else:
            try:
                item["linkman"] = "".join(
                    response.xpath("//p[@class='fd_zw']//b//text()").getall())
                item["linkman"] = re.sub(r'\s|\r|\t|\n', '', item["linkman"])
            except:
                item["linkman"] = ''
        item["linkman"] = self.cw.search_linkman(item["linkman"])

        if item["phone"]:
            item["phone"] = self.cw.search_phone_num(item["phone"])
        else:
            item["phone"] = ''

        if item["telephone"]:
            item["telephone"] = self.cw.search_telephone_num(item["telephone"])
        else:
            item["telephone"] = ''

        if item["contact_Fax"]:
            item["contact_Fax"] = self.cw.search_contact_Fax(
                item["contact_Fax"])
        else:
            item["contact_Fax"] = ''

        if item["E_Mail"]:
            item["E_Mail"] = self.cw.search_email(item["E_Mail"])
        else:
            try:
                item["E_Mail"] = "".join(re.findall(
                    pattern_e, response.text)) if re.findall(
                        pattern_e, response.text) else ''
            except:
                item["E_Mail"] = ''
        item["E_Mail"] = self.cw.search_email(item["E_Mail"])

        if item["contact_QQ"]:
            item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
        else:
            item["contact_QQ"] = ''

        if item["company_address"]:
            item["company_address"] = item["company_address"].replace(
                "联系地址：", "")
            item["company_address"] = self.cw.search_address(
                item["company_address"])
        else:
            try:
                item["company_address"] = "".join(
                    re.findall(pattern_add, response.text)) if re.findall(
                        pattern_add, response.text) else ''
            except:
                item["company_address"] = ''
        item["company_address"] = self.cw.search_address(
            item["company_address"])

        # if city_infos:
        #     if '/' in city_infos:
        #         try:
        #             item["province"] = city_infos.split('/')[0]
        #             item["city_name"] = city_infos.split('/')[1]
        #         except:
        #             item["province"] = ''
        #             item["city_name"] = ''
        #     else:
        #         item["province"] = ''
        #         item["city_name"] = ''
        # else:
        #     item["province"] = ''
        #     item["city_name"] = ''

        yield item

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

Exemplo n.º 15

0

Exibir arquivo

class ShiPinDaiLiWangSpider(CrawlSpider):
    name = "spdl"
    allowed_domains = ['spdl.com', 'www.spdl.com']
    start_urls = ['http://www.spdl.com/company/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"

    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
    }
    # /c3847/p2/
    rules = (
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//ul[@class='clearfix key-choice tac']//li//a")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=
            ("//div[@class='main-list']//li[@class='clearfix']//a[contains(text(),'联系方式')]"
             )),
             callback='parse_items',
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='newpage']//a[contains(text(),'下一页')]")),
             follow=True),
    )

    def parse_items(self, response):
        item = ShiPinDaiLiWangItem()
        if "浏览量：" or "展位会员" in response.text:
            pattern_l = re.compile(
                r'<div class="mobile-kf">\s*<span>联系人:(.*?)</span>', re.S)
            item["company_Name"] = response.xpath(
                "//td[contains(text(),'公司名称：')]//a/text()").get()
            item["kind"] = ",".join(
                response.xpath(
                    "//td[@align='center']//span//a/text()").getall())
            item["linkman"] = "".join(re.findall(
                pattern_l, response.text)) if re.findall(
                    pattern_l, response.text) else ''
            item["company_address"] = "".join(
                response.xpath(
                    "//li[contains(text(),'地 址:')]//text()").getall())
            item["telephone"] = "".join(
                response.xpath(
                    "//li[@class='haoma']//span[@class='linkstr']//text()").
                getall())
            item["phone"] = "".join(
                response.xpath(
                    "//li[contains(text(),'手 机: ')]//span/a//text()").getall())
            item["contact_Fax"] = "".join(
                response.xpath(
                    "//li[contains(text(),'传 真: ')]//span//text()").getall())
            item["contact_QQ"] = "".join(
                response.xpath(
                    "//li[contains(text(),'Q:')]//span//text()").getall())
            item["E_Mail"] = "".join(
                response.xpath(
                    "//li[contains(text(),'邮 箱: ')]//span//text()").getall())
            item["Source"] = response.url
            item["province"] = ''
            item["city_name"] = ''

            if item["company_Name"] and item["company_Name"] != '':
                if "（" in item["company_Name"]:
                    item["company_Name"] = item["company_Name"].split('（')[0]
                elif "(" in item["company_Name"]:
                    item["company_Name"] = item["company_Name"].split('(')[0]
                elif "_" in item["company_Name"]:
                    item["company_Name"] = item["company_Name"].split('_')[0]
                elif "-" in item["company_Name"]:
                    item["company_Name"] = item["company_Name"].split('-')[0]
                else:
                    item["company_Name"] = re.sub(
                        r'\n|\s|\r|\t|公司名称：', '',
                        item["company_Name"]).replace(' ', '').strip()
            else:
                return
            item["company_id"] = self.get_md5(item["company_Name"])

            if item["kind"]:
                item["kind"] = item["kind"].replace(" ", '|')
                item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营|主营项目：', '', item["kind"]).replace('-', '|')\
                    .replace('、', '|').replace(',', '|').replace('，', '|').replace(';','|').replace('.', '').strip()
            else:
                item["kind"] = ''

            item["kind"] = self.cw.rinse_keywords(
                self.cw.replace_ss(item["kind"]))

            if item["linkman"]:
                item["linkman"] = item["linkman"].replace(
                    '联 系 人:', '').replace('请您点击留言，留言后将显示联系方式！', '')
            else:
                item["linkman"] = ''
            item["linkman"] = self.cw.search_linkman(item["linkman"])

            if item["phone"]:
                item["phone"] = item["phone"].replace('请您点击留言，留言后将显示联系方式！', '')
                item["phone"] = self.cw.search_phone_num(item["phone"])
            else:
                item["phone"] = ''

            if item["telephone"]:
                item["telephone"] = item["telephone"].replace(
                    '请您点击留言，留言后将显示联系方式！', '')
                item["telephone"] = self.cw.search_telephone_num(
                    item["telephone"])
            else:
                item["telephone"] = ''

            if item["contact_Fax"]:
                item["contact_Fax"] = item["contact_Fax"].replace(
                    '请您点击留言，留言后将显示联系方式！', '')
                item["contact_Fax"] = self.cw.search_contact_Fax(
                    item["contact_Fax"])
            else:
                item["contact_Fax"] = ''

            if item["E_Mail"]:
                item["E_Mail"] = item["E_Mail"].replace(
                    '请您点击留言，留言后将显示联系方式！', '')
                item["E_Mail"] = self.cw.search_email(item["E_Mail"])
            else:
                item["E_Mail"] = ''

            if item["contact_QQ"]:
                item["contact_QQ"] = item["contact_QQ"].replace(
                    '请您点击留言，留言后将显示联系方式！', '')
                item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
            else:
                try:
                    item["contact_QQ"] = self.cw.search_email(item["E_Mail"])
                except:
                    item["contact_QQ"] = ''

            if item["company_address"]:
                item["company_address"] = item["company_address"].replace('请您点击留言，留言后将显示联系方式！','')\
                    .replace('地 址:','')
            else:
                item["company_address"] = ''
            item["company_address"] = self.cw.search_address(
                item["company_address"])

            yield item

        else:
            pattern = re.compile(
                r'<meta name="keywords" content=".*?,(.*?)"/>', re.S)
            pattern_area = re.compile(
                r";\s*map.Address = '(.*?),(.*?),.*?,,';", re.S)
            pattern_k = re.compile(r'<p>食品代理网-专业的(.*?)服务平台。</p>', re.S)
            item["company_Name"] = response.xpath(
                "//p[contains(text(),'公司名称：')]//a/text()").extract_first()
            item["company_address"] = response.xpath(
                "//div[@id='contactleft']//p[contains(text(),'地址：')]/text()"
            ).extract_first()
            item["linkman"] = response.xpath(
                "//div[@class='zs-phone fr']//span/h4/text()").extract_first()
            item["telephone"] = response.xpath(
                "//div[@class='zs-phone fr']//span/p/text()").extract_first()
            item["phone"] = response.xpath(
                "//td[contains(text(),'手机号码：')]/following-sibling::td/text()"
            ).extract_first()
            item["contact_Fax"] = response.xpath(
                "//td[contains(text(),'公司传真：')]/following-sibling::td/text()"
            ).extract_first()
            item["contact_QQ"] = response.xpath(
                "//img[@title='点击QQ交谈/留言']/../@href").extract_first()
            item["E_Mail"] = response.xpath(
                "//p[contains(text(),'邮箱：')]/text()").extract_first()
            item["Source"] = response.url
            item["kind"] = ",".join(re.findall(
                pattern_k, response.text)) if re.findall(
                    pattern_k, response.text) else ''
            # city_infos = response.xpath("//dt[contains(text(),'所在地区：')]/following-sibling::dd/text()").get()
            item["province"] = "".join(
                re.findall(pattern_area, response.text)[0][0]) if re.findall(
                    pattern_area, response.text) else ''
            item["city_name"] = "".join(
                re.findall(pattern_area, response.text)[0][1]) if re.findall(
                    pattern_area, response.text) else ''

            if item["company_Name"] and item["company_Name"] != '':
                if "（" in item["company_Name"]:
                    item["company_Name"] = item["company_Name"].split('（')[0]
                elif "(" in item["company_Name"]:
                    item["company_Name"] = item["company_Name"].split('(')[0]
                elif "_" in item["company_Name"]:
                    item["company_Name"] = item["company_Name"].split('_')[0]
                elif "-" in item["company_Name"]:
                    item["company_Name"] = item["company_Name"].split('-')[0]
                else:
                    item["company_Name"] = re.sub(
                        r'\n|\s|\r|\t|公司名称：', '',
                        item["company_Name"]).replace(' ', '').strip()
            else:
                return
            item["company_id"] = self.get_md5(item["company_Name"])

            if item["kind"]:
                item["kind"] = item["kind"].replace(" ", '|')
                item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营|主营项目：', '', item["kind"]).replace('-', '|')\
                    .replace('、', '|').replace(',', '|').replace('，', '|').replace(';','|').replace('.', '').strip()
            else:
                item["kind"] = ''

            item["kind"] = self.cw.rinse_keywords(
                self.cw.replace_ss(item["kind"]))

            if item["linkman"]:
                item["linkman"] = item["linkman"].replace('联 系 人:', '')
            else:
                item["linkman"] = ''
            item["linkman"] = self.cw.search_linkman(item["linkman"])

            if item["phone"]:
                item["phone"] = self.cw.search_phone_num(item["phone"])
            else:
                item["phone"] = ''

            if item["telephone"]:
                item["telephone"] = self.cw.search_telephone_num(
                    item["telephone"])
            else:
                item["telephone"] = ''

            if item["contact_Fax"]:
                item["contact_Fax"] = self.cw.search_contact_Fax(
                    item["contact_Fax"])
            else:
                item["contact_Fax"] = ''

            if item["E_Mail"]:
                item["E_Mail"] = self.cw.search_email(item["E_Mail"])
            else:
                item["E_Mail"] = ''

            if item["contact_QQ"]:
                item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
            else:
                try:
                    item["contact_QQ"] = self.cw.search_email(item["E_Mail"])
                except:
                    item["contact_QQ"] = ''

            if item["company_address"]:
                item["company_address"] = self.cw.search_address(
                    item["company_address"])
            else:
                item["company_address"] = ''

            # if city_infos:
            #     if '/' in city_infos:
            #         try:
            #             item["province"] = city_infos.split('/')[0]
            #             item["city_name"] = city_infos.split('/')[1]
            #         except:
            #             item["province"] = ''
            #             item["city_name"] = ''
            #     else:
            #         item["province"] = ''
            #         item["city_name"] = ''
            # else:
            #     item["province"] = ''
            #     item["city_name"] = ''

            yield item

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

Exemplo n.º 16

0

Exibir arquivo

class YiShangWangSpider(CrawlSpider):
    name = "esw"
    allowed_domains = ['www.esw.com.cn']
    start_urls = ['http://www.esw.com.cn/company/default.aspx?page=1']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"

    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            # "Connection": "keep-alive",
            # "Cookie": "Hm_lvt_dd0c9f5bb6bab19ccc2b13c4ec58552a=1566177749; CCKF_visitor_id_92126=1219631166; yunsuo_session_verify=db1a03528b7dfe197918cf533946c447; bdshare_firstime=1566178685689; Hm_lpvt_dd0c9f5bb6bab19ccc2b13c4ec58552a=1566178686",
            # "Host": "jamesni139.tybaba.com",
            # "Referer": "http://jamesni139.tybaba.com/",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
        # 不验证SSL证书
        # "DOWNLOAD_HANDLERS_BASE": {
        #     'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
        #     'http': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
        #     'https': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
        #     's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
        # },
        # "DOWNLOAD_HANDLERS": {
        #     'https': 'BigB2BSpider.custom.downloader.handler.https.HttpsDownloaderIgnoreCNError'},
    }
    # /c3847/p2/
    rules = (
        Rule(LinkExtractor(
            allow=r"\/member\/.*?\.html",
            restrict_xpaths=(
                "//div[@class='nt_left']//div[@class='yllist']//ul//li//span//a"
            )),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@id='AspNetPager1']//a[contains(text(),'下一页')]")),
             follow=True),
        Rule(LinkExtractor(allow=r"\/member\/contact\d+\.html",
                           restrict_xpaths=("///a[contains(text(),'联系我们')]")),
             callback='parse_items',
             follow=True),
    )

    def parse_items(self, response):
        pattern = re.compile(r'<META name=keywords content=(.*?),.*?>', re.S)
        pattern1 = re.compile(r'<p>\s*联系人：<span id="Span3">(.*?)</span></p>',
                              re.S)
        pattern2 = re.compile(r'<p>\s*手机：<span id="Span2">(.*?)</span></p>',
                              re.S)
        pattern3 = re.compile(r'<p>\s*电话：<span id="x_tel">(.*?)</span></p>',
                              re.S)
        pattern4 = re.compile(r'<p>\s*传真：<span id="x_fax">(.*?)</span></p>',
                              re.S)
        pattern5 = re.compile(r'</p>\s*地址:<span id="x_address">(.*?)<span>',
                              re.S)
        pattern6 = re.compile(r'<META name=keywords content=.*?,(.*?)>', re.S)
        item = YiShangWangItem()
        item["company_Name"] = response.xpath(
            "//div[contains(text(),'公司全称：')]/following-sibling::div/text()"
        ).get()
        item["kind"] = response.xpath(
            "//div[contains(text(),'主营业务： ')]/following-sibling::div/text()"
        ).get()
        item["company_address"] = response.xpath(
            "//div[contains(text(),'址：')]/following-sibling::div/text()").get(
            )
        item["linkman"] = response.xpath(
            "//div[contains(text(),'联系人')]/following-sibling::div/text()").get(
            )
        item["telephone"] = response.xpath(
            "//div[contains(text(),'话：')]/following-sibling::div/text()").get(
            )
        item["phone"] = response.xpath(
            "//div[contains(text(),'联系人')]/following-sibling::div/text()").get(
            )
        item["contact_Fax"] = response.xpath(
            "//div[contains(text(),'话：')]/following-sibling::div/text()").get(
            )
        item["contact_QQ"] = ''
        item["E_Mail"] = response.xpath(
            "//div[contains(text(),'邮')]/following-sibling::div/a/text()").get(
            )
        item["Source"] = response.url
        item["province"] = ""
        item["city_name"] = ""

        if item["company_Name"]:
            item["company_Name"] = re.sub(r'\n|\s|\r|\t|公司名称：|企 业 名 称：', '',
                                          item["company_Name"]).replace(
                                              ' ', '').strip()
        else:
            try:
                item["company_Name"] = "".join(
                    re.findall(pattern, response.text)) if re.findall(
                        pattern, response.text) else ''
                item["company_Name"] = re.sub(r'\n|\s|\r|\t|公司名称：|企 业 名 称：',
                                              '',
                                              item["company_Name"]).replace(
                                                  ' ', '').strip()
            except:
                item["company_Name"] = ''
        item["company_id"] = self.get_md5(item["company_Name"])

        if item["kind"]:
            item["kind"] = item["kind"].replace(' ', '|')
            item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营', '', item["kind"]).replace('-', '|').replace('、', '|') \
                .replace(',', '|').replace('，', '|').replace(';','|').replace('.', '').strip()
        else:
            try:
                item["kind"] = "".join(re.findall(
                    pattern6, response.text)) if re.findall(
                        pattern6, response.text) else ''
                item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营', '', item["kind"]).replace('-', '|').replace('、', '|') \
                    .replace(',', '|').replace('，', '|').replace(';', '|').replace('.', '').strip()
            except:
                item["kind"] = ''

        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

        if item["linkman"]:
            try:
                # '唐总   13533003050(手机)  '
                item["linkman"] = item["linkman"].split(' ')[0]
            except:
                item["linkman"] = ''
        else:
            try:
                item["linkman"] = "".join(re.findall(
                    pattern1, response.text)) if re.findall(
                        pattern1, response.text) else ''
            except:
                item["linkman"] = ''
        item["linkman"] = self.cw.search_linkman(item["linkman"])

        if item["phone"] and "手机" in item["phone"]:
            pattern = re.compile(r'(\d+)', re.S)
            item["phone"] = re.sub(r'\s|\n|\r|\t', '', item["phone"])
            item["phone"] = "".join(re.findall(pattern, item["phone"]))
            item["phone"] = self.cw.search_phone_num(item["phone"])
        else:
            try:
                item["phone"] = "".join(re.findall(
                    pattern2, response.text)) if re.findall(
                        pattern2, response.text) else ''
            except:
                item["phone"] = ''

        if item["telephone"]:
            try:
                item["telephone"] = item["telephone"].split(' ')[0]
                item["telephone"] = self.cw.search_telephone_num(
                    item["telephone"])
            except:
                item["telephone"] = ''
        else:
            try:
                item["telephone"] = "".join(re.findall(
                    pattern3, response.text)) if re.findall(
                        pattern3, response.text) else ''
            except:
                item["telephone"] = ''

        if item["contact_Fax"] and "传真：" in item["contact_Fax"]:
            # '020-81633545   传真：020-81633545'
            try:
                item["contact_Fax"] = item["contact_Fax"].split('传真：')[-1]
                item["contact_Fax"] = self.cw.search_contact_Fax(
                    item["contact_Fax"])
            except:
                item["contact_Fax"] = ''
        else:
            try:
                item["contact_Fax"] = "".join(
                    re.findall(pattern4, response.text)) if re.findall(
                        pattern4, response.text) else ''
            except:
                item["contact_Fax"] = ''

        if item["E_Mail"]:
            item["E_Mail"] = self.cw.search_email(item["E_Mail"])
        else:
            item["E_Mail"] = ''

        if item["E_Mail"]:
            item["contact_QQ"] = self.cw.search_QQ(item["E_Mail"])
        else:
            item["contact_QQ"] = ''

        if item["company_address"]:
            item["company_address"] = self.cw.search_address(
                item["company_address"])
        else:
            try:
                item["company_address"] = "".join(
                    re.findall(pattern5, response.text)) if re.findall(
                        pattern5, response.text) else ''
            except:
                item["company_address"] = ''

        yield item

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

Exemplo n.º 17

0

Exibir arquivo

Arquivo: ksb.py Projeto: cogitozz/Big_Scrapy_Spider

class KuSoBaSpider(CrawlSpider):
    name = "ksb"
    allowed_domains = ['www.kusoba.com']
    start_urls = ['http://www.kusoba.com/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"

    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'ITEM_PIPELINES': {'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302},
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            # "Host": "www.kusoba.com",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 543,
        }
    }
    # /c3847/p2/
    rules = (
        Rule(LinkExtractor(
            allow=r"/.*/",restrict_xpaths=("//div[@class='beij_center']//div[@class='wenzlieb']//a")), follow=True),

        Rule(LinkExtractor(
            allow=r"/company/\d+.html", restrict_xpaths=("//div[@class='beij_center']//div[@class='nianmf']//a")),
            callback='parse_items', follow=False),

        Rule(LinkExtractor(allow=r"/.*/p.*/", restrict_xpaths=("//div[@class='fanye']//a")), follow=True),
    )

    def parse_items(self, response):
        # print(response.text)
        pattern = re.compile(r'<li><p>联系我们：</p><span>(.*?)</span></li>',re.S)
        pattern1 = re.compile(r'<li><p>电<em></em>话：</p><span>(.*?)</span></li>', re.S)
        pattern2 = re.compile(r'<li><p>移动电话：</p><span>(.*?)</span></li>', re.S)
        pattern3 = re.compile(r'<li><p>传<em></em>真：</p><span>(.*?)</span></li>', re.S)
        item = KuSoBaspiderItem()
        item["company_Name"] = "".join(response.xpath("//div[contains(text(),'全称：')]/text()").extract())
        item["kind"] = "".join(response.xpath("//div[contains(text(),'主营产品：')]/text()").extract())
        item["company_address"] = "".join(response.xpath("//div[contains(text(),'注册地址：')]/text()").extract())
        item["linkman"] = "".join(response.xpath("//div[contains(text(),'法定代表人：')]/text()").extract())
        item["telephone"] = "".join(re.findall(pattern1,response.text)) if re.findall(pattern1,response.text) else ''
        item["phone"] = "".join(re.findall(pattern2,response.text)) if re.findall(pattern2,response.text) else ''
        item["contact_Fax"] = "".join(re.findall(pattern3,response.text)) if re.findall(pattern3,response.text) else ''
        item["contact_QQ"] = ""
        item["E_Mail"] = ""
        item["Source"] = response.url
        item["province"] = ""
        item["city_name"] = ""

        if item["company_Name"]:
            item["company_Name"] = re.sub(r'\n|\s|\r|\t|公司名称：|全称：', '', item["company_Name"]).replace(' ', '').strip()
        item["company_id"] = self.get_md5(item["company_Name"])

        if item["kind"]:
            item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营|主营产品：', '', item["kind"]).replace('-', '|').replace('、', '|') \
                .replace('，', '|').replace('，', '|').replace('.', '').strip()
        else:
            item["kind"] = ''

        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

        if item["linkman"]:
            if "（" in item["linkman"]:
                item["linkman"] = item["linkman"].split("（")[0].replace('法定代表人：','').replace('暂未公布','')
            else:
                item["linkman"] = item["linkman"].replace('法定代表人：','').replace('暂未公布','')
        else:
            item["linkman"] = ''
        item["linkman"] = self.cw.search_linkman(item["linkman"])

        if item["phone"]:
            item["phone"] = self.cw.search_phone_num(item["phone"])
        else:
            item["phone"] = ''

        if item["telephone"]:
            item["telephone"] = self.cw.search_telephone_num(item["telephone"])
        else:
            item["telephone"] = ''

        if item["contact_Fax"]:
            item["contact_Fax"] = self.cw.search_contact_Fax(item["contact_Fax"])
        else:
            item["contact_Fax"] = ''

        if item["E_Mail"]:
            item["E_Mail"] = self.cw.search_email(item["E_Mail"])
        else:
            item["E_Mail"] = ''

        if item["contact_QQ"]:
            item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
        else:
            item["contact_QQ"] = ''

        if item["company_address"]:
            item["company_address"] = item["company_address"].replace('注册地址：','')
            item["company_address"] = self.cw.search_address(item["company_address"])
        else:
            item["company_address"] = ''

        yield item

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

Exemplo n.º 18

0

Exibir arquivo

Arquivo: diytrade.py Projeto: cogitozz/Big_Scrapy_Spider

class ZiZhuMaoYiWangSpider(CrawlSpider):
    # 自助贸易网
    name = 'diytrade'
    allowed_domains = ['cn.diytrade.com','diytrade.com']
    start_urls = ['https://cn.diytrade.com/china/main.html']
    cw = CleanWords()

    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,  # 延时最低为2s
        # 'AUTOTHROTTLE_ENABLED': True,  # 启动[自动限速]
        # 'AUTOTHROTTLE_DEBUG': True,  # 开启[自动限速]的debug
        # 'AUTOTHROTTLE_MAX_DELAY': 10,  # 设置最大下载延时
        'DOWNLOAD_TIMEOUT': 5, #设置下载超时
        'CONCURRENT_REQUESTS_PER_DOMAIN': 5, # 限制对该网站的并发请求数
        'ITEM_PIPELINES': {'BigB2BSpider.pipelines.MysqlTwistedPiplines': 302},
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            # "Cookie": "Hm_lvt_34c005d4caf30d75012a05867beca619=1562747829,1564973957; Hm_lpvt_34c005d4caf30d75012a05867beca619=1564974049",
            # "Host": "b2b.huishangbao.com",
            # "Referer": "http://b2b.huishangbao.com/",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 543,
            # 'BigB2BSpider.middlewares.ProcessAllExceptionMiddleware': 120,
        }
    }

    def parse(self, response):
        a_list = response.xpath("//div[@class='prodCatListDIV']//ul[@class='prodCatList']//li//a")
        for a in a_list:
            kind_name = a.xpath("./text()").extract_first()
            kind_href = a.xpath("./@href").extract_first()
            if kind_href:
                if kind_href.startswith("http://"):
                    kind_href = kind_href
                else:
                    kind_href = "https://cn.diytrade.com" + kind_href
                # print(kind_name,kind_href)
                yield scrapy.Request(
                    url=kind_href,
                    callback=self.parse_kind_list,
                    dont_filter=True
                )

    def parse_kind_list(self, response):
        s_kind_href = response.xpath("//a[contains(text(),'» 公司信息')]/@href").extract_first()
        if s_kind_href:
            s_kind_href = "https://cn.diytrade.com" + s_kind_href
            yield scrapy.Request(
                url=s_kind_href,
                callback=self.parse_company_list,
                dont_filter=True
            )

    def parse_company_list(self, response):
        div_list = response.xpath("//form[@name='itemForm']//ul[@class='comItems']//li")
        for div in div_list:
            item = ZiZhuMaoYiWangspiderItem()
            item["company_Name"] = div.xpath(".//div[@class='col3']/h3/a/text()").extract_first()
            company_href = div.xpath(".//div[@class='col3']/h3/a/@href").extract_first()
            if company_href:
                # print(company_href)
                company_href = "https://cn.diytrade.com" + company_href
                # print(contact_href)
                yield scrapy.Request(
                    url=company_href,
                    callback=self.parse_company_contact,
                    meta={"item": item},
                    dont_filter=True
                )

        next_page_url = response.xpath("//div[@class='clearfix pageNavList']//a[contains(text(),'下一页')]/@href").extract_first()
        if next_page_url:
            next_page_url = "https://cn.diytrade.com" + next_page_url
            yield scrapy.Request(
                url=next_page_url,
                callback=self.parse_company_list
            )

    # def parse_company_detail(self, response):
    #     item = response.meta["item"]
    #     contact_href = response.xpath("//li//a[contains(text(),'联系我们')]/@href").extract_first()
    #     if contact_href:
    #         yield scrapy.Request(
    #             url=contact_href,
    #             callback=self.parse_company_contact,
    #             meta={"item": item},
    #             dont_filter=True
    #         )


    def parse_company_contact(self, response):
        item = response.meta["item"]
        pattern = re.compile(r'uin=(.*?)&',re.S)
        # item["company_Name"] = response.xpath("//th[contains(text(),'公司名称︰')]/following-sibling::td/text()").extract_first()
        item["kind"] = response.xpath("//th[contains(text(),'主营行业︰')]/following-sibling::td/h3/text()").extract_first()
        item["company_address"] = response.xpath("//th[contains(text(),'地址︰')]/following-sibling::td/text()").extract_first()
        item["linkman"] = "".join(response.xpath("//th[contains(text(),'联系人︰')]/following-sibling::td/text()").extract())
        item["telephone"] = "".join(response.xpath("//th[contains(text(),'电话︰')]/following-sibling::td/text()").extract())
        item["phone"] = "".join(response.xpath("//th[contains(text(),'手机︰')]/following-sibling::td/text()").extract())
        item["E_Mail"] = "".join(response.xpath("//th[contains(text(),'公司邮箱︰')]/following-sibling::td/text()").extract())
        item["contact_Fax"] = response.xpath("//th[contains(text(),'传真︰')]/following-sibling::td/text()").extract_first()
        item["contact_QQ"] = "".join(response.xpath("//img[@title='点击这里给我发消息']/../@href").extract())
        item["Source"] = response.url

        city_infos = response.xpath("//th[contains(text(),'国家/地区︰')]/following-sibling::td/h3/text()").extract_first()
        if city_infos:
            pattern1 = re.compile(r'(.*?)省(.*?)市',re.S)
            # 广东/潮州市
            # 广东省深圳市
            try:
                item["province"] = re.findall(pattern1,city_infos)[0][0]
                item["city_name"] = re.findall(pattern1,city_infos)[0][1]
            except:
                item["province"] = ''
                item["city_name"] = ''
        else:
            item["province"] = ''
            item["city_name"] = ''

        if item["company_Name"]:
            item["company_Name"] = re.sub(r'\n|\s|\r|\t|公司名称：|（个人账号）', '', item["company_Name"]).replace(' ', '').strip()
        item["company_id"] = self.get_md5(item["company_Name"])

        if item["kind"]:
            item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营', '', item["kind"]).replace(' ', '|').replace('-', '|')\
                .replace('、', '|').replace('，', '|').replace('，', '|').replace('.', '').strip()
        else:
            item["kind"] = ''

        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

        if item["linkman"]:
            item["linkman"] = item["linkman"]
        else:
            item["linkman"] = ''
        item["linkman"] = self.cw.search_linkman(item["linkman"])

        if item["phone"]:
            item["phone"] = item["phone"]
        else:
            item["phone"] = ''
        item["phone"] = self.cw.search_phone_num(item["phone"])

        if item["telephone"]:
            item["telephone"] = item["telephone"]
        else:
            item["telephone"] = ''
        item["telephone"] = self.cw.search_telephone_num(item["telephone"])

        if item["contact_Fax"]:
            item["contact_Fax"] = item["contact_Fax"]
        else:
            item["contact_Fax"] = ''
        item["contact_Fax"] = self.cw.search_contact_Fax(item["contact_Fax"])

        if item["contact_QQ"]:
            item["contact_QQ"] = "".join(re.findall(pattern,item["contact_QQ"])) if re.findall(pattern,item["contact_QQ"]) else ''
        else:
            item["contact_QQ"] = ''
        item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])

        if item["E_Mail"]:
            item["E_Mail"] = self.cw.search_email(item["E_Mail"])
        else:
            item["E_Mail"] = (item["contact_QQ"] + "@qq.com") if item["contact_QQ"] else ''

        if item["company_address"]:
            item["company_address"] = item["company_address"].replace(",","").replace('，', '|').strip()
        else:
            item["company_address"] = ''
        item["company_address"] = self.cw.search_address(item["company_address"])

        # if item["host_href"]:
        #     item["host_href"] = item["host_href"]
        # else:
        #     item["host_href"] = ''

        yield item

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

Exemplo n.º 19

0

Exibir arquivo

class ShangLuWang(CrawlSpider):
    name = 'shl'
    allowed_domains = ['www.b2b6.com']
    start_urls = ['http://www.b2b6.com']
    cw = CleanWords()

    custom_settings = {
        'DOWNLOAD_DELAY': 0.2,
        'DEFAULT_REQUEST_HEADERS': {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control":
            "max-age=0",
            "Connection":
            "keep-alive",
            # "Cookie": "__jsluid_h=5ce7ed337548f20c7a26c70933fc9b8a; UM_distinctid=16c6b51bdb62b1-0f185a00dd70c9-5a13331d-1fa400-16c6b51bdb7460; CNZZDATA4872360=cnzz_eid%3D973033440-1565166565-http%253A%252F%252Fwww.b2b6.com%252F%26ntime%3D1565166565",
            # "Host": "www.b2b6.com",
            # "Referer": "http://www.b2b6.com/yp/h1f3s0c0p10.aspx",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 543,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 543,
        }
    }

    def parse(self, response):
        a_list = response.xpath("//div[@id='dMain']//div[@class='mt']//a")
        for a in a_list:
            kind_name = a.xpath("./text()").extract_first()
            kind_href = a.xpath("./@href").extract_first()
            if kind_href:
                kind_href = "http://www.b2b6.com" + kind_href
                # print(kind_name,kind_href)
                yield scrapy.Request(url=kind_href,
                                     callback=self.parse_kind_list,
                                     dont_filter=True)

    def parse_kind_list(self, response):
        a_list = response.xpath(
            "//div[@id='dMain']//div[@id='dCatalogueBox']//ul//li//a")
        for a in a_list:
            # item["kind"] = a.xpath("./@title").extract_first()
            kind_href = a.xpath("./@href").extract_first()
            if kind_href:
                # print(item["kind"],kind_href)
                kind_href = "http://www.b2b6.com" + kind_href
                yield scrapy.Request(url=kind_href,
                                     callback=self.parse_company_list,
                                     dont_filter=True)

    def parse_company_list(self, response):
        kinds = "".join(
            response.xpath(
                "//div[@id='dNavBox']//a[contains(text(),'首页')]/..//text()").
            extract())
        # if kinds and "市" in kinds:
        #     try:
        #         # '商录分享目录首页 > 天津市 > 综合性行业'
        #         kinds = re.sub(r'\s|\n|r|\t','',kinds).replace(' ','')
        #         kinds = kinds.split('市>')[-1]
        #     except:
        #         kinds = ''
        # else:
        #     kinds = ''
        div_list = response.xpath("//div[@id='dMain']//div[@id='dMainBox']")
        for div in div_list:
            item = ShangLuWangspiderItem()
            # pattern = re.compile(r'(.*?)\/(.*?)', re.S)
            item["company_Name"] = div.xpath(".//a/text()").extract_first()
            item["company_address"] = div.xpath(
                ".//span[@class='addr']/text()").extract_first()
            company_href = div.xpath(".//a/@href").extract_first()
            item["province"] = ''
            item["city_name"] = ''
            item["kind"] = ''
            if company_href:
                # print(company_href)
                contact_href = "http://www.b2b6.com" + company_href
                # print(contact_href)
                yield scrapy.Request(url=contact_href,
                                     callback=self.parse_company_contact,
                                     meta={"item": item},
                                     dont_filter=True)

        next_page_url = response.xpath(
            "//div[@id='dMain']//div[@id='dMainBox']//b[contains(text(),'下一页')]/../@href"
        ).extract_first()
        if next_page_url:
            next_page_url = "http://www.b2b6.com" + next_page_url
            yield scrapy.Request(url=next_page_url,
                                 callback=self.parse_company_list)

    def parse_company_contact(self, response):
        item = response.meta["item"]
        pattern = re.compile(r'<b>公司名称： </b> (.*?)<br />', re.S)
        pattern1 = re.compile(r'<b>联系电话：</b> (.*?)<br />', re.S)
        pattern2 = re.compile(r'<b>公司地址：</b> (.*?)<br />', re.S)
        pattern3 = re.compile(r'<b>经营范围：</b> (.*?)<br />', re.S)
        pattern4 = re.compile(
            r'<b>网站网址：</b> <a  target=_blank  href=".*?" >(.*?)</a><br />',
            re.S)
        pattern5 = re.compile(r'<b>经济行业：</b> <a href=.*?>(.*?)</a><br />',
                              re.S)

        item["company_Name"] = "".join(re.findall(
            pattern, response.text)) if response.text else ''
        item["company_address"] = "".join(re.findall(
            pattern2, response.text)) if response.text else ''
        item["linkman"] = ''
        item["telephone"] = "".join(re.findall(
            pattern1, response.text)) if response.text else ''
        item["phone"] = ''
        item["E_Mail"] = ''
        item["contact_Fax"] = ''
        item["contact_QQ"] = ''
        item["Source"] = response.url
        kinds = response.xpath(
            "//div[@id='dNavBox']/div/a[3]/text()").extract_first()

        if item["company_Name"]:
            item["company_Name"] = re.sub(r'\n|\s|\r|\t|公司名称：', '',
                                          item["company_Name"]).replace(
                                              ' ', '').strip()
        item["company_id"] = self.get_md5(item["company_Name"])

        if item["kind"]:

            item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营产品：|主营', '', item["kind"]).replace('-', '|').replace('、', '|') \
                .replace('，', '|').replace('，', '|').replace('.', '').strip().lstrip('|')
        else:
            item["kind"] = ''

        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(kinds))
        if item["kind"]:
            try:
                item["kind"] = item["kind"].split('</')[0]
            except:
                item["kind"] = ''
        else:
            item["kind"] = ''

        if item["linkman"]:
            item["linkman"] = item["linkman"]
        else:
            item["linkman"] = ''
        item["linkman"] = self.cw.search_linkman(item["linkman"])

        if item["phone"]:
            item["phone"] = self.cw.search_phone_num(item["phone"])
        else:
            item["phone"] = ''

        if item["telephone"]:
            item["telephone"] = self.cw.search_telephone_num(item["telephone"])
        else:
            item["telephone"] = ''

        if item["contact_Fax"]:
            item["contact_Fax"] = self.cw.search_contact_Fax(
                item["contact_Fax"])
        else:
            item["contact_Fax"] = ''

        if item["E_Mail"]:
            item["E_Mail"] = item["E_Mail"]
        else:
            item["E_Mail"] = ''

        if item["contact_QQ"]:
            item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
        else:
            item["contact_QQ"] = ''

        if item["company_address"]:
            item["company_address"] = self.cw.search_address(
                item["company_address"])
        else:
            item["company_address"] = ''

        # if item["host_href"]:
        #     item["host_href"] = item["host_href"]
        # else:
        #     item["host_href"] = ''

        yield item

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

    def requests_href(self, url, headers):
        res = requests.get(url=url, headers=headers, timeout=20, verify=False)
        res.encoding = "utf-8"
        if res.status_code == requests.codes.ok:
            img = res.content
            something_img_file_path = r"F:\PythonProjects\venv\pythonProjects\BigB2BSpider\BigB2BSpider\img_src\something_img\image.png"
            with open(something_img_file_path, "wb") as fp:
                fp.write(img)
            fp.close()
            if img:
                try:
                    something = recognition_image(something_img_file_path)
                    if something:
                        return something
                    else:
                        return ''
                except:
                    return ''
            else:
                return ''
        else:
            return ''

Exemplo n.º 20

0

Exibir arquivo

class QuanQiuTieYiWangSpider(CrawlSpider):
    name = "qqtyw"
    allowed_domains = ['tybaba.com', 'www.tybaba.com']
    start_urls = ['http://www.tybaba.com/company/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"

    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection":
            "keep-alive",
            "Cookie":
            "Hm_lvt_dd0c9f5bb6bab19ccc2b13c4ec58552a=1566177749; CCKF_visitor_id_92126=1219631166; yunsuo_session_verify=db1a03528b7dfe197918cf533946c447; bdshare_firstime=1566178685689; Hm_lpvt_dd0c9f5bb6bab19ccc2b13c4ec58552a=1566178686",
            # "Host": "jamesni139.tybaba.com",
            # "Referer": "http://jamesni139.tybaba.com/",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 543,
        }
    }
    # /c3847/p2/
    rules = (
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='nav-sub']//li[@class='mod_cate']//dl//dt//a")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='listnew']//td[@align='left']//li[1]//a")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='pages']//a[contains(text(),'»')]")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@id='menu']//span[contains(text(),'联系方式')]/..")),
             callback='parse_items',
             follow=False),
    )

    def parse_items(self, response):
        pattern = re.compile(r'<meta name="keywords" content="(.*?)"/>', re.S)
        item = QuanQiuTieYiWangspiderItem()
        item["company_Name"] = response.xpath(
            "//td[contains(text(),'公司名称：')]/following-sibling::td/text()"
        ).extract_first()
        item["company_address"] = "".join(
            response.xpath(
                "//td[contains(text(),'公司地址：')]/following-sibling::td/text()").
            extract())
        item["linkman"] = response.xpath(
            "//td[contains(text(),'联 系 人：')]/following-sibling::td/text()"
        ).extract_first()
        item["telephone"] = response.xpath(
            "//td[contains(text(),'公司电话：')]/following-sibling::td/text()"
        ).extract_first()
        item["phone"] = response.xpath(
            "//td[contains(text(),'手机号码：')]/following-sibling::td/text()"
        ).extract_first()
        item["contact_Fax"] = response.xpath(
            "//td[contains(text(),'公司传真：')]/following-sibling::td/text()"
        ).extract_first()
        item["contact_QQ"] = response.xpath(
            "//td[contains(text(),'即时通讯：')]/following-sibling::td/a/@href"
        ).extract_first()
        item["E_Mail"] = response.xpath(
            "//td[contains(text(),'电子邮件：')]/following-sibling::td/text()"
        ).extract_first()
        item["kind"] = "".join(re.findall(pattern,
                                          response.text)) if re.findall(
                                              pattern, response.text) else ''
        item["Source"] = response.url
        item["province"] = ''
        item["city_name"] = ''

        if item["company_Name"]:
            item["company_Name"] = re.sub(r'\n|\s|\r|\t|公司名称：', '',
                                          item["company_Name"]).replace(
                                              ' ', '').strip()
        item["company_id"] = self.get_md5(item["company_Name"])

        if item["kind"]:
            item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营', '', item["kind"]).replace('-', '|').replace('、', '|') \
                .replace(',', '|').replace('，', '|').replace('.', '').strip()
        else:
            item["kind"] = ''

        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

        if item["linkman"]:
            item["linkman"] = item["linkman"]
        else:
            item["linkman"] = ''
        item["linkman"] = self.cw.search_linkman(item["linkman"])

        if item["phone"]:
            item["phone"] = self.cw.search_phone_num(item["phone"])
        else:
            item["phone"] = ''

        if item["telephone"]:
            item["telephone"] = self.cw.search_telephone_num(item["telephone"])
        else:
            item["telephone"] = ''

        if item["contact_Fax"]:
            item["contact_Fax"] = self.cw.search_contact_Fax(
                item["contact_Fax"])
        else:
            item["contact_Fax"] = ''

        if item["E_Mail"]:
            item["E_Mail"] = self.cw.search_email(item["E_Mail"])
        else:
            item["E_Mail"] = ''

        if item["contact_QQ"]:
            item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
        else:
            item["contact_QQ"] = ''

        if item["company_address"]:
            item["company_address"] = self.cw.search_address(
                item["company_address"])
        else:
            item["company_address"] = ''

        yield item

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

Exemplo n.º 21

0

Exibir arquivo

class QuanQiuFangZhiWangSpider(CrawlSpider):
    name = "tnc"
    allowed_domains = ['tnc.com.cn', 'www.tnc.com.cn']
    start_urls = ['https://www.tnc.com.cn/company/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"

    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control":
            "max-age=0",
            "Connection":
            "keep-alive",
            # "Host": "www.kusoba.com",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 543,
        }
    }
    # /c3847/p2/
    rules = (
        Rule(LinkExtractor(
            allow=r".*", restrict_xpaths=("//div[@class='area-list']//li//a")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='result-list-company']//p[@class='tit']//a")),
             follow=True),
        Rule(LinkExtractor(allow=r".*",
                           restrict_xpaths=("//a[@class='page-next']")),
             follow=True),
        Rule(LinkExtractor(allow=r".*",
                           restrict_xpaths=("//a[contains(text(),'联系方式')]")),
             callback='parse_items',
             follow=False),
    )

    def parse_items(self, response):
        # print(response.text)
        # <title>Jill:-舟山达利针织有限公司联系方式--全球纺织网</title>
        # 		<meta name="keywords" content="舟山达利针织有限公司，Jill，" />
        # 		<meta name="description" content="舟山达利针织有限公司负责人：Jill，手机：，座机：-0580-8805716，传真：-0580-8805500，详细地址：定海区盐仓"/>
        # '舟山达利针织有限公司负责人：Jill，手机：，座机：-0580-8805716，传真：-0580-8805500，详细地址：定海区盐仓'
        pattern = re.compile(r'<div class="jbxx_zt">(.*?)</div>', re.S)
        pattern1 = re.compile(r'<p class="indouce">(.*?)</p>', re.S)
        pattern2 = re.compile(r'手机：(.*?)，', re.S)
        pattern3 = re.compile(r'座机：(.*?)，', re.S)
        pattern4 = re.compile(r'传真：(.*?)，', re.S)
        pattern5 = re.compile(r'详细地址：(.*)', re.S)
        pattern6 = re.compile(r'负责人：(.*?)，', re.S)
        pattern7 = re.compile(r'<meta name="description" content="(.*?)"/>',
                              re.S)
        item = QuanQiuFangZhiWangspiderItem()
        if response.text:
            try:
                content = "".join(re.findall(
                    pattern7, response.text)) if re.findall(
                        pattern7, response.text) else ''
                item["company_Name"] = "".join(
                    re.findall(pattern, response.text)) if re.findall(
                        pattern, response.text) else ''
                item["kind"] = "".join(re.findall(
                    pattern1, response.text)[0]) if re.findall(
                        pattern1, response.text) else ''
                item["company_address"] = "".join(re.findall(
                    pattern5, content)) if re.findall(pattern5,
                                                      content) else ''
                item["linkman"] = "".join(
                    re.findall(pattern6, content)) if re.findall(
                        pattern6, content) else ''
                item["telephone"] = "".join(
                    re.findall(pattern3, content)) if re.findall(
                        pattern3, content) else ''
                item["phone"] = "".join(
                    re.findall(pattern2, content)) if re.findall(
                        pattern2, content) else ''
                item["contact_Fax"] = "".join(
                    re.findall(pattern4, content)) if re.findall(
                        pattern4, content) else ''
                item["contact_QQ"] = ""
                item["E_Mail"] = ""
                item["Source"] = response.url
                item["province"] = ""
                item["city_name"] = ""

                if item["company_Name"]:
                    item["company_Name"] = re.sub(
                        r'\n|\s|\r|\t|公司名称：|全称：', '',
                        item["company_Name"]).replace(' ', '').strip()
                item["company_id"] = self.get_md5(item["company_Name"])

                if item["kind"]:
                    item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营|主营产品：', '', item["kind"]).replace('-', '|').replace('、', '|') \
                        .replace('，', '|').replace('，', '|').replace('.', '').strip()
                else:
                    item["kind"] = ''

                item["kind"] = self.cw.rinse_keywords(
                    self.cw.replace_ss(item["kind"]))

                if item["linkman"]:
                    if "（" in item["linkman"]:
                        item["linkman"] = item["linkman"].split(
                            "（")[0].replace('法定代表人：', '').replace('暂未公布', '')
                    else:
                        item["linkman"] = item["linkman"].replace(
                            '法定代表人：', '').replace('暂未公布', '')
                else:
                    item["linkman"] = ''
                item["linkman"] = self.cw.search_linkman(item["linkman"])

                if item["phone"]:
                    item["phone"] = self.cw.search_phone_num(item["phone"])
                else:
                    item["phone"] = ''

                if item["telephone"]:
                    item["telephone"] = self.cw.search_telephone_num(
                        item["telephone"])
                else:
                    item["telephone"] = ''

                if item["contact_Fax"]:
                    item["contact_Fax"] = self.cw.search_contact_Fax(
                        item["contact_Fax"])
                else:
                    item["contact_Fax"] = ''

                if item["E_Mail"]:
                    item["E_Mail"] = self.cw.search_email(item["E_Mail"])
                else:
                    item["E_Mail"] = ''

                if item["contact_QQ"]:
                    item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
                else:
                    item["contact_QQ"] = ''

                if item["company_address"]:
                    if "\"" in item["company_address"]:
                        item["company_address"] = item[
                            "company_address"].spilt('"')[0]
                        item["company_address"] = self.cw.search_address(
                            item["company_address"])
                else:
                    item["company_address"] = ''

                yield item
            except:
                return

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

Exemplo n.º 22

0

Exibir arquivo

class ZaoWaiXinXiWangSpider(CrawlSpider):
    name = 'zaow'
    allowed_domains = ['www.zaowai.com']
    start_urls = ['http://www.zaowai.com/page/']
    cw = CleanWords()

    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'DEFAULT_REQUEST_HEADERS': {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control":
            "max-age=0",
            "Connection":
            "keep-alive",
            # "Host": "www.yisi.cc",
            # "Referer": "http://www.yisi.cc/",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 543,
        }
    }

    rules = (
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@class='dirlist whitebg']//li//div//a")),
             follow=True),
        Rule(LinkExtractor(
            allow=r"http\:\/\/www\.zaowai\.com\/com\/.*\/",
            restrict_xpaths=(
                "//div[@id='listcolumn']//li//div[@class='company']//a")),
             callback='parse_items',
             follow=True),
        Rule(LinkExtractor(
            allow=r"http\:\/\/www\.zaowai\.com\/guangdong\/pn\d+\/",
            restrict_xpaths=("//a[contains(text(),'下一页»')]")),
             follow=True),

        # Rule(LinkExtractor(
        #     allow=r".*",restrict_xpaths=("//div[@id='leftmenu']//a[contains(text(),'联系方式')]")),callback='parse_items', follow=False),
    )

    def parse_items(self, response):
        item = ZaoWaiXinXiWangItem()
        item["company_Name"] = response.xpath(
            "//div[@class='companyname']/h1/text()").extract_first()
        # item["company_id"] = md5(item["company_Name"].encode()).hexdigest()
        item["kind"] = response.xpath(
            "//div[@class='shop-keyword']/text()").extract_first()
        item["company_address"] = "".join(
            response.xpath("//li[contains(text(),'公司地址：')]/text()").extract())
        item["linkman"] = "".join(
            response.xpath("//li[contains(text(),'人：')]/text()").extract())
        item["telephone"] = "".join(
            response.xpath("//li[contains(text(),'联系电话：')]/text()").extract())
        item["phone"] = ""
        item["contact_Fax"] = "".join(
            response.xpath("//li[contains(text(),'公司传真：')]/text()").extract())
        item["contact_QQ"] = "".join(
            response.xpath("//img[@alt='联系QQ']/../@href").extract())
        item["E_Mail"] = "".join(
            response.xpath("//li[contains(text(),'电子邮箱：')]/text()").extract())
        item["Source"] = response.url
        item["province"] = ''
        item["city_name"] = ''

        if item["company_Name"]:
            item["company_Name"] = re.sub(r'\n|\s|\r|\t|公司名称：', '',
                                          item["company_Name"]).replace(
                                              ' ', '').strip()
        item["company_id"] = self.get_md5(item["company_Name"])

        if item["kind"]:
            item["kind"] = item["kind"].replace(' ', '|')
            item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营', '', item["kind"]).replace('-', '|').replace('、', '|') \
                .replace('，', '|').replace(',', '|').replace(';','|').replace('.', '').strip()
        else:
            item["kind"] = ''

        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

        if item["linkman"]:
            item["linkman"] = re.sub(r'\s|\n|\r|\t', '', item["linkman"])
            item["linkman"] = item["linkman"].replace("联系人：", "")
        else:
            item["linkman"] = ''
        item["linkman"] = self.cw.search_linkman(item["linkman"])

        if item["phone"]:
            item["phone"] = self.cw.search_phone_num(item["phone"])
        else:
            item["phone"] = ''

        if item["telephone"]:
            item["telephone"] = self.cw.search_telephone_num(item["telephone"])
        else:
            item["telephone"] = ''

        if item["contact_Fax"]:
            item["contact_Fax"] = self.cw.search_contact_Fax(
                item["contact_Fax"])
        else:
            item["contact_Fax"] = ''

        if item["E_Mail"]:
            item["E_Mail"] = self.cw.search_email(item["E_Mail"])
        else:
            item["E_Mail"] = ''

        if item["contact_QQ"]:
            item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
        else:
            item["contact_QQ"] = ''

        if item["company_address"]:
            item["company_address"] = self.cw.search_address(
                item["company_address"])
        else:
            item["company_address"] = ''

        yield item

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

Exemplo n.º 23

0

Exibir arquivo

Arquivo: qlw.py Projeto: cogitozz/Big_Scrapy_Spider

class QiLingWangSpider(CrawlSpider):
    name = "qlw"
    allowed_domains = ['www.707070.cn', '707070.cn']
    start_urls = ['http://www.707070.cn/city/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"

    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            # "Connection": "keep-alive",
            # "Cookie": "Hm_lvt_dd0c9f5bb6bab19ccc2b13c4ec58552a=1566177749; CCKF_visitor_id_92126=1219631166; yunsuo_session_verify=db1a03528b7dfe197918cf533946c447; bdshare_firstime=1566178685689; Hm_lpvt_dd0c9f5bb6bab19ccc2b13c4ec58552a=1566178686",
            # "Host": "jamesni139.tybaba.com",
            # "Referer": "http://jamesni139.tybaba.com/",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
        # 不验证SSL证书
        # "DOWNLOAD_HANDLERS_BASE": {
        #     'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
        #     'http': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
        #     'https': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
        #     's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
        # },
        # "DOWNLOAD_HANDLERS": {
        #     'https': 'BigB2BSpider.custom.downloader.handler.https.HttpsDownloaderIgnoreCNError'},
    }
    # /c3847/p2/
    rules = (
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//div[@id='hot']//dl[@class='area']//h4//a")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='content list']//li[@class='listbox-item']//h2//a"
            )),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='pages']//a[contains(text(),'下一页')]")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='nav']//a[contains(text(),'联系我们')]")),
             callback='parse_items',
             follow=False),
    )

    def parse_items(self, response):
        item = QiLingWangItem()
        pattern = re.compile(r'(1\d{10})', re.S)
        item["company_Name"] = response.xpath(
            "//div[@class='comname']/text()").extract_first()
        item["kind"] = response.xpath(
            "//p[contains(text(),'主营：')]/text()").extract_first()
        item["company_address"] = response.xpath(
            "//div[@class='side']//li[contains(text(),'地址：')]/text()"
        ).extract_first()
        item["linkman"] = response.xpath(
            "//div[@class='main']//li[contains(text(),'联系人：')]/text()"
        ).extract_first()
        item["telephone"] = response.xpath(
            "//div[@class='main']//li[contains(text(),'电话：')]/text()"
        ).extract_first()
        item["phone"] = "".join(re.findall(pattern,
                                           response.text)[0]) if re.findall(
                                               pattern, response.text) else ''
        item["contact_Fax"] = response.xpath(
            "//div[@class='main']//li[contains(text(),'传真：')]/text()"
        ).extract_first()
        item["contact_QQ"] = response.xpath(
            "//div[@class='main']//li[contains(text(),'Q Q：')]/text()"
        ).extract_first()
        item["E_Mail"] = ''
        item["Source"] = response.url
        item["province"] = ""
        item["city_name"] = ""

        if item["company_Name"]:
            if "（" in item["company_Name"]:
                item["company_Name"] = item["company_Name"].split('（')[0]
            elif "(" in item["company_Name"]:
                item["company_Name"] = item["company_Name"].split('(')[0]
            else:
                item["company_Name"] = re.sub(r'\n|\s|\r|\t|公司名称：', '',
                                              item["company_Name"]).replace(
                                                  ' ', '').strip()
        item["company_id"] = self.get_md5(item["company_Name"])

        if item["kind"]:
            item["kind"] = item["kind"].replace(' ', '|')
            item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营', '', item["kind"]).replace('-', '|').replace('、', '|')\
                .replace(',', '|').replace('，', '|').replace(';','|').replace('.', '').strip()
        else:
            item["kind"] = ''

        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

        if item["linkman"]:
            item["linkman"] = item["linkman"]
        else:
            item["linkman"] = ''
        item["linkman"] = self.cw.search_linkman(item["linkman"])

        if item["phone"]:
            item["phone"] = self.cw.search_phone_num(item["phone"])
        else:
            item["phone"] = ''

        if item["telephone"]:
            item["telephone"] = self.cw.search_telephone_num(item["telephone"])
        else:
            item["telephone"] = ''

        if item["contact_Fax"]:
            item["contact_Fax"] = self.cw.search_contact_Fax(
                item["contact_Fax"])
        else:
            item["contact_Fax"] = ''

        if item["E_Mail"]:
            item["E_Mail"] = self.cw.search_email(item["E_Mail"])
        else:
            item["E_Mail"] = ''

        if item["contact_QQ"]:
            item["contact_QQ"] = item["contact_QQ"].replace("Q Q：", '')
        else:
            item["contact_QQ"] = ''
        item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])

        if item["company_address"]:
            item["company_address"] = self.cw.search_address(
                item["company_address"])
        else:
            item["company_address"] = ''

        yield item

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

Exemplo n.º 24

0

Exibir arquivo

Arquivo: qy39.py Projeto: cogitozz/Big_Scrapy_Spider

class QiYe39Spider(CrawlSpider):
    name = 'qy39'
    # pattern = re.compile(r'(.*?).qy39.com')
    allowed_domains = ['qy39.com', 'www.qy39.com']
    start_urls = ['http://www.qy39.com/company/']
    cw = CleanWords()

    custom_settings = {
        'DOWNLOAD_DELAY': 0.2,
        'DEFAULT_REQUEST_HEADERS': {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control":
            "max-age=0",
            "Connection":
            "keep-alive",
            # "Cookie": "ASPSESSIONIDQQQBDBAD=PAKPCNLDKGAMHCDLNGBGEPLF; Hm_lvt_539760cac714bd8993dbfb0c1dfb96f7=1564976804; UM_distinctid=16c5fe2f1455-07d6d8162e26fa-5a13331d-1fa400-16c5fe2f14637e; CNZZDATA3636164=cnzz_eid%3D831514679-1564972722-https%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1564972722; ASPSESSIONIDSQRAAAAC=HEKFDNLDGBDAENAHAMHCLMHJ; Hm_lpvt_539760cac714bd8993dbfb0c1dfb96f7=1564976846",
            # "Host": "www.qy39.com",
            # "Referer": "http://www.qy39.com/beijing-huangye/10",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 543,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 543,
        }
    }

    def parse(self, response):
        a_list = response.xpath("///div[@class='categoryList']//ul//li//a")
        for a in a_list:
            kind_name = a.xpath("./text()").extract_first()
            kind_href = a.xpath("./@href").extract_first()
            if kind_href:
                kind_href = "http://www.qy39.com" + kind_href
                # print(kind_name,kind_href)
                yield scrapy.Request(url=kind_href,
                                     callback=self.parse_company_list,
                                     dont_filter=True)

    def parse_company_list(self, response):
        li_list = response.xpath("//div[@class='listMain02']//ul//li")
        for li in li_list:
            item = QiYe39spiderItem()
            pattern = re.compile(r'\[(.*?)\/(.*?)\]', re.S)
            item["company_Name"] = li.xpath(
                ".//a[@class='proName02']/text()").extract_first()
            company_href = li.xpath(
                ".//a[@class='proName02']/@href").extract_first()
            if company_href:
                # print(company_href)
                contact_href = company_href + "/qyjs/"
                # print(contact_href)
                yield scrapy.Request(url=contact_href,
                                     callback=self.parse_company_contact,
                                     meta={"item": item},
                                     dont_filter=True)

        next_page_url = response.xpath(
            "//div[@class='listPage']//a[contains(text(),'下一页')]/@href"
        ).extract_first()
        if next_page_url:
            next_page_url = next_page_url.replace(".", "")
            next_page_url = "http://www.qy39.com" + next_page_url
            # print(next_page_url)
            yield scrapy.Request(url=next_page_url,
                                 callback=self.parse_company_list)

    def parse_company_contact(self, response):
        item = response.meta["item"]
        # pattern = re.compile(r'<meta name="keywords" content="(.*?)" />',re.S)
        # pattern1 = re.compile(r'<li>主营产品： (.*?)</li>', re.S)
        # pattern2 = re.compile(r'<li>所在地区：(.*?)</li>', re.S)
        # pattern3 = re.compile(r'<li>联系人：(.*?)</li>', re.S)
        # pattern4 = re.compile(r'<li>手机：(.*?)</li>', re.S)
        # pattern5 = re.compile(r'<li>联系电话：(.*?)</li>', re.S)
        # pattern6 = re.compile(r'<li>公司传真：(.*?)</li>', re.S)
        # pattern7 = re.compile(r'href="tencent://message/?Site=jiancai.com&amp;Uin=(.*?)&amp;Menu=yes"',re.S)
        # pattern8 = re.compile(r'>\s*联系人：(.*?)</li>',re.S)

        item["company_Name"] = response.xpath(
            "//th[contains(text(),'公司名称')]/following-sibling::td//font/text()"
        ).extract_first()
        # item["company_id"] = md5(item["company_Name"].encode()).hexdigest()
        item["kind"] = ""
        item["company_address"] = "".join(
            response.xpath(
                "//th[contains(text(),'企业地址')]/following-sibling::td/text()").
            extract())
        item["linkman"] = "".join(
            response.xpath(
                "//span[contains(text(),'联系人：')]/..//text()").extract())
        item["telephone"] = "".join(
            response.xpath(
                "//span[contains(text(),'电话：')]/..//text()").extract())
        item["phone"] = "".join(
            response.xpath(
                "//span[contains(text(),'手机：')]/..//text()").extract())
        item["E_Mail"] = "".join(
            response.xpath(
                "//th[contains(text(),'电子邮件')]/following-sibling::td/text()").
            extract())
        item["contact_Fax"] = "".join(
            response.xpath(
                "//th[contains(text(),'传真号码')]/following-sibling::td/text()").
            extract())
        item["contact_QQ"] = ""
        item["province"] = ''
        item["city_name"] = ''
        item["Source"] = response.url

        if item["company_Name"]:
            item["company_Name"] = re.sub(r'\n|\s|\r|\t|公司名称：', '',
                                          item["company_Name"]).replace(
                                              ' ', '').strip()
        item["company_id"] = self.get_md5(item["company_Name"])

        if item["kind"]:
            item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营', '', item["kind"]).replace('-', '|').replace('、', '|') \
                .replace('，', '|').replace('，', '|').replace('.', '').strip()
        else:
            item["kind"] = ''

        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

        if item["linkman"]:
            item["linkman"] = item["linkman"]
        else:
            item["linkman"] = ''
        item["linkman"] = self.cw.search_linkman(item["linkman"])

        if item["phone"]:
            item["phone"] = item["phone"]
        else:
            item["phone"] = ''
        item["phone"] = self.cw.search_phone_num(item["phone"])

        if item["telephone"]:
            item["telephone"] = item["telephone"]
        else:
            item["telephone"] = ''
        item["telephone"] = self.cw.search_telephone_num(item["telephone"])

        if item["contact_Fax"]:
            item["contact_Fax"] = item["contact_Fax"]
        else:
            item["contact_Fax"] = ''
        item["contact_Fax"] = self.cw.search_contact_Fax(item["contact_Fax"])

        if item["E_Mail"]:
            item["E_Mail"] = self.cw.search_email(item["E_Mail"])
        else:
            item["E_Mail"] = ''

        if item["contact_QQ"]:
            item["contact_QQ"] = item["contact_QQ"]
        else:
            item["contact_QQ"] = ''
        item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])

        if item["company_address"]:
            item["company_address"] = item["company_address"]
        else:
            item["company_address"] = ''
        item["company_address"] = self.cw.search_address(
            item["company_address"])

        # if item["host_href"]:
        #     item["host_href"] = item["host_href"]
        # else:
        #     item["host_href"] = ''

        yield item

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

Exemplo n.º 25

0

Exibir arquivo

Arquivo: salqu.py Projeto: cogitozz/Big_Scrapy_Spider

class ShangQuWang(CrawlSpider):
    name = 'salqu'
    allowed_domains = ['www.salqu.com']
    start_urls = ['http://www.salqu.com/company/']
    cw = CleanWords()

    custom_settings = {
        'DEFAULT_REQUEST_HEADERS': {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control":
            "max-age=0",
            "Connection":
            "keep-alive",
            "Cookie":
            "PHPSESSID=gluoteaio8b0brt17o9msoqa14",
            # "Host": "www.salqu.com",
            # "Referer": "http://www.salqu.com/",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 543,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 543,
        }
    }

    def parse(self, response):
        a_list = response.xpath(
            "//div[@class='col-xs-12']//dd[@class='place']//a")
        for a in a_list:
            kind_name = a.xpath("./text()").extract_first()
            kind_href = a.xpath("./@href").extract_first()
            if kind_href:
                # print(kind_name,kind_href)
                yield scrapy.Request(url=kind_href,
                                     callback=self.parse_company_list,
                                     dont_filter=True)

    def parse_company_list(self, response):
        div_list = response.xpath(
            "//div[@id='list']//div[@class='company-wrap']")
        for div in div_list:
            item = ShangQuWangspiderItem()
            # pattern = re.compile(r'(.*?)\/(.*?)', re.S)
            item["company_Name"] = div.xpath(
                ".//span[@class='company-name']/a/text()").extract_first()
            company_href = div.xpath(
                ".//span[@class='company-name']/a/@href").extract_first()
            item["kind"] = div.xpath(
                ".//span[contains(text(),'主营产品：')]/text()").extract_first()
            city_infos = div.xpath(
                ".//p[contains(text(),'所在地区')]/following-sibling::p/text()"
            ).extract_first()
            if city_infos:
                # 广东/潮州市
                try:
                    item["province"] = city_infos.split("/")[0]
                    item["city_name"] = city_infos.split("/")[1]
                except:
                    item["province"] = city_infos
                    item["city_name"] = ''
            else:
                item["province"] = ''
                item["city_name"] = ''
            if company_href:
                # print(company_href)
                contact_href = company_href + "contact/"
                # print(contact_href)
                yield scrapy.Request(url=contact_href,
                                     callback=self.parse_company_contact,
                                     meta={"item": item},
                                     dont_filter=True)

        next_page_url = response.xpath(
            "//div[@class='pages']//a[contains(text(),'下一页»')]/@href"
        ).extract_first()
        if next_page_url:
            yield scrapy.Request(url=next_page_url,
                                 callback=self.parse_company_list)

    def parse_company_contact(self, response):
        headers = {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control":
            "max-age=0",
            "Connection":
            "keep-alive",
            "Cookie":
            "PHPSESSID=gluoteaio8b0brt17o9msoqa14",
            # "Host": "xyblifei.salqu.com",
            "Referer":
            response.url,
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        }
        item = response.meta["item"]
        item["company_Name"] = response.xpath(
            "//td[contains(text(),'公司名称：')]/following-sibling::td/text()"
        ).extract_first()
        # item["company_id"] = md5(item["company_Name"].encode()).hexdigest()
        # item["kind"] = response.xpath("//div[@class='head']/h4/text()").extract_first()
        item["company_address"] = "".join(
            response.xpath(
                "//td[contains(text(),'公司地址：')]/following-sibling::td/a/text()"
            ).extract())
        item["linkman"] = "".join(
            response.xpath(
                "//td[contains(text(),'联 系 人：')]/following-sibling::td/text()"
            ).extract())
        item["telephone"] = "".join(
            response.xpath(
                "//td[contains(text(),'公司电话：')]/following-sibling::td/img/@src"
            ).extract())
        item["phone"] = "".join(
            response.xpath(
                "//td[contains(text(),'手机号码：')]/following-sibling::td/img/@src"
            ).extract())
        item["E_Mail"] = "".join(
            response.xpath(
                "//td[contains(text(),'电子邮件：')]/following-sibling::td/img/@src"
            ).extract())
        item["contact_Fax"] = "".join(
            response.xpath(
                "//td[contains(text(),'公司传真：')]/following-sibling::td/img/@src"
            ).extract())
        item["contact_QQ"] = "".join(
            response.xpath("//img[@title='点击QQ交谈/留言']/../@href").extract())
        item["Source"] = response.url

        if item["company_Name"]:
            item["company_Name"] = re.sub(r'\n|\s|\r|\t|公司名称：', '',
                                          item["company_Name"]).replace(
                                              ' ', '').strip()
        item["company_id"] = self.get_md5(item["company_Name"])

        if item["kind"]:
            item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营产品：|主营', '', item["kind"]).replace('-', '|').replace('、', '|') \
                .replace('，', '|').replace('，', '|').replace('.', '').strip()
        else:
            item["kind"] = ''

        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

        if item["linkman"]:
            item["linkman"] = item["linkman"]
        else:
            item["linkman"] = ''
        item["linkman"] = self.cw.search_linkman(item["linkman"])

        if item["phone"]:
            item["phone"] = self.requests_href(item["phone"], headers)
            item["phone"] = self.cw.search_phone_num(item["phone"])
        else:
            item["phone"] = ''

        if item["telephone"]:
            item["telephone"] = self.requests_href(item["telephone"], headers)
            item["telephone"] = self.cw.search_telephone_num(item["telephone"])
        else:
            item["telephone"] = ''

        if item["contact_Fax"]:
            item["contact_Fax"] = self.requests_href(item["contact_Fax"],
                                                     headers)
            item["contact_Fax"] = self.cw.search_contact_Fax(
                item["contact_Fax"])
        else:
            item["contact_Fax"] = ''

        if item["E_Mail"]:
            item["E_Mail"] = self.requests_href(item["E_Mail"], headers)
            if item["E_Mail"]:
                item["E_Mail"] = item["E_Mail"].replace("e", "@").replace(
                    "8126", "@163").replace("8163", "@163").strip()
            # item["E_Mail"] = self.cw.search_email(item["E_Mail"])
        else:
            item["E_Mail"] = ''

        if item["contact_QQ"]:
            item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
        else:
            item["contact_QQ"] = ''

        if item["company_address"]:
            item["company_address"] = self.cw.search_address(
                item["company_address"])
        else:
            item["company_address"] = ''

        # if item["host_href"]:
        #     item["host_href"] = item["host_href"]
        # else:
        #     item["host_href"] = ''

        yield item

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

    def requests_href(self, url, headers):
        res = requests.get(url=url, headers=headers, timeout=20, verify=False)
        res.encoding = "utf-8"
        if res.status_code == requests.codes.ok:
            img = res.content
            something_img_file_path = r"F:\PythonProjects\venv\pythonProjects\BigB2BSpider\BigB2BSpider\img_src\something_img\image.png"
            with open(something_img_file_path, "wb") as fp:
                fp.write(img)
            fp.close()
            if img:
                try:
                    something = recognition_image(something_img_file_path)
                    if something:
                        return something
                    else:
                        return ''
                except:
                    return ''
            else:
                return ''
        else:
            return ''

Exemplo n.º 26

0

Exibir arquivo

class WuJiuShangWuWangSpider(CrawlSpider):
    name = 'sw59'
    allowed_domains = ['www.59b2b.com']
    start_urls = ['http://www.59b2b.com/company/']
    cw = CleanWords()

    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            # "Cookie": "Hm_lvt_a8ba6329e67dda68e5d5dadf2df13e01=1564989677; Hm_lpvt_a8ba6329e67dda68e5d5dadf2df13e01=1564989683; security_session_verify=804231afb453a0f6d1f322329f5bfb57",
            # "Host": "www.yiwangtui.com",
            # "Referer": "http://www.yiwangtui.com/",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 543,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 543,
        }
    }

    def parse(self, response):
        a_list = response.xpath("//div[@class='left_box']//tr//a")
        for a in a_list:
            kind_name = a.xpath("./text()").extract_first()
            kind_href = a.xpath("./@href").extract_first()
            if kind_href:
                print(kind_name,kind_href)
                yield scrapy.Request(
                    url=kind_href,
                    callback=self.parse_company_list,
                    dont_filter=True
                )

    def parse_company_list(self, response):
        tr_list = response.xpath("//div[@class='left_box']//div[@class='list']//table//tr")
        for tr in tr_list:
            item = WuJiuShangWuWangspiderItem()
            pattern = re.compile(r'\[(.*?)/(.*?)\]', re.S)
            item["company_Name"] = tr.xpath(".//li//a/strong/text()").extract_first()
            company_href = tr.xpath(".//li/a/@href").extract_first()
            item["kind"] = tr.xpath(".//li[contains(text(),'主营：')]/text()").extract_first()
            city_infos = tr.xpath(".//td[@class='f_orange']/text()").extract_first()
            if city_infos:
                # 广东/潮州市
                try:
                    item["province"] = re.findall(pattern, city_infos)[0][0]
                    item["city_name"] = re.findall(pattern, city_infos)[0][1]
                except:
                    item["province"] = ''
                    item["city_name"] = ''
            else:
                item["province"] = ''
                item["city_name"] = ''

            if item["company_Name"]:
                item["company_Name"] = re.sub(r'\n|\s|\r|\t|公司名称：', '', item["company_Name"]).replace(' ', '').strip()
                item["company_id"] = self.get_md5(item["company_Name"])

            if item["kind"]:
                item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营', '', item["kind"]).replace('-', '|')\
                    .replace('、', '|').replace('，', '|').replace('，', '|').replace('.', '').strip()
            else:
                item["kind"] = ''

            item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

            item["linkman"] = ''
            item["phone"] = ''
            item["telephone"] = ''
            item["contact_Fax"] = ''
            item["contact_QQ"] = ''
            item["E_Mail"] = ''
            item["company_address"] = ''
            item["Source"] = response.url
            yield item

        next_page_url = response.xpath("//div[@class='pages']//a[contains(text(),'下一页»')]/@href").extract_first()
        if next_page_url:
            yield scrapy.Request(
                url=next_page_url,
                callback=self.parse_company_list
            )

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

Exemplo n.º 27

0

Exibir arquivo

class ZhongGuoJiChuangWangSpider(CrawlSpider):
    name = "machine35"
    allowed_domains = ['www.machine35.com','achine35.com','search.machine35.com','vip.machine35.com']
    start_urls = ['http://www.machine35.com/company/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"

    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
        'ITEM_PIPELINES': {'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302},
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
    }
    # /c3847/p2/
    rules = (
        Rule(LinkExtractor(
            allow=r".*",restrict_xpaths=("//div[@class='content']//dl//dd//a")),follow=True),

        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("//a[contains(text(),'联系方式')]")),callback="parse_items",follow=True),

        Rule(LinkExtractor(
            allow=r".*", restrict_xpaths=("//a[contains(text(),'下一页')]")), follow=True),
    )

    def parse_items(self, response):
        item = ZhongGuoJiChuangWangItem()
        pattern = re.compile(r"href='http:\/\/wpa.qq.com\/msgrd\?v=3&uin=(\d+)&site=machine35.com&menu=yes'",re.S)
        item["company_Name"] = "".join(response.xpath("//dt[@class='maintitle']//text()").extract())
        item["kind"] = response.xpath("//dd[@class='subtitle']/text()").get()
        item["company_address"] = response.xpath("//td[contains(text(),'地　　址：')]/following-sibling::td/text()").extract_first()
        item["linkman"] = response.xpath("///span[@class='blue']/text()").extract_first()
        item["telephone"] = response.xpath("//td[contains(text(),'电　　话：')]/following-sibling::td/text()").extract_first()
        item["phone"] = response.xpath("//td[contains(text(),'手　　机：')]/following-sibling::td/text()").extract_first()
        item["contact_Fax"] = response.xpath("///td[contains(text(),'传　　真：')]/following-sibling::td/text()").extract_first()
        item["contact_QQ"] = "".join(re.findall(pattern,response.text)) if re.findall(pattern,response.text) else ''
        item["E_Mail"] = response.xpath("//td[contains(text(),'邮　　箱：')]/following-sibling::td/text()").extract_first()
        item["Source"] = response.url
        city_infos = response.xpath("//dt[contains(text(),'所在地区：')]/following-sibling::dd/text()").get()


        if item["company_Name"]:
            item["company_Name"] = self.cw.search_company(item["company_Name"])
        else:
            item["company_Name"] = ''
        item["company_id"] = self.get_md5(item["company_Name"])

        if item["kind"]:
            item["kind"] = item["kind"].replace(" ", '|')
            item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营|主营项目：|供应商', '', item["kind"]).replace('-', '|')\
                .replace('、', '|').replace(',', '|').replace('，', '|').replace(';','|').replace('.', '').strip()
        else:
            item["kind"] = ''

        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

        if item["linkman"]:
            item["linkman"] = self.cw.search_linkman(item["linkman"])
        else:
            item["linkman"] = ''
        item["linkman"] = self.cw.search_linkman(item["linkman"])

        if item["phone"]:
            item["phone"] = self.cw.search_phone_num(item["phone"])
        else:
            item["phone"] = ''

        if item["telephone"]:
            item["telephone"] = self.cw.search_telephone_num(item["telephone"])
        else:
            item["telephone"] = ''

        if item["contact_Fax"]:
            item["contact_Fax"] = self.cw.search_contact_Fax(item["contact_Fax"])
        else:
            item["contact_Fax"] = ''

        if item["E_Mail"]:
            item["E_Mail"] = self.cw.search_email(item["E_Mail"])
        else:
            item["E_Mail"] = ''

        if item["contact_QQ"]:
            item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
        else:
            item["contact_QQ"] = ''

        if item["company_address"]:
            item["company_address"] = item["company_address"].replace("联系地址：","")
            item["company_address"] = self.cw.search_address(item["company_address"])
        else:
            item["company_address"] = ''

        # if city_infos:
        if item["company_address"]:
            if '市' and '省' in item["company_address"]:
                try:
                    pattern_p = re.compile(r'(.*?)省', re.S)
                    pattern_c = re.compile(r'省(.*?)市', re.S)
                    item["province"] = "".join(re.findall(pattern_p, item["company_address"])) \
                        if re.findall(pattern_p, item["company_address"]) else ''
                    item["city_name"] = "".join(re.findall(pattern_c, item["company_address"])) \
                        if re.findall(pattern_c, item["company_address"]) else ''
                except:
                    item["province"] = ''
                    item["city_name"] = ''
            else:
                item["province"] = ''
                item["city_name"] = ''
        else:
            item["province"] = ''
            item["city_name"] = ''

        yield item

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

Exemplo n.º 28

0

Exibir arquivo

class WuYouJiaoYiWangSpider(CrawlSpider):
    name = "ec51"
    allowed_domains = ['www.ec51.com','ec51.com']
    start_urls = ['https://www.ec51.com/site/company.html']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"

    custom_settings = {
        'DOWNLOAD_DELAY': 0.3,
        'ITEM_PIPELINES': {'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302},
        'DEFAULT_REQUEST_HEADERS': {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            # "Connection": "keep-alive",
            # "Cookie": "Hm_lvt_dd0c9f5bb6bab19ccc2b13c4ec58552a=1566177749; CCKF_visitor_id_92126=1219631166; yunsuo_session_verify=db1a03528b7dfe197918cf533946c447; bdshare_firstime=1566178685689; Hm_lpvt_dd0c9f5bb6bab19ccc2b13c4ec58552a=1566178686",
            # "Host": "jamesni139.tybaba.com",
            # "Referer": "http://jamesni139.tybaba.com/",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
        # 不验证SSL证书
        # "DOWNLOAD_HANDLERS_BASE": {
        #     'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
        #     'http': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
        #     'https': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
        #     's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
        # },
        # "DOWNLOAD_HANDLERS": {
        #     'https': 'BigB2BSpider.custom.downloader.handler.https.HttpsDownloaderIgnoreCNError'},
    }

    def parse(self, response):
        div_list = response.xpath("//div[@class='flex-item product-wrap']//div[@class='product flex']")
        for div in div_list:
            item = WuYouJiaoYiWangItem()
            item["company_Name"] = div.xpath(".//a[@class='title ea']/text()").get()
            item["kind"] = div.xpath(".//p[contains(text(),'经营范围：')]/text()").get()
            item["company_address"] = div.xpath(".//p[contains(text(),'公司地址：')]/text()").get()
            item["linkman"] = ''
            item["telephone"] = div.xpath(".//p[contains(text(),'联系方式：')]/text()").get()
            item["phone"] = div.xpath(".//p[contains(text(),'联系方式：')]/text()").get()
            item["contact_Fax"] = ''
            item["contact_QQ"] = ''
            item["E_Mail"] = ''
            item["Source"] = response.url
            item["province"] = ""
            item["city_name"] = ""

            if item["company_Name"]:
                item["company_Name"] = re.sub(r'\n|\s|\r|\t|公司名称：', '', item["company_Name"]).replace(' ', '').strip()
            item["company_id"] = self.get_md5(item["company_Name"])

            if item["kind"]:
                item["kind"] = item["kind"].replace(' ', '|')
                item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营产品|经营范围：', '', item["kind"]).replace('-', '|')\
                    .replace('、', '|').replace(',', '|').replace('，', '|').replace(';','|').replace('.', '').strip()
            else:
                try:
                    item["kind"] = ",".join(response.xpath("//p[contains(text(),'公司标签：')]//a/text()").getall())
                    item["kind"] = item["kind"].replace(' ', '|')
                    item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营产品|经营范围：', '', item["kind"]).replace('-', '|') \
                        .replace('、', '|').replace(',', '|').replace('，', '|').replace(';', '|').replace('.',
                                                                                                         '').strip()
                except:
                    item["kind"] = ''

            item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

            if item["linkman"]:
                item["linkman"] = item["linkman"]
            else:
                item["linkman"] = ''
            item["linkman"] = self.cw.search_linkman(item["linkman"])

            if item["phone"]:
                item["phone"] = self.cw.search_phone_num(item["phone"])
            else:
                item["phone"] = ''

            if item["telephone"]:
                item["telephone"] = self.cw.search_telephone_num(item["telephone"])
            else:
              item["telephone"] = ''

            if item["contact_Fax"]:
                item["contact_Fax"] = self.cw.search_contact_Fax(item["contact_Fax"])
            else:
                item["contact_Fax"] = ''

            if item["E_Mail"]:
                item["E_Mail"] = self.cw.search_email(item["E_Mail"])
            else:
                item["E_Mail"] = ''

            if item["contact_QQ"]:
                item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
            else:
                try:
                    item["contact_QQ"] = self.cw.search_QQ(item["E_Mail"])
                except:
                    item["contact_QQ"] = ''

            if item["company_address"]:
                item["company_address"] = self.cw.search_address(item["company_address"])
            else:
                item["company_address"] = ''

            yield item

        next_page_url = response.xpath("//li[@class='next']//a[contains(text(),'下一页')]/@href").get()
        if next_page_url:
            next_page_url = "https://www.ec51.com" + next_page_url
            yield scrapy.Request(
                url=next_page_url,
                callback=self.parse
            )


    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

Exemplo n.º 29

0

Exibir arquivo

Arquivo: cnexpnet.py Projeto: cogitozz/Big_Scrapy_Spider

class GongQiuXingXiWangSpider(CrawlSpider):
    name = "cnexpnet"
    allowed_domains = ['cnexpnet.net']
    start_urls = ['http://cnexpnet.net/company/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"

    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines_v1': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
    }
    # /c3847/p2/
    rules = (
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=("///div[@class='list-cate']//td//a")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=
            ("//div[@class='m m2']//div[@class='list']//td[@align='left']//li//a"
             )),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='pages']//a[contains(text(),'下一页»')]")),
             follow=True),
        Rule(LinkExtractor(
            allow=r".*",
            restrict_xpaths=(
                "//div[@class='menu']//span[contains(text(),'联系方式')]/..")),
             callback='parse_items',
             follow=True),
    )

    def parse_items(self, response):
        item = GongQiuXingXiWangItem()
        item["company_Name"] = response.xpath(
            "//td[contains(text(),'公司名称：')]/following-sibling::td/text()"
        ).extract_first()
        item["company_address"] = response.xpath(
            "//td[contains(text(),'公司地址：')]/following-sibling::td/text()"
        ).extract_first()
        item["linkman"] = response.xpath(
            "//td[contains(text(),'联 系 人：')]/following-sibling::td/text()"
        ).extract_first()
        item["telephone"] = response.xpath(
            "//td[contains(text(),'公司电话：')]/following-sibling::td/text()"
        ).extract_first()
        item["phone"] = response.xpath(
            "//td[contains(text(),'手机号码：')]/following-sibling::td/text()"
        ).extract_first()
        item["contact_Fax"] = response.xpath(
            "//td[contains(text(),'公司传真：')]/following-sibling::td/text()"
        ).extract_first()
        item["contact_QQ"] = response.xpath(
            "//img[@title='点击QQ交谈/留言']/../@href").extract_first()
        item["E_Mail"] = response.xpath(
            "//td[contains(text(),'电子邮件：')]/following-sibling::td/text()"
        ).extract_first()
        item["Source"] = response.url
        item["kind"] = ",".join(
            response.xpath("//div[@class='head']//h4/text()").getall())
        city_infos = response.xpath(
            "//td[contains(text(),'所在地区：')]/following-sibling::td/text()").get(
            )

        if item["company_Name"] and item["company_Name"] != '':
            if "（" in item["company_Name"]:
                item["company_Name"] = item["company_Name"].split('（')[0]
            elif "(" in item["company_Name"]:
                item["company_Name"] = item["company_Name"].split('(')[0]
            elif "_" in item["company_Name"]:
                item["company_Name"] = item["company_Name"].split('_')[0]
            elif "-" in item["company_Name"]:
                item["company_Name"] = item["company_Name"].split('-')[0]
            else:
                item["company_Name"] = re.sub(r'\n|\s|\r|\t|公司名称：', '',
                                              item["company_Name"]).replace(
                                                  ' ', '').strip()
        else:
            return
        item["company_id"] = self.get_md5(item["company_Name"])

        if item["kind"]:
            item["kind"] = item["kind"].replace(" ", '|')
            item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营|主营项目：', '', item["kind"]).replace('-', '|')\
                .replace('、', '|').replace(',', '|').replace('，', '|').replace(';','|').replace('.', '').strip()
        else:
            item["kind"] = ''

        item["kind"] = self.cw.rinse_keywords(self.cw.replace_ss(item["kind"]))

        if item["linkman"]:
            item["linkman"] = item["linkman"].replace('未填写', '')
        else:
            item["linkman"] = ''
        item["linkman"] = self.cw.search_linkman(item["linkman"])

        if item["phone"]:
            item["phone"] = self.cw.search_phone_num(item["phone"])
        else:
            item["phone"] = ''

        if item["telephone"]:
            item["telephone"] = self.cw.search_telephone_num(item["telephone"])
        else:
            item["telephone"] = ''

        if item["contact_Fax"]:
            item["contact_Fax"] = self.cw.search_contact_Fax(
                item["contact_Fax"])
        else:
            item["contact_Fax"] = ''

        if item["E_Mail"]:
            item["E_Mail"] = self.cw.search_email(item["E_Mail"])
        else:
            item["E_Mail"] = ''

        if item["contact_QQ"]:
            item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
        else:
            item["contact_QQ"] = ''

        if item["company_address"]:
            item["company_address"] = self.cw.search_address(
                item["company_address"])
        else:
            item["company_address"] = ''

        if city_infos:
            if '/' in city_infos:
                try:
                    item["province"] = city_infos.split('/')[0]
                    item["city_name"] = city_infos.split('/')[1]
                except:
                    item["province"] = ''
                    item["city_name"] = ''
            else:
                item["province"] = ''
                item["city_name"] = ''
        else:
            item["province"] = ''
            item["city_name"] = ''

        yield item

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''

Exemplo n.º 30

0

Exibir arquivo

Arquivo: ewb_v1.py Projeto: cogitozz/Big_Scrapy_Spider

class ErWuBaWangSpider(CrawlSpider):
    name = "ewb"
    allowed_domains = ['258.com', 'www.258.com', 'shop.258.com']
    start_urls = ['http://www.258.com/company/']
    cw = CleanWords()
    # redis_key = "ksb:start_urls"

    custom_settings = {
        'DOWNLOAD_DELAY': 0.2,
        'ITEM_PIPELINES': {
            'BigB2BSpider.pipelines.MysqlTwistedPiplines': 302
        },
        'DEFAULT_REQUEST_HEADERS': {
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding":
            "gzip, deflate",
            "Accept-Language":
            "zh-CN,zh;q=0.9,en;q=0.8",
            # "Connection": "keep-alive",
            # "Cookie": "Hm_lvt_dd0c9f5bb6bab19ccc2b13c4ec58552a=1566177749; CCKF_visitor_id_92126=1219631166; yunsuo_session_verify=db1a03528b7dfe197918cf533946c447; bdshare_firstime=1566178685689; Hm_lpvt_dd0c9f5bb6bab19ccc2b13c4ec58552a=1566178686",
            # "Host": "jamesni139.tybaba.com",
            # "Referer": "http://jamesni139.tybaba.com/",
            "Upgrade-Insecure-Requests":
            "1",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36",
        },
        'DOWNLOADER_MIDDLEWARES': {
            'BigB2BSpider.middlewares.Bigb2BspiderDownloaderMiddleware': 544,
            # 'BigB2BSpider.middlewares.RandomMyProxyMiddleware': 420,
        },
        # 不验证SSL证书
        # "DOWNLOAD_HANDLERS_BASE": {
        #     'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
        #     'http': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
        #     'https': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
        #     's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
        # },
        # "DOWNLOAD_HANDLERS": {
        #     'https': 'BigB2BSpider.custom.downloader.handler.https.HttpsDownloaderIgnoreCNError'},
    }

    def parse(self, response):
        a_list = response.xpath(
            "//li[@class='relative']//div[@class='ProductIndexRightNav']//li//a"
        )
        for a in a_list:
            kind_href = a.xpath("./@href").get()
            kind_name = a.xpath("./text()").get()
            if kind_href:
                kind_href = "http://www.258.com" + kind_href
                # print(kind_name,kind_href)
                yield scrapy.Request(url=kind_href,
                                     callback=self.parse_company_list,
                                     dont_filter=True)

    def parse_company_list(self, response):
        div_list = response.xpath(
            "//div[@class='ovh mt10 ']//div[@class='qyk_sublistleft iconboxAll ']"
        )
        for div in div_list:
            company_Name = div.xpath(".//h3/a/text()").get()
            company_href = div.xpath(".//h3/a/@href").get()
            kind = div.xpath(
                "//span[contains(text(),'主营产品：')]/../text()").get()
            if company_href:
                yield scrapy.Request(url=company_href,
                                     callback=self.parse_company_detail,
                                     dont_filter=True)
        # $.goToPage(4,this,'/Company/getList/cg/92/p/4')
        next_url = response.xpath("//a[contains(text(),'下一页')]/@onclick").get()
        pattern = re.compile(r"\$\.goToPage\(\d+,this,'(.*?)'\)", re.S)
        try:
            url = "".join(re.findall(pattern, next_url))
            if url:
                url = url.replace("getList/cg/", '')
                next_page_url = "http://www.258.com" + url
                if next_page_url:
                    yield scrapy.Request(url=next_page_url,
                                         callback=self.parse_company_list,
                                         dont_filter=True)
        except:
            return

    def parse_company_detail(self, response):
        contact_href = response.xpath(
            "//a[contains(text(),'联系方式')]/@href").get()
        if contact_href:
            yield scrapy.Request(url=contact_href,
                                 callback=self.parse_company_contact,
                                 dont_filter=True)

    def parse_company_contact(self, response):
        pattern = re.compile(r'<span class="cp-name">(.*?)</span>', re.S)
        pattern1 = re.compile(r'>联系人：(.*?) <', re.S)
        pattern2 = re.compile(r'>QQ：(.*?)<', re.S)
        pattern3 = re.compile(r'>电话：(.*?)<', re.S)
        pattern4 = re.compile(r'>手机：(.*?)<', re.S)
        pattern5 = re.compile(r'>传真：(.*?)<', re.S)
        pattern6 = re.compile(r'>邮箱：(.*?)<', re.S)
        pattern7 = re.compile(r'>地址：(.*?)\s*<', re.S)
        pattern8 = re.compile(
            r'<input type="hidden" id="business_address" value="(.*?)" />',
            re.S)
        pattern9 = re.compile(r'>主营产品：(.*?)<', re.S)
        # pattern10 = re.compile(r'>所在地区：上海市 市辖区<')
        item = ErWuBaWangItem()
        if response.text:
            try:
                item["company_Name"] = "".join(
                    re.findall(pattern, response.text)) if re.findall(
                        pattern, response.text) else ''
                item["kind"] = "".join(re.findall(
                    pattern9, response.text)) if re.findall(
                        pattern9, response.text) else ''
                item["company_address"] = "".join(
                    re.findall(pattern7, response.text)[0]) if re.findall(
                        pattern7, response.text) else ''
                item["linkman"] = "".join(re.findall(
                    pattern1, response.text)) if re.findall(
                        pattern1, response.text) else ''
                item["telephone"] = "".join(re.findall(
                    pattern3, response.text)) if re.findall(
                        pattern3, response.text) else ''
                item["phone"] = "".join(re.findall(
                    pattern4, response.text)) if re.findall(
                        pattern4, response.text) else ''
                item["contact_Fax"] = "".join(
                    re.findall(pattern5, response.text)) if re.findall(
                        pattern5, response.text) else ''
                item["contact_QQ"] = "".join(
                    re.findall(pattern2, response.text)) if re.findall(
                        pattern2, response.text) else ''
                item["E_Mail"] = "".join(re.findall(
                    pattern6, response.text)) if re.findall(
                        pattern6, response.text) else ''
                item["Source"] = response.url
                item["province"] = ""
                item["city_name"] = ""

                if item["company_Name"]:
                    item["company_Name"] = re.sub(
                        r'\n|\s|\r|\t|公司名称：|企 业 名 称：', '',
                        item["company_Name"]).replace(' ', '').strip()
                item["company_id"] = self.get_md5(item["company_Name"])

                if item["kind"]:
                    if "主营产品" in item["kind"]:
                        item["kind"] = item["kind"].split('主营产品：')[-1]
                        item["kind"] = item["kind"].replace(' ', '|')
                        item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营', '', item["kind"])\
                            .replace('-', '|').replace('、','|').replace(',', '|').replace('，', '|')\
                            .replace(';', '|').replace('.', '').strip()
                    else:
                        item["kind"] = item["kind"]
                        item["kind"] = re.sub(r'\n|\s|\r|\t|主营业务：|主营产品:', '', item["kind"]) \
                            .replace('-', '|').replace('、', '|').replace(',', '|').replace('，', '|') \
                            .replace(';', '|').replace('.', '').strip()
                else:
                    item["kind"] = ''

                item["kind"] = self.cw.rinse_keywords(
                    self.cw.replace_ss(item["kind"]))

                if item["linkman"]:
                    item["linkman"] = item["linkman"].replace("联 系 人：", '')
                else:
                    item["linkman"] = ''
                item["linkman"] = self.cw.search_linkman(item["linkman"])

                if item["phone"]:
                    item["phone"] = item["phone"].replace("联 系 电 话：", '')
                    item["phone"] = self.cw.search_phone_num(item["phone"])
                else:
                    item["phone"] = ''

                if item["telephone"]:
                    item["telephone"] = item["telephone"].replace(
                        "联 系 电 话：", '')
                    item["telephone"] = self.cw.search_telephone_num(
                        item["telephone"])
                else:
                    item["telephone"] = ''

                if item["contact_Fax"]:
                    item["contact_Fax"] = item["contact_Fax"].replace(
                        "公 司 传 真：", '')
                    item["contact_Fax"] = self.cw.search_contact_Fax(
                        item["contact_Fax"])
                else:
                    item["contact_Fax"] = ''

                if item["E_Mail"]:
                    item["E_Mail"] = item["E_Mail"].replace("电 子 邮 箱：", '')
                    item["E_Mail"] = self.cw.search_email(item["E_Mail"])
                else:
                    item["E_Mail"] = ''

                if item["contact_QQ"]:
                    item["contact_QQ"] = self.cw.search_QQ(item["contact_QQ"])
                else:
                    try:
                        item["contact_QQ"] = self.cw.search_QQ(item["E_Mail"])
                    except:
                        item["contact_QQ"] = ''

                if item["company_address"]:
                    item["company_address"] = item["company_address"].replace(
                        '公 司 地 址：', '')
                    item["company_address"] = self.cw.search_address(
                        item["company_address"])
                else:
                    item["company_address"] = ''

                yield item

            except:
                return

    def get_md5(self, value):
        if value:
            return md5(value.encode()).hexdigest()
        return ''