Example #1
    def __init__(self):
        self.starturl = 'http://www.80guakao.com/shengfen/hb/zhaopinxinxi/'
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'
        }
        self.f = FETCH()
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', "80guakao_hb")
        self.r0 = Redisclient(0)
        self.r1 = Redisclient(1)
        self.r2 = Redisclient(2)
        self.r3 = Redisclient(3)
        self.category_name_list = []
        self.sec_category_dict = {}
        self.headers_forpage = {
            "Host": "www.80guakao.com",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
            "Accept": "*/*",
            "Referer": "http://www.80guakao.com/shengfen/hb/",
            "Accept-Encoding": "gzip,deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cookie": "",
        }
Example #2
class Scrapy78Pipeline:
    def open_spider(self, spider):
        # Runs when the spider starts
        # spider.hello = "world"  # dynamically attach an attribute to the spider object; readable from the spider module
        # good place to open database connections, etc.
        self.Mongo = MongoDB('mongodb://localhost', 'cuiworkdb', "78guakao_changsha")

    def process_item(self, item, spider):
        i = {}
        i['companyCity'] = "长沙"
        i['companyProvince'] = "湖南省"
        i['code'] = 'BUS_YT_ZZ'
        i['name'] = '资质'
        i['busCode'] = ''
        i['webUrl'] = '无'
        i['orgId'] = ''
        i['deptId'] = ''
        i['centreId'] = ''
        i["companyName"] = item["companyName"]
        i["outName"] = item["outName"]
        i["resourceRemark"] = item['resourceRemark']
        i["companyTel"] = str(item["companyTel"])
        i["ibossNum"] = None
        i['isDir'] = 0
        i['isShare'] = 0
        i['flag'] = 0
        i["_id"] = md5encryption(item["companyTel"])
        self.Mongo.mongo_add(i)
        print(i)
        return item
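
# Note: md5encryption() is imported from this project's helper module and is
# not shown in these examples. Judging by its use (a stable dedup _id derived
# from the phone number), it is presumably a thin wrapper along these lines --
# a hedged sketch, not the project's actual implementation:
import hashlib

def md5encryption(value):
    # Same phone number -> same _id, so re-crawled records overwrite
    # rather than duplicate.
    return hashlib.md5(str(value).encode('utf-8')).hexdigest()

# The pipeline only runs once registered in the Scrapy project's settings.py;
# the module path here is an assumption:
# ITEM_PIPELINES = {"myproject.pipelines.Scrapy78Pipeline": 300}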
Example #3
class Guakao555Pipeline(object):
    def open_spider(self, spider):
        # Runs when the spider starts
        # spider.hello = "world"  # dynamically attach an attribute to the spider object; readable from the spider module
        # good place to open database connections, etc.
        self.Mongo = MongoDB('mongodb://localhost', 'cuiworkdb', "555guakao_dg")

    def process_item(self, item, spider):
        item['companyCity'] = '杭州'
        item['code'] = 'BUS_YT_ZZ'
        item['name'] = '资质'
        item['busCode'] = ''
        item['webUrl'] = '无'
        item['orgId'] = ''
        item['deptId'] = ''
        item['centreId'] = ''
        item["ibossNum"] = None
        item['isDir'] = 0
        item['isShare'] = 0
        item["_id"] = md5encryption(item["companyTel"])
        self.Mongo.mongo_add(item)
        return item

    def close_spider(self, spider):
        # Runs when the spider finishes
        # good place to close database connections
        pass
Example #4
    def __init__(self):
        self.db = MongoDB('172.16.74.249:27017', 'db_reptile_company',
                          'company_name')
        self.redis = REDIS(host=RedisHost,
                           port=RedisPort,
                           password=RedisPassword,
                           db=RedisDB)
        self.item = {}
Example #5
    def __init__(self):
        self.starturl = 'http://hangzhou.qd8.com.cn/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'}
        self.s = FETCH()
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', "78guakao")
        self.r0 = Redisclient(0)
        self.r1 = Redisclient(1)
        self.r2 = Redisclient(2)
        self.r3 = Redisclient(3)
        self.item_dict = {}
        self.db = MongoDB('mongodb://localhost', 'cuiworkdb', 'kd8')
Example #6
    def __init__(self, start_url, cookie, referer, companyCity,
                 companyProvince, db):
        self.start_url = start_url
        self.companyCity = companyCity
        self.companyProvince = companyProvince
        self.headers = {
            # ":authority": "www.cbi360.net",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip,deflate,br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8",
            "Cookie": cookie,
            # "Cookie": "",
            "pragma": "no-cache",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "******",
            "upgrade-insecure-requests": "1",
            "Referer": referer,
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
        }
        self.r0 = Redisclient(0)
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', db)
        # self.f = FETCH()
        self.par = re.compile(r'\d+-\d+')
        self.par2 = re.compile(r'\d+')
Example #7
# # }
# #
# #
# # res = requests.post(url=text_url,headers=text_headers,data=pic_data,json=text_json_que)
# # print(res.text)

#### 2021/1/8 Crawl the image URLs for the matching data

import requests
from time import sleep
import json
from Func.client import MongoDB
from Func.fetchJX import FETCH
s = FETCH()
# change the target database/collection here
db = MongoDB('mongodb://localhost', 'cuiworkdb', "Shangbiao_GG-1731")
# change the URL here
url = "http://wsgg.sbj.cnipa.gov.cn:9080/tmann/annInfoView/imageView.html?O56fzBVE=5cRVmWcgP6gC.1ulRszzI2_aOlP1jkNH0mxnPtgkE73P.1rSAtlAU1rHW64aHQoXm471Fzq7QOzRfVJiLnarbCbBAjmRHPnmNTUqx.Bfa6RWoAiipN6HKjl5E3Nb6Jp_LaGu5Dr0x1V4f2AsDjRza2LmDcNsd62msQ6SzqM646fK0XNFf.KzqSrexNQiIbLTdcX2wDPfCad.6G6Y4Pq28hw_OMDoIYVwZSvwH.emWD5UAVTbKi.mblyWCBYJOMZx5OMbUMWr05.V6JtgmG.usyr3_8OtVx8yHqisK54faJIdqZ5ofaDE4r6mjkZiGtqZZ96H_kqpPDS1WOjZMSlQGqQal8YnoPPDasrJ5lPkWyphiagHypaYQfoBfWUc3idLO"

# change the cookie here
pic_headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Encoding": "gzip,deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Content-Length": "52",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Cookie":
    "_gscu_1645064636=76464150vsisqf48; _gscu_2023327167=76464150m8szyi25; _trs_uv=k9wi5ba1_4030_8pj2; Hm_lvt_d7682ab43891c68a00de46e9ce5b76aa=1597140998; UM_distinctid=174ae765669480-09ef6ad0f222cb-4353761-1fa400-174ae76566aa07; goN9uW4i0iKzS=5db92.A0J2CMY23basgx2TZ.mTIJ7lkLr89FeTJ1C0aRMHE_2AokqW2_4RJ42AQplsUcWhHGBKqZ3JYJcp..cRA; __jsluid_h=b6457e19fe1b05edea1f19ada75c9f46; tmas_cookie=2272.7688.15400.0000; 018f9ebcc3834ce269=09b16dacaa2e3b985f2267dc76e5ae8f; arp_scroll_position=0; goN9uW4i0iKzT=53cCT8DqzzRLqqqm67z1U8qKZLtDgJCjEPobnFeBoFOQEkx_Gy09SZfAajrh2D40V2DJdi_T6Yxefkk.TyC5jkWKRfroyI0Ty8DNR0q2gea8MtkfUvIUVuyOffOLFIesbBvkJ4FVJn0c2XCNKuJKF5uWYYxN.9fe9K5lzUFILMY4E2DDUzrR2u3s2n5yMTLQ3QYDyAIEHcwh9210LUxxFmRFBxLVwWtAcBV_6cTdtf3pc22FM8A8bg8AGXagoEJRxfL.Lj2tq4BK8Li.zsiPB6R; JSESSIONID=0000P2s6vou1xkNAr0uRAjbsxE9:1bm112s99",
    "Host": "wsgg.sbj.cnipa.gov.cn:9080",
Example #8
class get_json(object):
    def __init__(self):
        self.db = MongoDB('172.16.74.249:27017', 'db_reptile_company',
                          'company_name')
        self.redis = REDIS(host=RedisHost,
                           port=RedisPort,
                           password=RedisPassword,
                           db=RedisDB)
        self.item = {}

    # MongoDB -> Redis
    def transfer(self):
        dd = self.db.mongo_find({})
        for i in dd:
            item = {}
            item['_id'] = i['_id']
            item['company_name'] = i['company_name']
            b = self.redis.add('coampanylidt', json.dumps(item))
            print('stored successfully', b, item)

    # Baidu enterprise credit: basic company info
    def get_companydetails(self, company_name):
        res1 = s.fetch('https://xin.baidu.com/s?q={}&t=0'.format(
            parse.quote(company_name)))
        href_list = re.findall(r'{"pid":"(\S+)","entName":', res1.text)
        if len(href_list) != 0:
            details_href = 'https://xin.baidu.com//detail//compinfo?pid=' + href_list[
                0]
            company = res1.html.xpath('//a[@class="zx-list-item-url"]/@title')[0]  # matched name, used in the check below
            print(details_href)
            res = s.fetch(details_href)
            # print(res.text)
            # Unified social credit code
            self.item['credit_code'] = res.html.xpath(
                '//td[contains(text(),"统一社会信用代码")]/following-sibling::td[1]/text()',
                first=True)
            # Company registration date
            self.item['register_time'] = res.html.xpath(
                '//*[@class="zx-detail-basic-table"]//td[contains(text(),"成立日期")]/following-sibling::td[1]/text()',
                first=True)
            # Registered capital (registerMoney)
            self.item['register_money'] = res.html.xpath(
                '//td[contains(text(),"注册资本")]/following-sibling::td[1]/text()',
                first=True)
            # Company's industry
            self.item['industry'] = res.html.xpath(
                '//td[contains(text(),"所属行业")]/following-sibling::td[1]/text()',
                first=True)
            # Business status: active / deregistered
            self.item['business_state'] = res.html.xpath(
                '//td[contains(text(),"经营状态")]/following-sibling::td[1]/text()',
                first=True)
            # Organization code
            self.item['organization_code'] = res.html.xpath(
                '//td[contains(text(),"组织机构代码")]/following-sibling::td[1]/text()',
                first=True)
            # Business registration number
            self.item['register_num'] = res.html.xpath(
                '//td[contains(text(),"工商注册号")]/following-sibling::td[1]/text()',
                first=True)
            # Legal representative
            self.item['legal_man'] = res.html.xpath(
                '//td[contains(text(),"法定代表人")]/following-sibling::td[1]/text()',
                first=True)
            # Registration authority
            self.item['regist_organ'] = res.html.xpath(
                '//td[contains(text(),"登记机关")]/following-sibling::td[1]/text()',
                first=True)
            # Approval date
            self.item['confirmtime'] = res.html.xpath(
                '//*[@class="zx-detail-basic-table"]//td[contains(text(),"审核/年检日期")]/following-sibling::td[1]/text()',
                first=True)
            # Business term
            self.item['business_timeout'] = res.html.xpath(
                '//*[@class="zx-detail-basic-table"]//td[contains(text(),"营业期限")]/following-sibling::td[1]/text()',
                first=True)
            # Company type
            self.item['register_address'] = res.html.xpath(
                '//*[@class="zx-detail-basic-table"]//td[contains(text(),"企业类型")]/following-sibling::td[1]/text()',
                first=True)
            # Company address
            self.item['registerAddress'] = res.html.xpath(
                '//*[@class="zx-detail-basic-table"]//td[contains(text(),"注册地址")]/following-sibling::td[1]/text()',
                first=True)
            # Business scope
            self.item['business_scope'] = res.html.xpath(
                '//td[contains(text(),"经营范围")]/following-sibling::td[1]//@data-content',
                first=True)

            self.item['usedName'] = res.html.xpath(
                '//td[contains(text(),"曾用名")]/following-sibling::td[1]/text()',
                first=True)
            # Mode of operation
            # self.item['operation'] = None
            # Source website
            self.item['web_source'] = 'https://xin.baidu.com/'
            # Company name
            self.item['company_name'] = company_name
            # Source page URL
            self.item['company_url'] = details_href
            self.item['_id'] = hashlib.md5(
                (company_name).encode(encoding='utf-8')).hexdigest()
            self.item['web_update_time'] = time.strftime(
                "%Y-%m-%d", time.localtime(int(time.time())))
            # print(self.item)
            # return self.item
            # code 201
            if company_name != company and len(self.item) > 4:
                self.db.mong_find_one_update({"_id": self.item['_id']},
                                             {"flag": "problematic company name"})
                return 'problematic company name --- %s' % self.item['company_name']
            else:
                db1.mongo_add(self.item)
                return '%s inserted successfully !!!!' % self.item['company_name']

        else:
            _id = hashlib.md5(
                (company_name).encode(encoding='utf-8')).hexdigest()
            self.db.mong_find_one_update({"_id": _id}, {"flag": "no matching company name found"})
            return 'no matching company name found --- %s' % company_name
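
# A minimal usage sketch for the class above (the Func helper imports plus the
# Redis/Mongo servers configured in this project are assumed available):
if __name__ == '__main__':
    gj = get_json()
    gj.transfer()  # mirror {_id, company_name} docs from Mongo into Redis
    print(gj.get_companydetails('示例建筑工程有限公司'))  # hypothetical company name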
Example #9
# Test file

# 2021/1/11 Download images, run OCR on them, store the results in Mongo
import requests
from Func.client import MongoDB
from Func.fetchJX import FETCH
from PIL import Image as image_P
import pytesseract
import cv2
import openpyxl
import os

db = MongoDB('mongodb://localhost', 'cuiworkdb', "Shangbiao_GG-1726")
s = FETCH()
# Module: OCR the images and write the results to Excel
# Fetch the images in the "download" folder in numeric order
# Slice each image into small tiles for image-to-text conversion
# Post-process the extracted text to improve recognition accuracy
# Import the text converted from the image data into Mongo
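
# A hedged sketch of the pipeline described above: read the images in numeric
# order, binarize them to raise OCR accuracy, run Tesseract, store the text in
# Mongo. The folder name, preprocessing steps and the 'chi_sim' language pack
# are assumptions, not the original code.
def ocr_folder(folder='download'):
    names = sorted(os.listdir(folder),
                   key=lambda n: int(os.path.splitext(n)[0]))  # numeric order
    for name in names:
        img = cv2.imread(os.path.join(folder, name))
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # Otsu binarization sharpens glyph edges before recognition
        _, binary = cv2.threshold(gray, 0, 255,
                                  cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        text = pytesseract.image_to_string(image_P.fromarray(binary),
                                           lang='chi_sim')
        db.mongo_add({'_id': name, 'text': text})  # one document per image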

### Write the revocation and defense categories into Excel

# Revocation review decision letters
rec_list1 = [
    '发文菩暂菖= 撤锏复市决定书',
    '发文茎鲤= 撤销复市决定书',
    '发文莒鲤= 撒淌氯轲决定书',
]

# Decision on revoking a trademark unused for three consecutive years
rec_list2 = [
Example #10
# Add flag=1 to collections missing the field; these should be collections already pushed, for later incremental crawling (do this only once)
# Set flag=1 on collections that have already been pushed
from Func.client import MongoDB

# do only once
m = MongoDB('mongodb://localhost', 'cuiworkdb', '9guakao_zhengzhou')
m.add_field_for_all()

# change flag=0 to flag=1
m = MongoDB('mongodb://localhost', 'cuiworkdb', 'jianzhutong_beijing')
m.change_flag()
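
# add_field_for_all() and change_flag() are methods on this project's MongoDB
# wrapper and are not shown here. Their assumed semantics in plain pymongo
# would be roughly (a sketch, not the wrapper's actual code):
# from pymongo import MongoClient
# coll = MongoClient('mongodb://localhost')['cuiworkdb']['9guakao_zhengzhou']
# coll.update_many({'flag': {'$exists': False}}, {'$set': {'flag': 1}})  # add_field_for_all
# coll.update_many({'flag': 0}, {'$set': {'flag': 1}})                   # change_flag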



Example #11
#         #     return 'program error'

import hashlib
import json
import re
import time
from concurrent.futures.thread import ThreadPoolExecutor
from urllib import parse

from Func.Parse import FETCH
from Func.client import MongoDB
from Func.conf import *
from Func.Redis import REDIS

s = FETCH()
db1 = MongoDB('172.16.74.249:27017', 'db_reptile_company', 'company_details')


class get_json(object):
    def __init__(self):
        self.db = MongoDB('172.16.74.249:27017', 'db_reptile_company',
                          'company_name')
        self.redis = REDIS(host=RedisHost,
                           port=RedisPort,
                           password=RedisPassword,
                           db=RedisDB)
        self.item = {}

    # MongoDB -> Redis
    def transfer(self):
        dd = self.db.mongo_find({})
Example #12
    def open_spider(self, spider):
        # Runs when the spider starts
        # spider.hello = "world"  # dynamically attach an attribute to the spider object; readable from the spider module
        # good place to open database connections, etc.
        self.Mongo = MongoDB('mongodb://localhost', 'cuiworkdb', "78guakao_changsha")
Example #13
# Export flag=0 documents from the incremental DB into a clues collection for API import
from Func.client import MongoDB
# Import API (append the collection name after '='):  https://dqk.dgg188.cn/api/import/import_data?ip=10.2.1.122:17017&docName=
import requests

m = MongoDB('mongodb://localhost', 'cuiworkdb', 'jianzhutong_hubei')
m2 = MongoDB('10.2.1.121:17017', 'clues_resources', "BMD20210129-zhijiazhuang")
all_data = m.find_many("flag", 0)
for one in all_data:
    m2.mongo_add(one)
m2.del_field()

# Import API
# dbname='jianzhutong_guangzhou'
# data={"ip": "10.2.1.122:17017","docName":dbname}
# headers={
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
# "Accept-Encoding": "gzip, deflate, br",
# "Accept-Language": "zh-CN,zh;q=0.9",
# "Cache-Control": "no-cache",
# "Connection": "keep-alive",
# "Cookie": "sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22172cc1339f4917-0029d8b329026-4353761-2073600-172cc1339f5818%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22%24device_id%22%3A%22172cc1339f4917-0029d8b329026-4353761-2073600-172cc1339f5818%22%7D",
# "Host": "dqk.dgg188.cn",
# "Pragma": "no-cache",
# "Sec-Fetch-Dest": "document",
# "Sec-Fetch-Mode": "navigate",
# "Sec-Fetch-Site": "none",
# "Sec-Fetch-User":"******",
# "Upgrade-Insecure-Requests":"1",
# "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
# }
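# A hedged sketch of actually firing the import call with the config above
# (assumes the endpoint takes ip/docName as query parameters, per the URL
# pattern noted at the top of this file):
# res = requests.get('https://dqk.dgg188.cn/api/import/import_data',
#                    params=data, headers=headers)
# print(res.text)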
Example #14
# Deduplication strategy file
from Func.client import MongoDB
from md5encode import is_phone
m1 = MongoDB('mongodb://localhost', 'cuiworkdb', "9guakao_zhengzhou")
m2 = MongoDB('10.2.1.121:17017', 'clues_resources', "BMD_sort")
m3 = MongoDB('10.2.1.121:17017', 'clues_resources', "test")
m4 = MongoDB('10.2.1.121:17017', 'clues_resources', "BMD_20210205_push")
m5 = MongoDB('10.2.1.121:17017', 'clues_resources', "test")

# m5 = MongoDB('10.2.1.121:17017', 'clues_resources', "jianzhutong_shengzheng")
# m5 = MongoDB('10.2.1.121:17017', 'clues_resources', "BMD20201224-4")
# Import into clues; only data there can be pushed
# m1.mong_find_many_updata({"companyCity": "成都"}, {"isDir": 0})

# Dedup strategy 1: gather all small collections into the big BMD_sort collection and dedupe by _id
# all_data = m1.find_all()
# for i in all_data:
#     m2.mongo_add(i)

# Dedup strategy 2: dedupe companies with identical names
# first version, deprecated
# list_data = list(m2.find_all())
# for i in range(len(list_data)):
#    for k in range(i+1, len(list_data)):
#        if list_data[i]["companyName"] == list_data[k]["companyName"]:
#            list_data[k]["companyName"] = "None"

# list_company_name = []
# list_data = m2.find_all()
# for i in list_data:
#     list_company_name.append(i["companyName"])
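
# A hedged sketch of completing strategy 2 with a set, replacing the O(n^2)
# first version above (collection handles m2/m3 as defined at the top):
# seen = set()
# for doc in m2.find_all():
#     if doc["companyName"] in seen:
#         continue                      # duplicate company name -> drop
#     seen.add(doc["companyName"])
#     m3.mongo_add(doc)                 # keep only the first occurrence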
Example #15
class Gkspider:
    def __init__(self):
        self.starturl = 'http://www.80guakao.com/shengfen/hb/zhaopinxinxi/'
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'
        }
        self.f = FETCH()
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', "80guakao_hb")
        self.r0 = Redisclient(0)
        self.r1 = Redisclient(1)
        self.r2 = Redisclient(2)
        self.r3 = Redisclient(3)
        self.category_name_list = []
        self.sec_category_dict = {}
        self.headers_forpage = {
            "Host": "www.80guakao.com",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
            "Accept": "*/*",
            "Referer": "http://www.80guakao.com/shengfen/hb/",
            "Accept-Encoding": "gzip,deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cookie": "",
        }

    def get_category(self):
        html = self.f.fetch(url=self.starturl,
                            headers=self.headers,
                            method='get')
        # html = requests.get(url=self.starturl, headers=self.headers)
        sleep(random.randint(0, 1))
        res = etree.HTML(html.text)
        # print(html.text)

        # category_url_list = res.xpath('//div[@class="content"]//div//a')
        # # if len(category_url_list) > 19:
        # category_url_list = res.xpath('//div[@class="inner"][1]//ul[1]//a')
        category_url_list = res.xpath(
            '//div[@class="categories"]//ul//li[1]//dd[1]//a')
        for i in category_url_list:
            category_name = i.xpath('./text()')[0]
            category_url = i.xpath('./@href')[0]
            category_url = category_url.replace('m.', 'www.')
            if category_name != "不限":
                self.r0.save_category_url(category_name, category_url)
                self.category_name_list.append(category_name)

    def get_sec_category(self):
        for category_name in self.category_name_list:

            url = self.r0.get_category_url(category_name)
            # html = self.f.fetch(url=url,headers=self.headers,method='get')
            html = requests.get(url=url, headers=self.headers_forpage)
            sleep(random.randint(0, 1))
            res = etree.HTML(html.text)

            sec_category_list = res.xpath('//div[@class="content"]//div//a')
            # sec_category_list = res.xpath('//div[@class="inner"][1]//ul//a')

            for i in sec_category_list:
                sec_category_name = i.xpath('./text()')[0]
                sec_category_url = i.xpath('./@href')[0]
                sec_category_url = sec_category_url.replace('m.', 'www.')
                if sec_category_name != '不限':
                    print(sec_category_name)
                    self.r1.save_one_dict(category_name, sec_category_name,
                                          sec_category_url)

    def get_all_page(self):
        for category in self.category_name_list:
            sec_category_list = self.r1.get_keys(category)

            for sec_category_name, url in sec_category_list.items():
                # html = self.f.fetch(url=url.decode(),headers=self.headers_forpage,method='get')
                html = requests.get(url=url.decode(),
                                    headers=self.headers_forpage)
                sleep(random.randint(0, 1))
                res = etree.HTML(html.text)
                self.r2.save_page_url(
                    category + ":" + sec_category_name.decode(), url.decode())
                while True:
                    try:
                        next_page = res.xpath(
                            '//div[@class="pagination2"]//a[contains(text(),"下一页")]/@href'
                        )[0]
                    except:
                        break
                    if not next_page:
                        break

                    self.r2.save_page_url(
                        category + ":" + sec_category_name.decode(), next_page)
                    html_next = self.f.fetch(url=next_page,
                                             headers=self.headers_forpage,
                                             method='get')
                    # html_next = requests.get(url=next_page, headers=self.headers_forpage)
                    sleep(random.randint(0, 1))
                    res = etree.HTML(html_next.text)

    def get_item_url(self):
        for category in self.category_name_list:
            sec_category_list = self.r1.get_keys(category)
            for sec_category_name in sec_category_list:
                while True:
                    try:
                        url = self.r2.get_page_url(category + ":" +
                                                   sec_category_name.decode())
                        # html = self.f.fetch(url=url, headers=self.headers,method='get')
                        html = requests.get(url=url,
                                            headers=self.headers_forpage)
                        sleep(random.randint(1, 2))
                        res = etree.HTML(html.text)
                    except Exception as e:
                        print('error:', e)
                        break
                    # item_list = res.xpath('//li[@class="Tz"]//child::*/a/@href')
                    item_list = res.xpath(
                        '/html/body/div[7]/div[5]/div/div[6]/div[4]/div[3]/ul/div/span[1]/a/@href'
                    )

                    for item_url in item_list:
                        # if 'tel' not in item_url:
                        #     url = item_url.replace('m.', 'www.') # per-item data URL
                        if 'http' not in item_url:
                            item_url = 'http://www.80guakao.com/' + item_url
                        self.r3.save_item_url(
                            category + ':' + sec_category_name.decode(),
                            item_url)

    def get_info(self):
        # print(res.xpath('//ul[@class="attr_info"]//li//span[@class="attrVal"]/text()')[0]) # company name
        # print(res.xpath('//ul[@class="attr_info bottom"]//li//span[@class="attrVal"]//a/text()')[0]) # phone
        # print(res.xpath('//ul[@class="attr_info bottom"]//li//span[@class="attrVal"]/text()')[0])  # contact name
        for category in self.category_name_list:
            sec_category_list = self.r1.get_keys(category)
            for sec_category_name in sec_category_list:
                while True:
                    try:
                        url = self.r3.get_item_url(category + ":" +
                                                   sec_category_name.decode())

                        html = requests.get(url=url.decode(),
                                            headers=self.headers_forpage)
                        sleep(random.randint(0, 1))
                        if html.status_code != 200:
                            html = self.f.fetch(url=url.decode(),
                                                headers=self.headers_forpage,
                                                method='get')
                            sleep(random.randint(0, 1))
                        res = etree.HTML(html.text)

                    except:
                        break
                    item = {}
                    # try:
                    #     company_name = res.xpath('//ul[@class="attr_info"]//li//span[@class="attrVal"][1]/text()')[0]
                    # except:
                    try:
                        company_name = res.xpath(
                            '//div[@class="zhaopiner"]//li//span[contains(text(),"公司名称")]/parent::li/text()'
                        )[0]

                    except:
                        company_name = 'None'

                    # try:
                    #     contact_people = res.xpath('//ul[@class="attr_info bottom"]//li[2]//span[@class="attrVal"]/text()')[0]
                    #     contact_people = contact_people.replace(r'\xa0\xa0','')
                    #
                    # except:
                    contact_people = res.xpath(
                        '//ul[@class="contacter"]//li//font/text()')[0]

                    # try:
                    #     perf_request = res.xpath('//div[@class="zhaopiner"]//li//span[contains(text(),"专业要求")]/parent::li/text()')[0]
                    # except:
                    #
                    #     perf_request = res.xpath('//ul[@class="attr_info"]//li//span[@class="attrVal"][11]//text()')[0]
                    #

                    # try:
                    #     phone = res.xpath('//ul[@class="attr_info"]//li//span[@class="attrVal"][11]//a/text()')[0]
                    #     if phone == []:
                    #         raise  Exception
                    # except:

                    # try:
                    phone_url_re = res.xpath(
                        '//ul[@class="contacter"]//li[@class="qqbm"]/a/@onclick'
                    )[0]

                    par = re.compile("'.*?'")
                    phone_url = re.findall(par, phone_url_re)[1].replace(
                        "'", "")  # phone-number URL

                    if type(phone_url) == str:
                        html = requests.get(url=phone_url,
                                            headers=self.headers_forpage)
                    else:
                        html = requests.get(url=phone_url.decode(),
                                            headers=self.headers_forpage)
                        sleep(random.randint(0, 1))
                    res = etree.HTML(html.text)
                    phone = res.xpath(
                        '//div[@class="number"]//span[@class="num"]/text()')[0]
                    # except:
                    #     phone = "None"

                    item['companyCity'] = '宜昌'
                    item['companyProvince'] = '湖北省'
                    item['code'] = 'BUS_YT_ZZ'
                    item['name'] = '资质'
                    item['busCode'] = ''
                    item['webUrl'] = '无'
                    item['orgId'] = ''
                    # department ID (string)
                    item['deptId'] = ''
                    # center ID (string)
                    item['centreId'] = ''
                    # item["first_category"] = category
                    # item["sec_category"] = sec_category_name.decode()
                    item["companyName"] = company_name
                    item["outName"] = contact_people
                    item["resourceRemark"] = category + ":" + sec_category_name.decode()
                    item["companyTel"] = phone.strip()
                    if len(contact_people) == 11:
                        item["companyTel"] = contact_people
                    item["ibossNum"] = None
                    item['isDir'] = 0
                    item['isShare'] = 0
                    item["_id"] = md5encryption(item["companyTel"])
                    print(item)
                    self.m.mongo_add(item)

    def test(self):
        url = 'http://www.80guakao.com/shengfen/sc/gonglugongcheng/23988.html'
        html = requests.get(url=url, headers=self.headers_forpage)
        print(html.text)
        res = etree.HTML(html.text)
        # print(res.xpath('//div[@class="pagination2"]//a[contains(text(),"下一页")]/@href'))
        # print(res.xpath('//div[@class="content"]//div//a/text()'))
        # print(html.text)
        # print(res.xpath('/html/body/div[7]/div[5]/div/div[6]/div[4]/div[3]/ul/div/span/a/@href'))
        # print(res.xpath('//div[@class="zhaopiner"]//li//span[contains(text(),"公司名称")]/parent::li/text()')[0]) # company name
        # print(res.xpath('//div[@class="zhaopiner"]//li//span[contains(text(),"专业要求")]/parent::li/text()')) # specialty requirement
        # print(res.xpath('//ul[@class="contacter"]//li//font/text()')[0]) # contact person
        phone_url_re = res.xpath(
            '//ul[@class="contacter"]//li[@class="qqbm"]/a/@onclick')[0]  # phone number

        print(phone_url_re)
        par = re.compile("'.*?'")
        phone_url = re.findall(par, phone_url_re)[1].replace("'", "")  # phone-number URL
        html = requests.get(url=phone_url, headers=self.headers_forpage)
        res = etree.HTML(html.text)
        phone = res.xpath(
            '//div[@class="number"]//span[@class="num"]/text()')[0]
        print(phone)
        #Request URL: http://www.80guakao.com/box.php?part=seecontact_tel&id=54336&tel_base64=MTk5NTA0NTk5Mjc=
        # print(res.xpath('/html/body/div[7]/div[5]/div/div[6]/div[4]/div[3]/ul/div/span[1]/a/@href'))

    def run(self):
        self.get_category()
        self.get_sec_category()
        self.get_all_page()
        self.get_item_url()
        self.get_info()
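
# A minimal entry point for the class above (assumes the Func helpers, a local
# MongoDB and the four Redis DBs are reachable):
if __name__ == '__main__':
    Gkspider().run()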
Example #16
import hashlib
import json
import time
from urllib import parse

from Func.Parse import FETCH
from Func.Tyc3 import FETCH1
from lxml import etree
from Func.client import MongoDB

s = FETCH()
s1 = FETCH1()
db = MongoDB('172.16.74.249:27017', 'creditCode', 'KJ')


class GetCode(object):
    def __init__(self):
        self.item = {}
        self.item1 = {}  # Credit China
        self.item2 = {}  # Tianyancha
        self.item3 = {}
        self.code = {
            '200': 'request succeeded',
            '203': 'request failed, please retry',
            '204': 'queried company is inexact or does not exist'
        }

    def Mongo(self, Item):
        db.mongo_add(Item)

    # Baidu enterprise credit: basic company info
Example #17
class JanzhuSpider():
    def __init__(self, start_url, cookie, referer, companyCity,
                 companyProvince, db):
        self.start_url = start_url
        self.companyCity = companyCity
        self.companyProvince = companyProvince
        self.headers = {
            # ":authority": "www.cbi360.net",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip,deflate,br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8",
            "Cookie": cookie,
            # "Cookie": "",
            "pragma": "no-cache",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "******",
            "upgrade-insecure-requests": "1",
            "Referer": referer,
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
        }
        self.r0 = Redisclient(0)
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', db)
        # self.f = FETCH()
        self.par = re.compile(r'\d+-\d+')
        self.par2 = re.compile(r'\d+')

    def parse_next_page(self):
        self.r0.save_page_url(category_name="北京", page_url=self.start_url)
        # html = self.f.fetch(url=self.start_url, headers=self.headers, method='get')
        html = requests.get(url=self.start_url, headers=self.headers)
        sleep(2)
        while True:
            res = etree.HTML(html.text)
            try:
                next_page = res.xpath(
                    '//ul[@class="pagination"]//li//a[contains(text(),"下一页")]/@href'
                )
                print(next_page)
                next_page = 'https://www.cbi360.net' + next_page[0]
            except Exception as e:
                print(e)
                print(html.text)
                break
            self.r0.save_page_url(category_name="北京", page_url=next_page)
            self.parse_item(res)
            # html = self.f.fetch(url=next_page, headers=self.headers, method='get')
            html = requests.get(url=next_page, headers=self.headers)
            sleep(1)

    def re_phone(self, target):
        try:
            phone = re.findall(self.par, target)[0]
        except:
            print(target)
            try:
                phone = re.findall(self.par2, target)[0]
            except:
                phone = ''
        return phone

    def parse_item(self, res):
        # //dl[@class="table—con-bottom clear"]//dd[@class="w-18"][2]
        # while True:
        # try:
        #     # url = self.r0.get_page_url(category_name='北京')
        #     # html = self.f.fetch(url=url, headers=self.headers, method='get')
        #     # html = requests.get(url=url, headers=self.headers)
        # except:
        #     continue
        sleep(1)
        # res = etree.HTML(html.text)
        companyName_list = res.xpath(
            '//ul[@class="table-con-top clear search-word"]//li[@style]//preceding-sibling::* //a[@target="_blank"]/text()'
        )
        phone_list = res.xpath(
            '//dl[@class="table—con-bottom clear"]//dd[@class="w-18"][2]/text()'
        )
        for i in range(len(companyName_list)):
            item = {}
            companyName = companyName_list[i]
            phone = self.re_phone(phone_list[i])
            if is_phone(phone):
                item['companyCity'] = self.companyCity
                item['companyProvince'] = self.companyProvince
                item['code'] = 'BUS_YT_ZZ'
                item['name'] = '资质'
                item['busCode'] = ''
                item['webUrl'] = '无'
                item['orgId'] = ''
                item['deptId'] = ''
                item['centreId'] = ''
                item["companyName"] = companyName
                item["outName"] = ''
                item["resourceRemark"] = ''
                item["companyTel"] = phone
                item["ibossNum"] = None
                item['isDir'] = 0
                item['isShare'] = 0
                item["_id"] = md5encryption(item["companyTel"])
                item["flag"] = 0
                print(item)
                self.m.mongo_add(item)

    def run(self):
        self.parse_next_page()
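
# A hedged usage sketch; the start URL, cookie and referer are placeholders,
# not values from the original project:
if __name__ == '__main__':
    spider = JanzhuSpider(
        start_url='https://www.cbi360.net/...',  # hypothetical listing page
        cookie='',                               # paste a fresh session cookie
        referer='https://www.cbi360.net/',
        companyCity='北京',
        companyProvince='北京市',
        db='jianzhutong_beijing',                # target Mongo collection
    )
    spider.run()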
Example #18
# 2021/1/11 Download images, run OCR on them, store the results in Mongo
import requests
from Func.client import MongoDB
from Func.fetchJX import FETCH
from PIL import Image as image_P
import pytesseract
import cv2
import openpyxl
import os
import sys

db = MongoDB('mongodb://localhost', 'cuiworkdb', "Shangbiao_GG-1731")
s = FETCH()

# Module: OCR the images and write the results to Excel
# Fetch the images in the "download" folder in numeric order
# Slice each image into small tiles for image-to-text conversion
# Post-process the extracted text to improve recognition accuracy
# Import the text converted from the image data into Mongo

### Write the revocation and defense categories into Excel

# Revocation review decision letters
rec_list1 = [
    '发文菩暂菖= 撤锏复市决定书',
    '发文茎鲤= 撤销复市决定书',
    '发文莒鲤= 撒淌氯轲决定书',
]

# Decision on revoking a trademark unused for three consecutive years
rec_list2 = [
Example #19
from Func.client import MongoDB

m1 = MongoDB('mongodb://localhost', 'cuiworkdb', "jianzhutong_shanghai")
m2 = MongoDB('10.2.1.121:17017', 'clues_resources', "jianzhutong_shanghai")
# m3 = MongoDB('10.2.1.121:17017', 'clues_resources', "jianzhutong_guangzhou")
# m4 = MongoDB('10.2.1.121:17017', 'clues_resources', "jianzhutong_foshan")
# m5 = MongoDB('10.2.1.121:17017', 'clues_resources', "jianzhutong_shengzheng")

# m5 = MongoDB('10.2.1.121:17017', 'clues_resources', "BMD20201224-4")
# Import into clues; only data there can be pushed

# m1.mong_find_many_updata({"companyCity": "成都"}, {"isDir": 0})

# all_data = m1.find_all()
# for i in all_data:
#     print(i)

count = 0
gd_data = m1.find_all()
for i in gd_data:
    # if count <= 3000:
    m2.mongo_add(i)
# elif count <= 6000:
#     m3.mongo_add(i)
# elif count<=9000:
#     m4.mongo_add(i)
# else:
#     m5.mongo_add(i)
# count += 1
#all_data = m3.find_all()
Example #20
# Dead-number detection file
from Func.client import MongoDB

db = MongoDB('10.2.1.121:17017', 'clues_resources', "BMD_sort")
db2 = MongoDB('mongodb://localhost', 'cuiworkdb', 'BMD20210201-chengdu-check')

all_data = db.find_all()
# all_data=db.find_many("flag",0)
# Export the phone numbers from Mongo to a txt file for dead-number checking
with open('BMD_sort', 'w') as f:
    for i in all_data:
        phone = i["companyTel"]
        f.write(str(phone) + '\n')

# Import the active (real) numbers into the xxxx-check collection
# huoyue_list = []
# with open('活跃号(实号).txt','r',encoding='utf-8') as f:
#     for line in f:
#         huoyue_list.append(line.strip())
#
# for i in all_data:
#     if i["companyTel"] in huoyue_list:
#         db2.mongo_add(i)