示例#1
0
 def get_info(self, rc_info_li, com_id, page):
     """Print and store every recruitment row found on one result page.

     Args:
         rc_info_li: Row elements; each one is queried with the relative
             XPath expressions ``td[1]`` .. ``td[7]``.
         com_id: Company ID written alongside every row.
         page: 1-based page number; record numbering starts at
             ``(page - 1) * 10``.

     Returns:
         The running record number after the last row of this page.

     Raises:
         IndexError: If a row lacks one of the expected cells.
     """
     def cell_text(row, path):
         # First XPath hit of the row, whitespace-trimmed.
         return row.xpath(path)[0].strip()

     first_number = (page - 1) * 10 + 1
     count = first_number - 1  # returned unchanged when the page is empty
     dashes = '-' * 9
     for count, info in enumerate(rc_info_li, first_number):
         # The job ID is whatever follows "jobdetail_" in the link target.
         href = info.xpath('td[3]/a/@href')[0]
         job_id = href.split('jobdetail_')[1].strip()
         rc_num, pub_date, rc_job, salary, education, we, city = (
             cell_text(info, path) for path in (
                 'td[1]/text()', 'td[2]/text()', 'td[3]/a/text()',
                 'td[4]/text()', 'td[5]/text()', 'td[6]/text()',
                 'td[7]/text()'))
         print(f'\n{dashes}--总第{count}条----第{page}页----{dashes}\n')
         create_time = tm().get_localtime()  # current time
         print(f'当前时间:{create_time}')
         print(
             f'公司ID:{com_id}\n序号:{rc_num}\n岗位ID:{job_id}\n岗位名称:{rc_job}\n发布时间:{pub_date}\n'
             f'薪资:{salary}\n学历:{education}\n工作经历:{we}\n城市:{city}\n')
         ins = f"""
         INSERT INTO com_recruit 
         (com_id,job_id,rc_num,pub_date,rc_job,
         salary,education,we,city,create_time)
         VALUES 
         ("{com_id}","{job_id}","{rc_num}","{pub_date}","{rc_job}",
         "{salary}","{education}","{we}","{city}","{create_time}");
         """
         db().inssts(ins)
     return count
示例#2
0
 def parse_info(self, com_id, tree):
     """Store the rows of the "Mainmember" table found in ``tree``.

     Args:
         com_id: Company ID written alongside every row.
         tree: Parsed page supporting ``.xpath()``, or ``None`` when there is
             nothing to parse (a notice is printed and nothing is stored).

     Raises:
         IndexError: If a row lacks one of the expected cells.
     """
     if tree is None:
         print('无相关数据!')
     else:
         # Skip the first <tr> of the table (position()>1).
         member_li = tree.xpath(
             '//section[@id="Mainmember"]/table[contains(@class,"ntable ntable-odd")]/tr[position()>1]'
         )
         for count, member_info in enumerate(member_li, 1):
             member_num = member_info.xpath('td[1]/text()')[0].strip()
             member_name = member_info.xpath(
                 'td[2]//*[@class="seo font-14"]/text()')[0].strip()
             member_post = member_info.xpath('td[3]/text()')[0].strip()
             create_time = tm().get_localtime()  # current time
             print('\n{0}--总第{1}条----{0}\n'.format('-' * 9, count))
             print(f'当前时间:{create_time}')
             print(
                 f'公司ID:{com_id}\n序号:{member_num}\n姓名:{member_name}\n职务:{member_post}\n'
             )
             # NOTE(review): scraped values are interpolated into the SQL
             # text unescaped; a double quote in a name breaks the statement.
             # Switch to bound parameters if the db helper supports them.
             ins = f"""
             INSERT INTO `com_main_member`
             (com_id,member_num,member_name,member_post,create_time)
             VALUES 
             ("{com_id}","{member_num}","{member_name}","{member_post}","{create_time}");
             """
             self.db.inssts(ins)
示例#3
0
 def __init__(self):
     """Set the site root URL and keep the results of the helper factories.

     ``db``, ``dk``, ``gh``, ``tm`` and ``gm`` are each called once, in that
     order, and the returned objects are stored under the same names.
     """
     # Plain constant first; the helper objects follow in their original order.
     self.index_url = 'https://www.qcc.com'
     self.db = db()
     self.dk = dk()
     self.gh = gh()
     self.tm = tm()
     self.gm = gm()
 def cc_judge(self):
     """Keep fetching companies until one shows "[信用中国]" licence records.

     Each round asks ``AdmLicenseCc().adm_license_judge()`` for the next
     company, downloads its profile page, reads the number shown next to the
     "[信用中国]" heading and passes it to ``cd.upd_status``. The loop ends as
     soon as that number is non-zero.

     Returns:
         Tuple ``(com_id, com_name, count_cc)`` for the company that ended
         the loop. ``com_id`` and ``com_name`` are also written to the
         module-level globals of the same names.
     """
     global com_id, com_name
     alb = AdmLicenseCc()
     count_cc = 0
     count = 0
     while count_cc == 0:
         result = alb.adm_license_judge()
         com_id = result[0]
         com_name = result[1]
         if com_id is None:
             # No usable company this round; ask for the next one.
             continue
         count += 1
         com_url = f'https://www.qcc.com/firm_{com_id}.html'
         hds = gh().header()
         time.sleep(random.randint(3, 5))  # throttle requests
         res = requests.get(com_url, headers=hds).text
         # The three markers below identify block/redirect pages; the run is
         # paused until the operator presses Enter.
         if '<script>window.location.href' in res:
             print('访问频繁,需验证!{cc_judge}')
             input('暂停')
         elif '<script>location.href="/user_login"</script>' in res:
             print('Cookie失效,需更换!{cc_judge}')
             input('程序暂停运行!')
         elif '您的账号访问超频,请稍后访问或联系客服人员' in res:
             print('账号访问超频,请更换账号!{cc_judge}')
             input('程序暂停运行!')
         else:
             tree = etree.HTML(res)
             try:
                 count_cc = int(tree.xpath(
                     '//div[@class="tcaption"]/h3[contains(text(),"[信用中国]")]/following-sibling::span[1]/text()'
                 )[0])
             except (IndexError, ValueError, AttributeError):
                 # Heading missing, non-numeric text, or no parsed tree:
                 # treat as "no records".
                 count_cc = 0
             localtime = tm().get_localtime()  # current time
             print(localtime)
             # NOTE(review): the messages say "[工商局]" while the XPath reads
             # the "[信用中国]" section -- confirm which label is intended.
             if count_cc == 0:
                 print(f'计数器:{count}\n公司ID:{com_id}\n行政许可信息[工商局]条数:无')
             else:
                 print(
                     f'计数器:{count}\n公司ID:{com_id}\n行政许可信息[工商局]条数:{count_cc}'
                 )
             status_column = 'status_credit_adm_license_cc'  # table column name
             count_column = 'count_credit_adm_license_cc'  # table column name
             # NOTE(review): `cd` is not bound in this method; presumably a
             # module-level object -- confirm.
             cd.upd_status(com_id, status_column, count_column,
                           count_cc)
     return com_id, com_name, count_cc
示例#5
0
 def parse_info(self, tree, com_id, com_name, page, sh_page_count):
     """Print and store the stockholder rows found in ``tree``.

     Args:
         tree: Parsed page supporting ``.xpath()``, or ``None`` when there is
             nothing to parse (a notice is printed and nothing is stored).
         com_id: Company ID written alongside every row.
         com_name: Company name, used for console output only.
         page: 1-based page number; record numbering starts at
             ``(page - 1) * 50``.
         sh_page_count: Total page count, used for console output only.

     Raises:
         IndexError: If a row lacks one of the first five cells.
     """
     def optional_text(row, *paths):
         # Text of the first path that yields a hit; '--' when none does.
         for path in paths:
             try:
                 return row.xpath(path)[0].strip()
             except IndexError:
                 continue
         return '--'

     count = (page - 1) * 50
     if tree is None:
         print('无相关数据!\n')
     else:
         # Skip the first <tr> of the table (position()>1).
         stockholder_li = tree.xpath('//table[contains(@class,"ntable ntable-odd npth")]/tr[position()>1]')
         for stockholder_info in stockholder_li:
             count += 1
             stockholder_num = stockholder_info.xpath('td[1]/text()')[0].strip()
             stockholder_name = stockholder_info.xpath('td[2]//*[@class="seo font-14"]/text()')[0].strip()
             stockholder_rate = stockholder_info.xpath('td[3]/text()')[0].strip()
             subscribed_capital_amount = stockholder_info.xpath('td[4]/text()')[0].strip()
             subscribed_capital_date = stockholder_info.xpath('td[5]/text()')[0].strip()
             # Columns 6-8 may be absent; fall back to '--'.
             contributed_capital_amount = optional_text(stockholder_info, 'td[6]/text()')
             contributed_capital_date = optional_text(stockholder_info, 'td[7]/text()')
             # Column 8 holds either plain text or a link.
             relation_product = optional_text(
                 stockholder_info, 'td[8]/text()', 'td[8]/a/text()')
             create_time = tm().get_localtime()  # current time
             print('\n{0}--总第{1}条----第{2}/{3}页----{0}\n'.format('-' * 9, count, page,sh_page_count))
             print(f'公司ID:{com_id}\n公司名称:{com_name}')
             print(f'序号:{stockholder_num}\n股东:{stockholder_name}\n持股比例:{stockholder_rate}\n认缴出资额:{subscribed_capital_amount}\n认缴出资日期:{subscribed_capital_date}\n'
                   f'实缴出资额:{contributed_capital_amount}\n实缴出资日期:{contributed_capital_date}\n关联产品/机构:{relation_product}\n')
             # NOTE(review): scraped values are interpolated into the SQL
             # text unescaped; switch to bound parameters if the db helper
             # supports them.
             ins = f"""
             INSERT INTO `com_stockholder`
             (com_id,stockholder_num,stockholder_name,stockholder_rate,subscribed_capital_amount,
             subscribed_capital_date,contributed_capital_amount,contributed_capital_date,relation_product,create_time)
             VALUES 
             ("{com_id}","{stockholder_num}","{stockholder_name}","{stockholder_rate}","{subscribed_capital_amount}",
             "{subscribed_capital_date}","{contributed_capital_amount}","{contributed_capital_date}","{relation_product}","{create_time}");
             """
             db().inssts(ins)
 def adm_license_judge(self):
     """Keep fetching companies until one shows administrative-licence records.

     Each round asks ``AdmLicense().get_com_id()`` for the next company,
     downloads its profile page, reads the licence count from the navigation
     bar and passes it to ``upd_status``. The loop ends once the count is
     neither 0 nor -1 (-1 marks "count could not be read").

     Returns:
         Tuple ``(com_id, com_name, count_adm_license)`` for the company that
         ended the loop. ``com_id`` and ``com_name`` are also written to the
         module-level globals of the same names.
     """
     global com_id, com_name
     al = AdmLicense()
     count_adm_license = 0
     count = 0
     while count_adm_license in (0, -1):
         result = al.get_com_id()
         com_id = result[0]
         com_name = result[1]
         if com_id is None:
             # No usable company this round; ask for the next one.
             continue
         count += 1
         com_url = f'https://www.qcc.com/firm_{com_id}.html'
         hds = gh().header()
         time.sleep(random.randint(3, 5))  # throttle requests
         res = requests.get(com_url, headers=hds).text
         # The three markers below identify block/redirect pages; the run is
         # paused until the operator presses Enter.
         if '<script>window.location.href' in res:
             print('访问频繁,需验证!{adm_license_judge}')
             input('暂停')
         elif '<script>location.href="/user_login"</script>' in res:
             print('Cookie失效,需更换!{adm_license_judge}')
             input('程序暂停运行!')
         elif '您的账号访问超频,请稍后访问或联系客服人员' in res:
             print('账号访问超频,请更换账号!{adm_license_judge}')
             input('程序暂停运行!')
         else:
             tree = etree.HTML(res)
             try:
                 count_adm_license = int(tree.xpath(
                     '//div[@class="company-nav-items"]/span[contains(text(),"行政许可")]/span/text()|//div[@class="company-nav-items"]/a[@data-pos="licenslist"]/span/text()'
                 )[0])
             except (IndexError, ValueError, AttributeError):
                 # Nav item missing, non-numeric text, or no parsed tree.
                 count_adm_license = -1
             localtime = tm().get_localtime()  # current time
             print(localtime)
             if count_adm_license in (0, -1):
                 print(f'计数器:{count}\n公司ID:{com_id}\n行政许可信息条数:无')
             else:
                 print(
                     f'计数器:{count}\n公司ID:{com_id}\n行政许可信息条数:{count_adm_license}'
                 )
             status_column = 'status_credit_adm_license'  # table column name
             count_column = 'count_credit_adm_license'  # table column name
             al.upd_status(com_id, status_column, count_column,
                           count_adm_license)
     return com_id, com_name, count_adm_license
 def faith_execued_judge(self):
     """Keep fetching companies until one shows dishonest-debtor records.

     Each round asks ``FaithExecued().get_com_id()`` for the next company,
     downloads its profile page, reads the "失信信息" count from the navigation
     bar and passes it to ``cd.upd_status_execued``. The loop ends once the
     count is neither 0 nor -1 (-1 marks "count could not be read").

     Returns:
         Tuple ``(com_id, com_name, count_breach_of_faith_execued)`` for the
         company that ended the loop. ``com_id`` and ``com_name`` are also
         written to the module-level globals of the same names.
     """
     global com_id, com_name
     fe = FaithExecued()
     count_breach_of_faith_execued = 0
     count = 0
     while count_breach_of_faith_execued in (0, -1):
         result = fe.get_com_id()
         com_id = result[0]
         com_name = result[1]
         if com_id is None:
             # No usable company this round; ask for the next one.
             continue
         count += 1
         com_url = f'https://www.qichacha.com/firm_{com_id}.html'
         hds = gh().header()
         time.sleep(random.randint(3, 5))  # throttle requests
         res = requests.get(com_url, headers=hds).text
         # The three markers below identify block/redirect pages; the run is
         # paused until the operator presses Enter.
         if '<script>window.location.href' in res:
             print('访问频繁,需验证!{faith_execued_judge}')
             input('暂停')
         elif '<script>location.href="/user_login"</script>' in res:
             print('Cookie失效,需更换!{faith_execued_judge}')
             input('程序暂停运行!')
         elif '您的账号访问超频,请稍后访问或联系客服人员' in res:
             print('账号访问超频,请更换账号!{faith_execued_judge}')
             input('程序暂停运行!')
         else:
             tree = etree.HTML(res)
             try:
                 count_breach_of_faith_execued = int(tree.xpath(
                     '//div[@class="company-nav-items"]/span[contains(text(),"失信信息")]/span/text()|//div[@class="company-nav-items"]/a[@data-pos="shixinlist"]/span/text()'
                 )[0])
             except (IndexError, ValueError, AttributeError):
                 # Nav item missing, non-numeric text, or no parsed tree.
                 count_breach_of_faith_execued = -1
             localtime = tm().get_localtime()  # current time
             print(localtime)
             # The second operand used to be a misspelled, undefined name and
             # raised NameError whenever the count was non-zero.
             if count_breach_of_faith_execued in (0, -1):
                 print(f'计数器:{count}\n公司ID:{com_id}\n失信被执行人信息条数:无')
             else:
                 print(
                     f'计数器:{count}\n公司ID:{com_id}\n失信被执行人信息条数:{count_breach_of_faith_execued}'
                 )
             # NOTE(review): `cd` is not bound in this method; presumably a
             # module-level object -- confirm.
             cd.upd_status_execued(com_id,
                                   count_breach_of_faith_execued)
     return com_id, com_name, count_breach_of_faith_execued
示例#8
0
 def get_count_rc(self, count_rc, key, count):
     """Replace a rough recruitment count with the exact one and record it.

     When ``count_rc`` is positive, the company's "run" tab is downloaded and
     the number following "招聘" in the job-list link text becomes the new
     ``count_rc``. In every case the count is handed to ``gm.upd_status``.

     Args:
         count_rc: Rough recruitment count; values <= 0 skip the download.
         key: Company-name key placed in the request URL.
         count: Running counter, used for console output only.

     Returns:
         Tuple ``(count_rc, res)``. ``res`` is the module-level global; it is
         only assigned here when a page was downloaded.
     """
     global res
     # `com_id` is read from module scope; it is not a parameter.
     if count_rc > 0:
         info_url = f'https://www.qichacha.com/company_getinfos?unique={com_id}&companyname={key}&tab=run'
         headers = gh().header()
         time.sleep(random.randint(3, 5))  # throttle requests
         res = requests.get(info_url, headers=headers).text
         # presumably gm.verify returns the parsed tree -- confirm
         link_text = gm.verify(res).xpath(
             '//a[contains(@onclick,"#joblist")]/text()')[0]
         count_rc = int(link_text.split('招聘')[1].strip())
         print(tm().get_localtime())  # current time
         print(f'计数器:{count}\n公司ID:{com_id}\n招聘岗位数:{count_rc}')
     # Arguments 2 and 3 are table column names.
     gm.upd_status(com_id, 'status_recruit', 'count_recruit', count_rc)
     return count_rc, res
 def get_page_info(self):
     """Download one "[信用中国]" licence page and print every record on it.

     The company comes from ``AdmLicenseCc().cc_judge()``. For each table row
     the list cells are read and the detail record is requested from
     ``/company_xzxukeView``; the combined record is printed and the run then
     waits for Enter. Nothing is written to the database here.

     The six detail fields are module-level globals (see the ``global``
     statement) and are overwritten for every row.

     NOTE(review): ``page`` and ``count_page`` are used below but never bound
     in this method; presumably module-level names -- confirm, otherwise this
     raises NameError.
     """
     global project_name, license_status, license_content, expire_time, approval_category, area
     alb = AdmLicenseCc()
     value = alb.cc_judge()
     com_id = value[0]
     com_name = value[1]
     key = dk().search_key(com_name)
     count = 0
     index_url = 'https://www.qcc.com'
     page_url = f'{index_url}/company_getinfos?unique={com_id}&companyname={key}&p={page}&tab=run'
     hds = gh().header()
     hds.update({'Referer': f'{index_url}/firm_{com_id}.html'})
     time.sleep(random.randint(3, 5))  # throttle requests
     res = requests.get(page_url, headers=hds).text
     # The three markers below identify block/redirect pages; the run is
     # paused until the operator presses Enter.
     if '<script>window.location.href' in res:
         print('访问频繁,需验证!{cc_judge}')
         input('暂停')
     elif '<script>location.href="/user_login"</script>' in res:
         print('Cookie失效,需更换!{cc_judge}')
         input('程序暂停运行!')
     elif '您的账号访问超频,请稍后访问或联系客服人员' in res:
         print('账号访问超频,请更换账号!{cc_judge}')
         input('程序暂停运行!')
     else:
         tree = etree.HTML(res)
         # Skip the first two <tr> of the table (position()>2).
         content_li = tree.xpath(
             '//div[@class="tcaption"]/span[contains(text(),"[信用中国]")]/parent::div/following-sibling::table[@class="ntable ntable-odd"]/tr[position()>2]'
         )
         for content in content_li:
             count += 1
             try:
                 license_num = content.xpath('td[1]/text()')[0]
                 dec_book_num = content.xpath('td[2]/text()')[0]
                 license_office = content.xpath('td[3]/text()')[0]
                 dec_date = content.xpath('td[4]/text()')[0]
                 time.sleep(random.randint(1, 2))
                 # The detail ID is the argument of the xzxukeView("...")
                 # onclick handler.
                 dt_id = content.xpath(
                     'td[5]/a[@class="xzxukeView"]/@onclick')[0].split(
                         'xzxukeView("')[1].split('")')[0]
                 dt_url = 'https://www.qcc.com/company_xzxukeView'
                 para = {'id': f'{dt_id}'}
                 res_info = requests.post(dt_url, headers=hds,
                                          data=para).text
                 payload = json.loads(res_info)  # parse once, use twice
                 status = payload['status']
                 if status == 200:
                     data = payload['data']
                     project_name = data['name']
                     license_status = data['status']
                     license_content = data['content']
                     expire_time = data['expire_time']
                     approval_category = data['type']
                     area = data['province']
                 else:
                     print(f'响应失败!\n状态码:{status}')
                     input('程序暂停运行!')
                     # Do not report the previous row's details for this row.
                     project_name = None
                     license_status = None
                     license_content = None
                     expire_time = None
                     approval_category = None
                     area = None
             except Exception:
                 # Best effort: any parsing/network/JSON failure yields an
                 # all-None record. Exception (not a bare except) keeps
                 # Ctrl-C working.
                 license_num = None
                 dec_book_num = None
                 license_office = None
                 dec_date = None
                 dt_id = None
                 project_name = None
                 license_status = None
                 license_content = None
                 expire_time = None
                 approval_category = None
                 area = None
             print('\n{0}--总第{1}条----{2}/{3}页--{0}\n'.format(
                 '-' * 9, count, page, count_page))
             create_time = tm().get_localtime()  # current time
             print(f'当前时间:{create_time}')
             print(
                 f'公司ID:{com_id}\n序号:{license_num}\n决定文书号:{dec_book_num}\n许可机关:{license_office}\n详情ID:{dt_id}\n'
                 f'决定日期:{dec_date}\n项目名称:{project_name}\n许可状态:{license_status}\n许可内容:{license_content}\n截止时间:{expire_time}\n'
                 f'审批类别:{approval_category}\n地域:{area}\n创建/入库时间:{create_time}'
             )
             input('Pause')
    def get_page_info(self):
        """Crawl every licence list page of one company and store each row.

        The company and its page count come from
        ``AdmLicenseBc().get_page_count()``. Every table row is printed,
        inserted into ``com_credit_adm_license_bc`` and followed by an update
        that sets ``status_credit_adm_license_bc = 1`` for the company.
        A completion banner is printed at the end.
        """
        alb = AdmLicenseBc()
        value = alb.get_page_count()
        com_id = value[0]
        com_name = value[1]
        count_page = value[2]
        key = dk().search_key(com_name)
        count = 0
        for page in range(1, count_page + 1):
            index_url = 'https://www.qcc.com'
            page_url = f'{index_url}/company_getinfos?unique={com_id}&companyname={key}&p={page}&tab=run&box=licens'
            hds = gh().header()
            hds.update({'Referer': f'{index_url}/firm_{com_id}.html'})
            time.sleep(random.randint(1, 2))  # throttle requests
            res = requests.get(page_url, headers=hds).text
            # The three markers below identify block/redirect pages; the run
            # is paused until the operator presses Enter, then the next page
            # is requested.
            if '<script>window.location.href' in res:
                print('访问频繁,需验证!{get_page_info[2]}')
                input('暂停')
            elif '<script>location.href="/user_login"</script>' in res:
                print('Cookie失效,需更换!{get_page_info[2]}')
                input('程序暂停运行!')
            elif '您的账号访问超频,请稍后访问或联系客服人员' in res:
                print('账号访问超频,请更换账号!{get_page_info[2]}')
                input('程序暂停运行!')
            else:
                tree = etree.HTML(res)
                # Skip the first two <tr> of the table (position()>2).
                content_li = tree.xpath(
                    '//table[@class="ntable ntable-odd"]/tr[position()>2]')
                for content in content_li:
                    count += 1
                    try:
                        license_num = content.xpath('td[1]/text()')[0]
                        license_doc_num = content.xpath('td[2]/text()')[0]
                        license_doc_name = content.xpath('td[3]/text()')[0]
                        valid_period_from = content.xpath('td[4]/text()')[0]
                        valid_period_to = content.xpath('td[5]/text()')[0]
                        license_office = content.xpath('td[6]/text()')[0]
                        license_content = content.xpath('td[7]/text()')[0]
                    except IndexError:
                        # A missing cell blanks the whole row.
                        license_num = None
                        license_doc_num = None
                        license_doc_name = None
                        valid_period_from = None
                        valid_period_to = None
                        license_office = None
                        license_content = None

                    print('\n{0}--总第{1}条----{2}/{3}页--{0}\n'.format(
                        '-' * 9, count, page, count_page))
                    create_time = tm().get_localtime()  # current time
                    print(f'当前时间:{create_time}')
                    print(
                        f'公司ID:{com_id}\n序号:{license_num}\n许可文件编号:{license_doc_num}\n许可文件名称:{license_doc_name}\n有效期自:{valid_period_from}\n'
                        f'有效期至:{valid_period_to}\n许可机关:{license_office}\n许可内容:{license_content}'
                    )
                    if license_num is None:
                        # NOTE(review): an unparsable row is stored as an
                        # all-NULL record (com_id included) -- confirm this
                        # is intended rather than skipping the row.
                        ins = """
                        INSERT INTO
                        `com_credit_adm_license_bc`
                        (`com_id`,`license_num`,`license_doc_num`,`license_doc_name`,`valid_period_from`,
                        `valid_period_to`,`license_office`,`license_content`,`create_time`)
                        VALUES
                        (NULL,NULL,NULL,NULL,NULL,
                        NULL,NULL,NULL,NULL);
                        """
                    else:
                        # NOTE(review): scraped values are interpolated into
                        # the SQL text unescaped; switch to bound parameters
                        # if the db helper supports them.
                        ins = f"""
                        INSERT INTO
                        `com_credit_adm_license_bc`
                        (`com_id`,`license_num`,`license_doc_num`,`license_doc_name`,`valid_period_from`,
                        `valid_period_to`,`license_office`,`license_content`,`create_time`)
                        VALUES
                        ("{com_id}","{license_num}","{license_doc_num}","{license_doc_name}","{valid_period_from}",
                        "{valid_period_to}","{license_office}","{license_content}","{create_time}");
                        """
                    db().inssts(ins)

                    # Mark the company as processed (repeated for every row).
                    upd = f"""
                        UPDATE 
                        `com_info` 
                        SET
                        `status_credit_adm_license_bc` = 1
                        WHERE 
                        `com_id` = "{com_id}" ;
                        """
                    db().updsts(upd)

        localtime = tm().get_localtime()  # current time
        print('\n{1}\n{0}数据采集完成!{0}\n{1}'.format('+' * 7, '+' * 25))
        print(f'当前时间:{localtime}\n')
        time.sleep(3)
示例#11
0
    def get_page_info(self):
        """Crawl every patent list page of one company and store each patent.

        The company, its page count and the site root URL come from
        ``PatentInfo().get_page_count()``. For each list row the patent's
        detail page is downloaded, the detail fields are read, the record is
        printed and inserted into ``com_patent``, and ``status_patent`` is set
        to 1 for the company. Nothing happens when the returned company ID is
        ``None``.

        Raises:
            IndexError: If a list row or detail page lacks a mandatory field.
        """
        def detail(tree, label, tail='/text()'):
            # XPath hits for the detail-table cell that follows the cell
            # whose text contains `label`.
            return tree.xpath(
                '//table[@class="ntable"]/tbody/tr/td[contains(text(),'
                f'"{label}")]/following-sibling::td[1]{tail}')

        pt = PatentInfo()
        value = pt.get_page_count()
        com_id = value[0]
        com_name = value[1]
        count_page = value[2]

        # Temporary override for a one-off backfill run [001]:
        # com_id = 'x697654f34422233895571cf26e42268'
        # com_name = '青岛科技大学'
        # count_page = 500

        if com_id is not None:
            key = dk().search_key(com_name)
            index_url = value[3]
            count = 0
            for page in range(1, count_page + 1):
                page_url = f'{index_url}/company_getinfos?unique={com_id}&companyname={key}&p={page}&tab=assets&box=zhuanli'
                hds = gh().header()
                hds.update({'Referer': f'{index_url}/firm_{com_id}.html'})
                time.sleep(random.randint(1, 2))  # throttle requests
                res_pg = requests.get(page_url, headers=hds).text
                # The three markers below identify block/redirect pages; the
                # run is paused until the operator presses Enter.
                if '<script>window.location.href' in res_pg:
                    print('访问频繁,需验证!{get_page_info[1]}')
                    input('暂停')
                elif '<script>location.href="/user_login"</script>' in res_pg:
                    print('Cookie失效,需更换!{get_page_info[1]}')
                    input('程序暂停运行!')
                elif '您的账号访问超频,请稍后访问或联系客服人员' in res_pg:
                    print('账号访问超频,请更换账号!{get_page_info[1]}')
                    input('程序暂停运行!')
                else:
                    tree_pg = etree.HTML(res_pg)
                    # Skip the first <tr> of each table (position()>1).
                    content_li = tree_pg.xpath('//table/tr[position()>1]')
                    for content in content_li:
                        count += 1
                        patent_num = content.xpath('td[1]/text()')[0]
                        patent_type = content.xpath('td[2]/text()')[0]
                        patent_pub_num = content.xpath('td[3]/text()')[0]
                        patent_pub_date = content.xpath('td[4]/text()')[0]
                        patent_name = content.xpath(
                            'td[5]/a/text()')[0].strip()
                        patent_link = content.xpath('td[5]/a/@href')[0]
                        # The patent ID is whatever follows "_com_" in the link.
                        patent_id = patent_link.split('_com_')[1]
                        patent_url = ''.join((index_url, patent_link))
                        time.sleep(random.randint(1, 3))
                        res_dt = requests.get(patent_url, headers=hds).text
                        if '<script>window.location.href' in res_dt:
                            print('访问频繁,需验证!{get_page_info[2]}')
                            input('暂停')
                        elif '<script>location.href="/user_login"</script>' in res_dt:
                            print('Cookie失效,需更换!{get_page_info[2]}')
                            input('程序暂停运行!')
                        elif '您的账号访问超频,请稍后访问或联系客服人员' in res_dt:
                            print('账号访问超频,请更换账号!{get_page_info[2]}')
                            input('程序暂停运行!')
                        else:
                            tree_dt = etree.HTML(res_dt)
                            app_num = detail(tree_dt, '申请号')[0].strip()
                            app_date = detail(tree_dt, '申请日')[0].strip()
                            prio_date = detail(tree_dt, '优先权日')[0].strip()
                            prio_num = detail(tree_dt, '优先权号')[0].strip()
                            inventor = detail(tree_dt, '发明人')[0].strip()
                            # Applicant and agency are links when available,
                            # plain text otherwise. The applicant fallback
                            # used to select the <td> element itself and call
                            # .strip() on it, which raised AttributeError.
                            try:
                                applicant = detail(
                                    tree_dt, '申请(专利权)人', '/a/text()')[0].strip()
                            except IndexError:
                                applicant = detail(
                                    tree_dt, '申请(专利权)人')[0].strip()
                            try:
                                agency = detail(
                                    tree_dt, '代理机构', '/a/text()')[0].strip()
                            except IndexError:
                                agency = detail(tree_dt, '代理机构')[0].strip()
                            agent = detail(tree_dt, '代理人')[0].strip()
                            ipc = detail(tree_dt, 'IPC分类号')[0].strip().replace(
                                ' ', '').replace('\n', '')
                            cpc = detail(tree_dt, 'CPC分类号')[0].strip()
                            app_address = detail(tree_dt, '申请人地址')[0].strip()
                            app_zip_code = detail(tree_dt, '申请人邮编')[0].strip()
                            try:
                                abstract = detail(tree_dt, '摘要')[0].strip()
                            except IndexError:
                                # No direct text node: take the string value
                                # of the following cell(s) instead.
                                abstract = tree_dt.xpath(
                                    'string(//table[@class="ntable"]/tbody/tr/td[contains(text(),"摘要")]/following-sibling::td)'
                                ).strip()
                            try:
                                abstract_photo = detail(
                                    tree_dt, '摘要附图', '/img/@src')[0].strip()
                            except IndexError:
                                abstract_photo = '-'
                            # xpath() returns a (possibly empty) list here, so
                            # joining cannot fail and needs no try/except.
                            # Double quotes are swapped for single quotes
                            # because the text is embedded in a double-quoted
                            # SQL literal below.
                            claim = ''.join(tree_dt.xpath(
                                '//table[@class="ntable"]/tr/td[@class="ea_instructions" and position()=1]/p/text()'
                            )).replace('"', "'")
                            instructions = ''.join(tree_dt.xpath(
                                '//div[@class="tcaption"]/h3[text()="说明书"]/parent::div/following-sibling::table[@class="ntable"]/tr/td[@class="ea_instructions"]/h1/text()|//div[@class="tcaption"]/h3[text()="说明书"]/parent::div/following-sibling::table[@class="ntable"]/tr/td[@class="ea_instructions"]/h2/text()|//div[@class="tcaption"]/h3[text()="说明书"]/parent::div/following-sibling::table[@class="ntable"]/tr/td[@class="ea_instructions"]/p/text()'
                            ))
                            print('\n{0}--总第{1}条----{2}/{3}页--{0}\n'.format(
                                '-' * 9, count, page, count_page))
                            localtime = tm().get_localtime()  # current time
                            create_time = localtime
                            print(f'公司ID:{com_id} 当前时间:{localtime}')
                            print(f'公司名称:{com_name}\n专利ID:{patent_id}')
                            print(
                                f'序号:{patent_num}\n专利类型:{patent_type}\n公开(公告)号:{patent_pub_num}\n公开(公告)日期:{patent_pub_date}\n专利名称:{patent_name}\n'
                                f'专利页URL:{patent_url}\n申请号:{app_num}\n申请日期:{app_date}\n优先权日:{prio_date}\n优先权号:{prio_num}\n'
                                f'发明人:{inventor}\n申请(专利权)人:{applicant}\n代理机构:{agency}\n代理人:{agent}\nIPC分类号:{ipc}\n'
                                f'CPC分类号:{cpc}\n申请人地址:{app_address}\n申请人邮编:{app_zip_code}\n摘要:{abstract}\n摘要附图:{abstract_photo}\n'
                                f'权利要求:{claim}\n说明书:{instructions}\n')
                            # NOTE(review): apart from `claim`, scraped values
                            # are interpolated unescaped; switch to bound
                            # parameters if the db helper supports them.
                            ins = f"""
                            INSERT INTO  
                            `com_patent`
                            (`com_id`,`patent_num`,`patent_type`,`patent_pub_num`,`patent_pub_date`,
                            `patent_name`,`patent_url`,`app_num`,`app_date`,`prio_date`,
                            `prio_num`,`inventor`,`applicant`,`agency`,`agent`,
                            `ipc`,`cpc`,`app_address`,`app_zip_code`,`abstract`,`abstract_photo`,
                            `claim`,`instructions`,`create_time`,`patent_id`)
                            VALUES 
                            ("{com_id}","{patent_num}","{patent_type}","{patent_pub_num}","{patent_pub_date}",
                            "{patent_name}","{patent_url}","{app_num}","{app_date}","{prio_date}",
                            "{prio_num}","{inventor}","{applicant}","{agency}","{agent}",
                            "{ipc}","{cpc}","{app_address}","{app_zip_code}","{abstract}","{abstract_photo}",
                            "{claim}","{instructions}","{create_time}","{patent_id}");
                            """
                            db().inssts(ins)

                            # Mark the company as processed (repeated for
                            # every stored patent).
                            upd = f"""
                            UPDATE 
                            `com_info` 
                            SET
                            `status_patent` = 1
                            WHERE 
                            `com_id` = "{com_id}" ;
                            """
                            db().updsts(upd)
            localtime = tm().get_localtime()  # current time
            print('\n{1}\n{0}数据采集完成!{0}\n{1}'.format('+' * 7, '+' * 25))
            print(f'当前时间:{localtime}\n')
            time.sleep(3)
示例#12
0
    def get_page_info(self):  # Parse the list pages and every record's detail page
        """Scrape one company's ``box=zhixing`` records and store them in the database.

        The company id, company name and page count are taken from
        ``Credit().get_page_count()``.  Each list page is fetched, each table
        row is parsed, the row's detail page is fetched for two extra fields,
        and one row is inserted into ``com_credit_execued``.  After every
        insert ``com_info.status_credit_execued`` is set to 1 for the company.

        A row that cannot be parsed is stored as an all-NULL placeholder row.
        Returns None.
        """

        def blocked(html):
            """Report a block/login response and wait for the operator.

            Returns True when ``html`` is one of the three known block pages
            (the printed messages name them: verification required, cookie
            expired, account over the rate limit), False otherwise.
            """
            if '<script>window.location.href' in html:
                print('访问频繁,需验证!{get_page_info}')
                input('暂停')
            elif '<script>location.href="/user_login"</script>' in html:
                print('Cookie失效,需更换!{get_page_info}')
                input('程序暂停运行!')
            elif '您的账号访问超频,请稍后访问或联系客服人员' in html:
                print('账号访问超频,请更换账号!{get_page_info}')
                input('程序暂停运行!')
            else:
                return False
            return True

        cd = Credit()
        value = cd.get_page_count()
        com_id = value[0]
        com_name = value[1]
        count_page = value[2]
        key = dk().search_key(com_name)
        index_url = 'https://www.qichacha.com'
        count = 0  # running record number across all pages
        for page in range(1, count_page + 1):
            page_url = f'{index_url}/company_getinfos?unique={com_id}&companyname={key}&p={page}&tab=susong&box=zhixing'
            hds = gh().header()
            hds.update({'Referer': f'{index_url}/firm_{com_id}.html'})
            time.sleep(random.randint(1, 2))  # throttle between requests
            res_pg = requests.get(page_url, headers=hds).text
            if blocked(res_pg):
                # Nothing to parse on a block page; move on to the next page.
                continue
            tree_pg = etree.HTML(res_pg)
            content_li = tree_pg.xpath(
                '//table[@class="ntable ntable-odd"]/tr[position()>2]')
            for content in content_li:
                count += 1
                # Detail-page fields stay None when the detail request is
                # answered with a block page, instead of being unbound or
                # carrying over the previous row's values.
                exec_person = None
                occ = None
                try:
                    exec_num = content.xpath('td[1]/text()')[0]
                    case_num = content.xpath('td[2]/a/text()')[0]
                    # The case id is the quoted argument that follows
                    # 'zhixing",' in the link's onclick handler.
                    case_id = content.xpath(
                        'td[2]/a[contains(@onclick,"showRelatModal")]/@onclick'
                    )[0].split('zhixing",')[1].split('"')[1]
                    case_url = f'{index_url}/company_zhixingRelat?id={case_id}'
                    filing_time = content.xpath('td[3]/text()')[0]
                    court_of_exec = content.xpath('td[4]/text()')[0]
                    exec_obj = content.xpath('td[5]/text()')[0]
                    time.sleep(random.randint(1, 2))
                    res_info = requests.get(case_url, headers=hds).text
                    if not blocked(res_info):
                        tree_info = etree.HTML(res_info)
                        exec_person = tree_info.xpath(
                            '//table/tbody/tr[1]/td[2]/text()')[0]
                        occ = tree_info.xpath(
                            '//table/tbody/tr[1]/td[4]/text()')[0]
                except Exception:
                    # Best effort: any parse/request failure blanks the whole
                    # record.  KeyboardInterrupt/SystemExit are no longer
                    # swallowed, so the run can still be aborted.
                    exec_num = None
                    case_num = None
                    case_id = None
                    case_url = None
                    filing_time = None
                    court_of_exec = None
                    exec_obj = None
                    exec_person = None
                    occ = None
                print('\n{0}--总第{1}条----{2}/{3}页--{0}\n'.format(
                    '-' * 9, count, page, count_page))
                create_time = tm().get_localtime()  # current time
                print(f'当前时间:{create_time}')
                print(
                    f'公司ID:{com_id}\n序号:{exec_num}\n案号:{case_num}\n案例ID:{case_id}\n案例链接:{case_url}\n'
                    f'立案时间:{filing_time}\n执行法院:{court_of_exec}\n执行标的:{exec_obj}\n被执行人:{exec_person}\n身份证号/组织机构代码:{occ}\n'
                )
                if exec_num is None:
                    # Placeholder row: ten NULLs for the ten listed columns
                    # (the statement previously supplied only nine values).
                    ins = """
                    INSERT INTO
                    `com_credit_execued`
                    (`com_id`,`exec_num`,`case_num`,`case_id`,`filing_time`,
                    `court_of_exec`,`exec_obj`,`exec_person`,`occ`,`create_time`)
                    VALUES
                    (NULL,NULL,NULL,NULL,NULL,
                    NULL,NULL,NULL,NULL,NULL);
                    """
                else:
                    # NOTE(review): values are interpolated straight into the
                    # SQL text; a scraped value containing '"' breaks the
                    # statement.  Switch to parameters if db() supports them.
                    ins = f"""
                    INSERT INTO 
                    `com_credit_execued`
                    (`com_id`,`exec_num`,`case_num`,`case_id`,`filing_time`,
                    `court_of_exec`,`exec_obj`,`exec_person`,`occ`,`create_time`)
                    VALUES 
                    ("{com_id}","{exec_num}","{case_num}","{case_id}","{filing_time}",
                    "{court_of_exec}","{exec_obj}","{exec_person}","{occ}","{create_time}");
                    """
                db().inssts(ins)

                upd = f"""
                UPDATE 
                `com_info` 
                SET
                `status_credit_execued` = 1
                WHERE 
                `com_id` = "{com_id}" ;
                """
                db().updsts(upd)

        localtime = tm().get_localtime()  # current time
        print('\n{1}\n{0}数据采集完成!{0}\n{1}'.format('+' * 7, '+' * 25))
        print(f'当前时间:{localtime}\n')
        time.sleep(3)
示例#13
0
    def get_page_info(self):  # Fetch and store the website records
        """Scrape one company's ``box=website`` records and store them.

        ``WebSite().get_page_count()`` supplies ``(com_id, com_name,
        count_page, index_url)``.  When ``com_id`` is None nothing is done.
        Otherwise every list page is fetched, every table row becomes one
        insert into ``com_web``, and ``com_info.status_web`` is set to 1 for
        the company after each insert.  Returns None.
        """
        ws = WebSite()
        value = ws.get_page_count()
        com_id = value[0]
        com_name = value[1]
        count_page = value[2]
        if com_id is None:
            return

        key = ws.dk.search_key(com_name)
        index_url = value[3]
        count = 0  # running record number across all pages
        for page in range(1, count_page + 1):
            page_url = f'{index_url}/company_getinfos?unique={com_id}&companyname={key}&p={page}&tab=assets&box=website'
            hds = ws.gh.header()
            hds.update({'Referer': f'{index_url}/firm_{com_id}.html'})
            time.sleep(random.randint(1, 2))  # throttle between requests
            res_pg = requests.get(page_url, headers=hds).text
            # NOTE(review): assumes verify() always returns a parsed tree;
            # confirm it cannot return None here.
            tree_pg = ws.gm.verify(res_pg)
            content_li = tree_pg.xpath('//table/tr[position()>1]')
            for content in content_li:
                count += 1
                web_num = content.xpath('td[1]/text()')[0]
                web_name = content.xpath('td[2]/text()')[0]
                # No link -> '-', one link -> its text, several links -> the
                # whole list (stored via its str() form).
                web_site = content.xpath('td[3]/a/text()')
                if not web_site:
                    web_site = '-'
                elif len(web_site) == 1:
                    web_site = web_site[0]
                domain_parts = content.xpath('td[4]/text()')[0].split('\n')
                if len(domain_parts) > 2:
                    # Several lines: keep every non-empty line, stripped.
                    domain_name = [
                        part.strip() for part in domain_parts if part != ''
                    ]
                else:
                    # One or two pieces: the value is the last piece.  Using
                    # [-1] instead of [1] avoids an IndexError when the cell
                    # text contains no newline at all.
                    domain_name = domain_parts[-1].strip()
                icp = content.xpath('td[5]/text()')[0].strip()
                approved_date = content.xpath('td[6]/text()')[0]
                print('\n{0}--总第{1}条----{2}/{3}页--{0}\n'.format(
                    '-' * 9, count, page, count_page))
                localtime = ws.tm.get_localtime()  # current time
                create_time = localtime
                print(f'公司ID:{com_id} 当前时间:{localtime}')
                print(f'公司名称:{com_name}\n序号:{web_num}')
                print(
                    f'网站名称:{web_name}\n网址:{web_site}\n域名:{domain_name}\n网站备案/许可证号:{icp}\n审核日期:{approved_date}\n'
                )
                # NOTE(review): values are interpolated straight into the SQL
                # text; a scraped value containing '"' breaks the statement.
                ins = f"""
                INSERT INTO
                `com_web`
                (`com_id`,`web_num`,`web_name`,`web_site`,`domain_name`,
                `icp`,`approved_date`,`create_time`)
                VALUES
                ("{com_id}","{web_num}","{web_name}","{web_site}","{domain_name}",
                "{icp}","{approved_date}","{create_time}");
                """
                db().inssts(ins)

                upd = f"""
                UPDATE
                `com_info`
                SET
                `status_web` = 1
                WHERE
                `com_id` = "{com_id}" ;
                """
                db().updsts(upd)
        localtime = ws.tm.get_localtime()  # current time
        print('\n{1}\n{0}数据采集完成!{0}\n{1}'.format('+' * 7, '+' * 25))
        print(f'当前时间:{localtime}\n')
        time.sleep(3)
示例#14
0
 def __init__(self):
     """Create one instance of each helper and keep it on the object.

     The helpers are built in the order db, dk, gh, gm, tm and stored under
     attributes of the same names.
     """
     # NOTE(review): the roles below are presumed from how same-named
     # attributes are used in sibling scrapers -- confirm against the
     # helper definitions.
     self.db = db()  # presumably runs the INSERT/UPDATE statements
     self.dk = dk()  # presumably builds the search key from a company name
     self.gh = gh()  # presumably produces the HTTP request headers
     self.gm = gm()  # presumably validates a response and parses it
     self.tm = tm()  # presumably provides the current local time
示例#15
0
    def get_page_info(self):  # Fetch and store the trademark records
        """Scrape one company's ``box=shangbiao`` records and store them.

        ``TradeMarkInfo().get_page_count()`` supplies ``(com_id, com_name,
        count_page, index_url)``.  When ``com_id`` is None nothing is done.
        Otherwise every list page is fetched; for every table row the linked
        detail page is fetched as well, the combined fields are inserted into
        ``com_trademark`` and ``com_info.status_tm`` is set to 1 for the
        company.  Returns None.

        Raises:
            IndexError: when a list column or a detail field (other than the
                agency link) is missing from the page.
        """

        def detail(tree, label, leaf='text()'):
            """Return the stripped first ``leaf`` of the cell right after the
            detail-table cell whose text contains ``label``."""
            return tree.xpath(
                f'//table[@class="ntable"]/tbody/tr/td[contains(text(),"{label}")]/following-sibling::td[1]/{leaf}'
            )[0].strip()

        tmi = TradeMarkInfo()
        value = tmi.get_page_count()
        com_id = value[0]
        com_name = value[1]
        count_page = value[2]
        if com_id is None:
            return

        key = tmi.dk.search_key(com_name)
        index_url = value[3]
        count = 0  # running record number across all pages
        for page in range(1, count_page + 1):
            page_url = f'{index_url}/company_getinfos?unique={com_id}&companyname={key}&p={page}&tab=assets&box=shangbiao'
            hds = tmi.gh.header()
            hds.update({'Referer': f'{index_url}/firm_{com_id}.html'})
            time.sleep(random.randint(1, 2))  # throttle between requests
            res_tmi = requests.get(page_url, headers=hds).text
            tree_tmi = etree.HTML(res_tmi)
            content_li = tree_tmi.xpath('//table/tr[position()>1]')
            for content in content_li:
                count += 1
                # Columns of the list table.
                tm_num = content.xpath('td[1]/text()')[0]
                tm_logo_url = content.xpath('td[2]/img/@src')[0]
                tm_name = content.xpath('td[3]/text()')[0]
                tm_status = content.xpath('td[4]/text()')[0]
                app_date = content.xpath('td[5]/text()')[0]
                tm_regno = content.xpath('td[6]/text()')[0]
                tm_int_type = content.xpath('td[7]/text()')[0]
                trademark_link = content.xpath('td[8]/a/@href')[0]
                trademark_url = ''.join((index_url, trademark_link))
                time.sleep(random.randint(1, 3))
                res_dt = requests.get(trademark_url, headers=hds).text
                tree_dt = etree.HTML(res_dt)
                # Fields of the detail page, looked up by their label cell.
                sim_groups = detail(tree_dt, '类似群')
                app_cn = detail(tree_dt, '申请人名称(中文)')
                app_en = detail(tree_dt, '申请人名称(英文)')
                app_addr_cn = detail(tree_dt, '申请人地址(中文)')
                app_addr_en = detail(tree_dt, '申请人地址(英文)')
                first_trial_no = detail(tree_dt, '初审公告期号')
                first_trial_date = detail(tree_dt, '初审公告日期').replace(
                    ' ', '').replace('\n', '')
                reg_not_peri_no = detail(tree_dt, '注册公告期号')
                reg_not_peri_date = detail(tree_dt, '注册公告日期')
                is_comm_tm = detail(tree_dt, '是否共有商标')
                tm_type = detail(tree_dt, '商标类型')
                exclu_right_limit = detail(tree_dt, '专用权期限')
                tm_form = detail(tree_dt, '商标形式')
                int_reg_date = detail(tree_dt, '国际注册日期')
                later_scheduled_date = detail(tree_dt, '后期指定日期')
                prio_date = detail(tree_dt, '优先权日期')
                try:
                    # The agency is read from a link when there is one ...
                    agency = detail(tree_dt, '代理/办理机构', 'a/text()')
                except IndexError:
                    # ... and from the cell's own text otherwise.
                    agency = detail(tree_dt, '代理/办理机构')
                service = detail(tree_dt, '商品/服务')
                print('\n{0}--总第{1}条----{2}/{3}页--{0}\n'.format(
                    '-' * 9, count, page, count_page))
                localtime = tm().get_localtime()  # current time
                create_time = localtime
                print(f'当前时间:{localtime}')
                print(f'公司ID:{com_id}\n公司名称:{com_name}')
                print(
                    f'序号:{tm_num}\n商标LOGO URL:{tm_logo_url}\n商标名称:{tm_name}\n商标状态:{tm_status}\n申请时间:{app_date}\n'
                    f'申请/注册号:{tm_regno}\n国际类型:{tm_int_type}\n类似群:{sim_groups}\n申请人名称(中文):{app_cn}\n申请人名称(英文):{app_en}\n'
                    f'申请人地址(中文):{app_addr_cn}\n申请人地址(英文):{app_addr_en}\n初审公告期号:{first_trial_no}\n初审公告日期:{first_trial_date}\n注册公告期号:{reg_not_peri_no}\n'
                    f'注册公告日期:{reg_not_peri_date}\n是否共有商标:{is_comm_tm}\n商标类型:{tm_type}\n专用权期限:{exclu_right_limit}\n商标形式:{tm_form}\n'
                    f'国际注册日期:{int_reg_date}\n后期指定日期:{later_scheduled_date}\n优先权日期:{prio_date}\n代理机构:{agency}\n商品/服务:{service}'
                )
                # NOTE(review): values are interpolated straight into the SQL
                # text; a scraped value containing '"' breaks the statement.
                ins = f"""
                INSERT INTO  
                `com_trademark`
                (`com_id`,`tm_num`,`tm_logo_url`,`tm_name`,`tm_status`,
                `app_date`,`tm_regno`,`tm_int_type`,`sim_groups`,`app_cn`,
                `app_en`,`app_addr_cn`,`app_addr_en`,`first_trial_no`,`first_trial_date`,
                `reg_not_peri_no`,`reg_not_peri_date`,`is_comm_tm`,`tm_type`,`exclu_right_limit`,
                `tm_form`,`int_reg_date`,`later_scheduled_date`,`prio_date`,`agency`,
                `service`,`create_time`)
                VALUES 
                ("{com_id}","{tm_num}","{tm_logo_url}","{tm_name}","{tm_status}",
                "{app_date}","{tm_regno}","{tm_int_type}","{sim_groups}","{app_cn}",
                "{app_en}","{app_addr_cn}","{app_addr_en}","{first_trial_no}","{first_trial_date}",
                "{reg_not_peri_no}","{reg_not_peri_date}","{is_comm_tm}","{tm_type}","{exclu_right_limit}",
                "{tm_form}","{int_reg_date}","{later_scheduled_date}","{prio_date}","{agency}",
                "{service}","{create_time}");
                """
                db().inssts(ins)

                upd = f"""
                UPDATE 
                `com_info` 
                SET
                `status_tm` = 1
                WHERE 
                `com_id` = "{com_id}" ;
                """
                db().updsts(upd)
        localtime = tm().get_localtime()  # current time
        print('\n{1}\n{0}数据采集完成!{0}\n{1}'.format('+' * 7, '+' * 25))
        print(f'当前时间:{localtime}')
示例#16
0
    def get_page_info(self):  # Fetch and store the software-copyright records
        """Scrape one company's ``box=rjzzq`` records and store them.

        ``CprOfSoft().get_page_count()`` supplies ``(com_id, com_name,
        count_page, index_url)``.  When ``com_id`` is None nothing is done.
        Otherwise every list page is fetched, every table row becomes one
        insert into ``com_cpr_of_soft``, and ``com_info.status_cpr_of_soft``
        is set to 1 for the company after each insert.  Returns None.

        Raises:
            IndexError: when a column other than the version number (td[3])
                has no text.
        """
        cos = CprOfSoft()
        value = cos.get_page_count()
        com_id = value[0]
        com_name = value[1]
        count_page = value[2]
        if com_id is None:
            return

        key = cos.dk.search_key(com_name)
        index_url = value[3]
        count = 0  # running record number across all pages
        for page in range(1, count_page + 1):
            page_url = f'{index_url}/company_getinfos?unique={com_id}&companyname={key}&p={page}&tab=assets&box=rjzzq'
            hds = cos.gh.header()
            hds.update({'Referer': f'{index_url}/firm_{com_id}.html'})
            time.sleep(random.randint(1, 2))  # throttle between requests
            res_pg = requests.get(page_url, headers=hds).text
            # NOTE(review): assumes verify() always returns a parsed tree;
            # confirm it cannot return None here.
            tree_pg = cos.gm.verify(res_pg)
            content_li = tree_pg.xpath('//table/tr[position()>1]')
            for content in content_li:
                count += 1
                soft_num = content.xpath('td[1]/text()')[0]
                soft_name = content.xpath('td[2]/text()')[0]
                try:
                    soft_ver_no = content.xpath('td[3]/text()')[0]
                except IndexError:
                    # An empty version cell is stored as '-'.
                    soft_ver_no = '-'
                soft_pub_date = content.xpath('td[4]/text()')[0].strip()
                soft_short_name = content.xpath('td[5]/text()')[0].strip()
                soft_reg_no = content.xpath('td[6]/text()')[0]
                reg_approval_date = content.xpath('td[7]/text()')[0]
                print('\n{0}--总第{1}条----{2}/{3}页--{0}\n'.format(
                    '-' * 9, count, page, count_page))
                # Reuse the instance's time helper, as the end of this
                # method does, instead of building a new one per row.
                localtime = cos.tm.get_localtime()  # current time
                create_time = localtime
                print(f'公司ID:{com_id} 当前时间:{localtime}')
                print(f'公司名称:{com_name}')
                print(
                    f'序号:{soft_num}\n软件名称:{soft_name}\n版本号:{soft_ver_no}\n发布日期:{soft_pub_date}\n软件简称:{soft_short_name}\n'
                    f'登记号:{soft_reg_no}\n登记批准号:{reg_approval_date}\n')
                # NOTE(review): values are interpolated straight into the SQL
                # text; a scraped value containing '"' breaks the statement.
                ins = f"""
                INSERT INTO  
                `com_cpr_of_soft`
                (`com_id`,`soft_num`,`soft_name`,`soft_ver_no`,`soft_pub_date`,
                `soft_short_name`,`soft_reg_no`,`reg_approval_date`,`create_time`)
                VALUES 
                ("{com_id}","{soft_num}","{soft_name}","{soft_ver_no}","{soft_pub_date}",
                "{soft_short_name}","{soft_reg_no}","{reg_approval_date}","{create_time}");
                """
                cos.db.inssts(ins)

                upd = f"""
                UPDATE 
                `com_info` 
                SET
                `status_cpr_of_soft` = 1
                WHERE 
                `com_id` = "{com_id}" ;
                """
                cos.db.updsts(upd)
        localtime = cos.tm.get_localtime()  # current time
        print('\n{1}\n{0}数据采集完成!{0}\n{1}'.format('+' * 7, '+' * 25))
        print(f'当前时间:{localtime}\n')
        time.sleep(3)
示例#17
0
    def get_page_info(self):  # Fetch and store the patent records
        """Scrape one company's ``box=zhuanli`` records and store them.

        ``PatentInfo().get_page_count()`` supplies ``(com_id, com_name,
        count_page, index_url)``.  When ``com_id`` is None nothing is done.
        Otherwise every list page is fetched; for every table row the linked
        detail page is fetched as well, the combined fields are inserted into
        ``com_patent`` and ``com_info.status`` is set to 1 for the company.
        Returns None.

        Raises:
            IndexError: when a list column or a required detail field is
                missing from the page.
        """

        def detail(tree, label, leaf='text()'):
            """Return the stripped first ``leaf`` of the cell right after the
            detail-table cell whose text contains ``label``."""
            return tree.xpath(
                f'//table[@class="ntable"]/tbody/tr/td[contains(text(),"{label}")]/following-sibling::td[1]/{leaf}'
            )[0].strip()

        pt = PatentInfo()
        value = pt.get_page_count()
        com_id = value[0]
        com_name = value[1]
        count_page = value[2]
        if com_id is None:
            return

        key = pt.search_key(com_name)
        index_url = value[3]
        count = 0  # running record number across all pages
        # Cell holding the text under the "说明书" caption on the detail page.
        spec_cell = (
            '//div[@class="tcaption"]/h3[text()="说明书"]/parent::div'
            '/following-sibling::table[@class="ntable"]/tr/td[@class="ea_instructions"]'
        )
        spec_xpath = '|'.join(
            f'{spec_cell}/{tag}/text()' for tag in ('h1', 'h2', 'p'))
        for page in range(1, count_page + 1):
            page_url = f'{index_url}/company_getinfos?unique={com_id}&companyname={key}&p={page}&tab=assets&box=zhuanli'
            hds = gh().header()
            hds.update({'Referer': f'{index_url}/firm_{com_id}.html'})
            time.sleep(random.randint(1, 2))  # throttle between requests
            res_pg = requests.get(page_url, headers=hds).text
            tree_pg = etree.HTML(res_pg)
            content_li = tree_pg.xpath('//table/tr[position()>1]')
            for content in content_li:
                count += 1
                # Columns of the list table.
                patent_num = content.xpath('td[1]/text()')[0]
                patent_type = content.xpath('td[2]/text()')[0]
                patent_pub_num = content.xpath('td[3]/text()')[0]
                patent_pub_date = content.xpath('td[4]/text()')[0]
                patent_name = content.xpath('td[5]/a/text()')[0].strip()
                patent_link = content.xpath('td[5]/a/@href')[0]
                patent_url = ''.join((index_url, patent_link))
                time.sleep(random.randint(1, 3))
                res_dt = requests.get(patent_url, headers=hds).text
                tree_dt = etree.HTML(res_dt)
                # Fields of the detail page, looked up by their label cell.
                app_num = detail(tree_dt, '申请号')
                app_date = detail(tree_dt, '申请日')
                prio_date = detail(tree_dt, '优先权日')
                prio_num = detail(tree_dt, '优先权号')
                inventor = detail(tree_dt, '发明人')
                try:
                    applicant = detail(tree_dt, '申请(专利权)人', 'a/text()')
                except IndexError:
                    # No link in the cell: read the cell's own text.  The
                    # fallback used to select the element itself, so
                    # .strip() raised AttributeError.
                    applicant = detail(tree_dt, '申请(专利权)人')
                try:
                    agency = detail(tree_dt, '代理机构', 'a/text()')
                except IndexError:
                    agency = detail(tree_dt, '代理机构')
                agent = detail(tree_dt, '代理人')
                ipc = detail(tree_dt, 'IPC分类号').replace(' ', '').replace(
                    '\n', '')
                cpc = detail(tree_dt, 'CPC分类号')
                app_address = detail(tree_dt, '申请人地址')
                app_zip_code = detail(tree_dt, '申请人邮编')
                abstract = detail(tree_dt, '摘要')
                try:
                    abstract_photo = detail(tree_dt, '摘要附图', 'img/@src')
                except IndexError:
                    # A missing abstract drawing is stored as '-'.
                    abstract_photo = '-'
                # Joining an xpath result list cannot fail, so the former
                # try/except around these two was dead code; an absent
                # section still yields ''.
                claim = ''.join(
                    tree_dt.xpath(
                        '//table[@class="ntable"]/tr/td[@class="ea_instructions" and position()=1]/p/text()'
                    ))
                instructions = ''.join(tree_dt.xpath(spec_xpath))
                print('\n{0}--总第{1}条----{2}/{3}页--{0}\n'.format(
                    '-' * 9, count, page, count_page))
                localtime = tm().get_localtime()  # current time
                print(f'公司ID:{com_id} 当前时间:{localtime}')
                print(
                    f'序号:{patent_num}\n专利类型:{patent_type}\n公开(公告)号:{patent_pub_num}\n公开(公告)日期:{patent_pub_date}\n专利名称:{patent_name}\n'
                    f'专利页URL:{patent_url}\n申请号:{app_num}\n申请日期:{app_date}\n优先权日:{prio_date}\n优先权号:{prio_num}\n'
                    f'发明人:{inventor}\n申请(专利权)人:{applicant}\n代理机构:{agency}\n代理人:{agent}\nIPC分类号:{ipc}\n'
                    f'CPC分类号:{cpc}\n申请人地址:{app_address}\n申请人邮编:{app_zip_code}\n摘要:{abstract}\n摘要附图:{abstract_photo}\n'
                    f'权利要求:{claim}\n说明书:{instructions}')
                # NOTE(review): values are interpolated straight into the SQL
                # text; a scraped value containing '"' breaks the statement.
                ins = f"""
                INSERT INTO  
                `com_patent`
                (`com_id`,`patent_num`,`patent_type`,`patent_pub_num`,`patent_pub_date`,
                `patent_name`,`patent_url`,`app_num`,`app_date`,`prio_date`,
                `prio_num`,`inventor`,`applicant`,`agency`,`agent`,
                `ipc`,`cpc`,`app_address`,`app_zip_code`,`abstract`,`abstract_photo`,
                `claim`,`instructions`)
                VALUES 
                ("{com_id}","{patent_num}","{patent_type}","{patent_pub_num}","{patent_pub_date}",
                "{patent_name}","{patent_url}","{app_num}","{app_date}","{prio_date}",
                "{prio_num}","{inventor}","{applicant}","{agency}","{agent}",
                "{ipc}","{cpc}","{app_address}","{app_zip_code}","{abstract}","{abstract_photo}",
                "{claim}","{instructions}");
                """
                db().inssts(ins)

                upd = f"""
                UPDATE 
                `com_info` 
                SET
                `status` = 1
                WHERE 
                `com_id` = "{com_id}" ;
                """
                db().updsts(upd)
        localtime = tm().get_localtime()  # current time
        print('\n{1}\n{0}数据采集完成!{0}\n{1}'.format('+' * 7, '+' * 25))
        print(f'当前时间:{localtime}')
示例#18
0
 def parse_info(self, tree, com_id, com_name, page, sh_page_count):
     """Parse one page of a shareholder table and insert every row.

     Each data row of the ``ntable ntable-odd npth`` table is turned into
     one ``INSERT INTO com_stockholder`` statement that is handed to
     ``self.db.inssts``. The table layout is not fixed: optional columns
     shift the position of everything to their right, so the column
     indexes are derived from the header names reported by
     ``StockHolder.verify_stockholder_args``.

     Args:
         tree: Parsed page supporting ``.xpath()``; ``None`` means there
             is nothing to parse and only a notice is printed.
         com_id: Company identifier stored with every row.
         com_name: Company name, used only in the progress output.
         page: Page number; the running row counter starts at
             ``(page - 1) * 50`` (assumes 50 rows per page and 1-based
             pages -- TODO confirm against the caller).
         sh_page_count: Total page count, used only in the progress output.

     Raises:
         IndexError: If an expected cell yields no text node for its
             XPath query.
     """

     def first_text(row, path):
         # Stripped first match of an XPath text query on one table row.
         return row.xpath(path)[0].strip()

     def cell_text(row, col, child='span'):
         # Text of column `col`; when the cell's own text is blank the value
         # is read from the nested `child` element instead.
         direct = first_text(row, f'td[{col}]/text()')
         if direct == '':
             return first_text(row, f'td[{col}]/{child}/text()')
         return direct

     sh = StockHolder()
     count = (page - 1) * 50
     if tree is None:
         print('无相关数据!\n')
     else:
         # verify_stockholder_args was introduced on 2019-11-26; its result
         # is only used for membership tests on header names here.
         stockholder_args = sh.verify_stockholder_args(tree)
         stockholder_li = tree.xpath(
             '//table[contains(@class,"ntable ntable-odd npth")]/tr[position()>1]|//table[contains(@class,"ntable ntable-odd npth")]/tbody/tr[position()>1]'
         )

         # The header set is the same for every row, so resolve the column
         # layout once instead of re-testing it per row.
         has_benefit = '最终受益股份' in stockholder_args
         has_paid_in = '实缴出资额' in stockholder_args
         has_relation = '关联产品/机构' in stockholder_args
         # The optional "最终受益股份" column sits before the capital columns
         # and pushes them one position to the right.
         shift = 1 if has_benefit else 0
         subscribed_col = 4 + shift
         contributed_col = 6 + shift
         # The related-product column follows the two paid-in columns when
         # those are present: 6, 7, 8 or 9 depending on the layout.
         relation_col = 6 + shift + (2 if has_paid_in else 0)

         for stockholder_info in stockholder_li:
             count += 1
             stockholder_num = first_text(stockholder_info, 'td[1]/text()')
             stockholder_name = first_text(
                 stockholder_info, 'td[2]//*[@class="seo font-14"]/text()')
             stockholder_rate = cell_text(stockholder_info, 3)
             subscribed_capital_amount = cell_text(
                 stockholder_info, subscribed_col)
             subscribed_capital_date = cell_text(
                 stockholder_info, subscribed_col + 1)

             if has_paid_in:
                 # Only the amount cell is probed: when its own text is
                 # blank, both the amount and the date are read from their
                 # nested <span>; otherwise both come from the cell text.
                 amount_direct = first_text(
                     stockholder_info, f'td[{contributed_col}]/text()')
                 nested = 'span/' if amount_direct == '' else ''
                 contributed_capital_amount = first_text(
                     stockholder_info,
                     f'td[{contributed_col}]/{nested}text()')
                 contributed_capital_date = first_text(
                     stockholder_info,
                     f'td[{contributed_col + 1}]/{nested}text()')
             else:
                 contributed_capital_amount = '--'
                 contributed_capital_date = '--'

             if has_relation:
                 # This column falls back to a nested link, not a <span>.
                 relation_product = cell_text(
                     stockholder_info, relation_col, child='a')
             else:
                 relation_product = '--'

             localtime = tm().get_localtime()  # current time
             create_time = localtime
             print('\n{0}--总第{1}条----第{2}/{3}页----{0}\n'.format(
                 '-' * 9, count, page, sh_page_count))
             print(f'当前时间:{create_time}')
             print(f'公司ID:{com_id}\n公司名称:{com_name}')
             print(
                 f'序号:{stockholder_num}\n股东:{stockholder_name}\n持股比例:{stockholder_rate}\n认缴出资额:{subscribed_capital_amount}\n认缴出资日期:{subscribed_capital_date}\n'
                 f'实缴出资额:{contributed_capital_amount}\n实缴出资日期:{contributed_capital_date}\n关联产品/机构:{relation_product}\n'
             )
             # NOTE(review): scraped values are interpolated straight into
             # the statement, so a value containing a double quote breaks or
             # alters the SQL. Switching to a parameterized query needs the
             # signature of self.db.inssts, which is not visible here.
             ins = f"""
             INSERT INTO `com_stockholder`
             (com_id,stockholder_num,stockholder_name,stockholder_rate,subscribed_capital_amount,
             subscribed_capital_date,contributed_capital_amount,contributed_capital_date,relation_product,create_time)
             VALUES 
             ("{com_id}","{stockholder_num}","{stockholder_name}","{stockholder_rate}","{subscribed_capital_amount}",
             "{subscribed_capital_date}","{contributed_capital_amount}","{contributed_capital_date}","{relation_product}","{create_time}");
             """
             # Disabled draft of a follow-up status update, kept for reference:
             # udp = f"""
             # UPDATE `com_info`
             # SET `status_stockholder` = "9"
             # AND `count_stockholder` = "{count_sh}"
             # WHERE `com_id` = "{com_id}";"""
             self.db.inssts(ins)