def company_information(self, response):
    """Extract the company name and registration number from the page.

    NOTE(review): the extracted values are not returned, stored or
    yielded — presumably work in progress; confirm against the spider flow.
    """
    sel = Selector(response=response)
    company_name = sel.xpath('//td[@style="text-align: left"]/text()').extract_first()
    number = sel.xpath('//td[@width="150"]')[2].xpath('text()').extract_first()
    # Keep only the first whitespace-delimited token of each value.
    company_name = company_name.split()[0]
    number = number.split()
def company_information(self, response):
    """Parse a company detail page, fill self.data and POST it to the API.

    Missing or malformed fields are stored as ''.

    Fixes:
    - ``address.split()[0]`` previously ran *before* the None check, so a
      missing address raised AttributeError and the dead ``if address == None``
      branch never executed.
    - ``!= None`` replaced with the idiomatic ``is not None``.
    """
    sel = Selector(response=response)
    company_name = sel.xpath('//td[@class="name_level3"]/text()').extract_first()
    number = sel.xpath('//td[@id="LicenseNum"]/text()').extract_first()
    address = sel.xpath('//td[@id="Description"]/text()').extract_first()
    # First whitespace-delimited token is the clean company name.
    company_name = company_name.split()[0]
    self.data['companyName'] = company_name
    # A valid unified social credit code is exactly 18 characters.
    if number is not None:
        number = number.split()[0]
        self.data['licenseNum'] = number if len(number) == 18 else ''
    else:
        self.data['licenseNum'] = ''
    if address is not None:
        self.data['contactAddress'] = address.split()[0]
    else:
        self.data['contactAddress'] = ''
    print(self.data)
    yield scrapy.Request(
        url='https://api.maotouin.com/rest/companyInfo/addCompanyRecord.htm',
        # url='http://192.168.199.188:8080/web/rest/companyInfo/addCompanyRecord.htm',
        method="POST",
        headers={'Content-Type': 'application/json'},
        body=json.dumps(self.data),
        callback=self.zz,
        meta={'company_name': company_name, 'data': self.data})
def xt_admin_date(cls, raw_person):
    """Extract the administration start/end dates from a raw person row.

    Returns a ``(start_date, end_date)`` tuple of ``datetime.date``;
    ``end_date`` is None for open-ended ranges, and ``(None, None)`` is
    returned when the string cannot be parsed.

    Fixes: removed the leftover ``ipdb.set_trace()`` debugger call and the
    bare ``except`` — previously a parse failure dropped into a debugger and
    then raised NameError on the return because the dates were never bound.
    """
    admin_datestring = Selector(text=raw_person).xpath(
        '//td[1]/span/@title').extract()[0]
    # Keep only the first administration when several are listed.
    if ';' in admin_datestring:
        admin_datestring = admin_datestring.split(";")[0]
    if ',' in admin_datestring:
        admin_datestring = admin_datestring.split(",")[0]
    try:
        if " - " in admin_datestring:
            # Closed range "dd.mm.yyyy - dd.mm.yyyy".
            start_date = _clean(admin_datestring.split(' - ')[0])
            end_date = _clean(admin_datestring.split(' - ')[1])
            start_date = datetime.datetime.strptime(
                start_date, "%d.%m.%Y").date()
            end_date = datetime.datetime.strptime(
                end_date, "%d.%m.%Y").date()
        else:
            # Open-ended administration: only a start date.
            start_date = datetime.datetime.strptime(
                _clean(admin_datestring), "%d.%m.%Y").date()
            end_date = None
    except ValueError:
        # strptime signals malformed input with ValueError only.
        logger.error(
            "Couldn't extract date from datestring {}".format(
                admin_datestring))
        start_date = end_date = None
    return (start_date, end_date)
def company_information(self, response):
    """Parse a company page, dedupe by name via redis, and POST the record."""
    sel = Selector(response=response)
    company_name = sel.xpath('//td[@colspan="3"]/text()').extract_first()
    number = sel.xpath('//table[@class="detailTable"]')[0] \
        .xpath('./tr[2]/td[4]/text()').extract_first()
    company_name = company_name.split()[0]
    # sadd returns 0 when the name was already present (duplicate company).
    if not self.r.sadd('Company_name', company_name):
        print('此公司信息已经存在', company_name)
        return
    tokens = number.split()
    license_num = tokens[0] if tokens else ''
    # Only an 18-character unified social credit code is kept.
    if len(license_num) != 18:
        license_num = ''
    self.data['licenseNum'] = license_num
    self.data['companyName'] = company_name
    yield scrapy.Request(
        url='https://api.maotouin.com/rest/companyInfo/addCompanyRecord.htm',
        # url='http://192.168.199.188:8080/web/rest/companyInfo/addCompanyRecord.htm',
        method="POST",
        headers={'Content-Type': 'application/json'},
        body=json.dumps(self.data),
        callback=self.zz,
        meta={'company_name': company_name, 'data': self.data})
def company_information(self, response):
    """Parse a company page, dedupe via redis, POST the record to the API."""
    sel = Selector(response=response)
    company_name = sel.xpath(
        '//span[@id="ctl00_ContentPlaceHolder1_FormView1_Label10"]/text()'
    ).extract_first()
    number = sel.xpath('//td[@class="inquiry_intitleb"]')[5].xpath(
        './span/text()').extract_first()
    company_name = company_name.split()[0]
    # sadd returns 0 for an already-seen name (duplicate).
    if self.r.sadd('Company_name', company_name):
        self.data['companyName'] = company_name
        if number is not None:
            first = number.split()[0]
            # Only store a well-formed 18-character credit code.
            if len(first) == 18:
                self.data['licenseNum'] = first
        print(self.data)
        yield scrapy.Request(
            url='https://api.maotouin.com/rest/companyInfo/addCompanyRecord.htm',
            # url='http://192.168.199.188:8080/web/rest/companyInfo/addCompanyRecord.htm',
            method="POST",
            headers={'Content-Type': 'application/json'},
            body=json.dumps(self.data),
            callback=self.zz,
            meta={'company_name': company_name, 'data': self.data})
        person_zz = sel.xpath('//table[@id="GridView2"]')
        print(len(person_zz), 'zzzzzzzzzzzzzzzzzzzzzzz')
    else:
        print('此公司信息已经存在', company_name)
def company_information(self, response):
    """Parse a company detail page into self.data and POST it to the API.

    Missing fields become ''; the phone placeholder '/' is also treated as
    missing; the licence number is kept only when it is 18 characters long.
    The repeated "first token or ''" pattern is factored into a helper.

    NOTE(review): the td ids look swapped (address <- #LicenseNum,
    phone <- #RegPrin, person <- #EconType) — confirm against the page.
    """
    def first_token(raw):
        # First whitespace-delimited token of raw, or '' when raw is None.
        return raw.split()[0] if raw is not None else ''

    sel = Selector(response=response)
    company_name = sel.xpath(
        '//td[@class="name_level3"]')[0].xpath('text()').extract_first()
    number = sel.xpath('//td[@id="Td3"]/text()').extract_first()
    person = sel.xpath('//td[@id="EconType"]/text()').extract_first()
    address = sel.xpath('//td[@id="LicenseNum"]/text()').extract_first()
    phone = sel.xpath('//td[@id="RegPrin"]/text()').extract_first()
    company_name = company_name.split()[0]

    phone = first_token(phone)
    # '/' is the site's "no phone" placeholder.
    self.data['contactPhone'] = '' if phone == '/' else phone
    self.data['contactMan'] = first_token(person)
    self.data['contactAddress'] = first_token(address)
    number = first_token(number)
    # A valid unified social credit code is exactly 18 characters.
    self.data['licenseNum'] = number if len(number) == 18 else ''
    self.data['companyName'] = company_name
    # print(self.data)
    yield scrapy.Request(
        url='https://api.maotouin.com/rest/companyInfo/addCompanyRecord.htm',
        # url='http://192.168.199.188:8080/web/rest/companyInfo/addCompanyRecord.htm',
        method="POST",
        headers={'Content-Type': 'application/json'},
        body=json.dumps(self.data),
        callback=self.zz,
        meta={'company_name': company_name, 'data': self.data})
def company_info(self, response):
    """Scrape the credit-evaluation table rows for one company and print each."""
    sel = Selector(response=response)
    company_name = sel.xpath('//td[@colspan="3"]')[0].xpath('./a/@title').extract_first()
    number = sel.xpath('//td[@colspan="3"]')[3].xpath('text()').extract_first()
    if number.split():
        number = number.split()[0]
        # Anything that is not an 18-char credit code is discarded.
        if len(number) != 18:
            number = ''
    tr = sel.xpath('//table[@id="table_credit"]/tbody/tr')
    just_z = sel.xpath(
        '//table[@id="table_credit"]/tbody/tr[1]/td[1]/text()').extract_first()
    # The site renders a single "no data" row when the table is empty.
    if just_z == '没有相关数据':
        return
    for row in tr:
        cells = row.xpath('./td/text()')
        credit_evaluate = {
            # category
            'type_name': cells[0].extract().split()[0],
            'e_result': '',
            'have_date': '',
            'department': '',
            'validity_time': '',
            'company_name': company_name,
            'number': number,
        }
        # evaluation result, issue date, evaluating body, validity period
        for key, idx in (('e_result', 1), ('have_date', 2),
                         ('department', 3), ('validity_time', 4)):
            value = cells[idx].extract()
            if value is not None:
                credit_evaluate[key] = value
        print(credit_evaluate)
def company_info(self, response):
    """Scrape the good-conduct (table_good) rows for one company and print each.

    Fix: the award-number guard was ``is not None or != '/'`` which is always
    true; the intent is to skip both missing values and the '/' placeholder,
    so it now uses ``and``.
    """
    sel = Selector(response=response)
    company_name = sel.xpath('//td[@colspan="3"]')[0].xpath('./a/@title').extract_first()
    number = sel.xpath('//td[@colspan="3"]')[3].xpath('text()').extract_first()
    if number.split():
        number = number.split()[0]
        # Anything that is not an 18-char credit code is discarded.
        if len(number) != 18:
            number = ''
    tr = sel.xpath('//table[@id="table_good"]/tbody/tr')
    if tr:
        for t in tr:
            good_action = {'project_name': '',
                           'good_grade': '',
                           'have_date': '',
                           'send_department': '',
                           'have_number': '',
                           'company_name': company_name,
                           'number': number
                           }
            # project name
            if len(t.xpath('./td/text()')) >= 2:
                try:
                    project_name = t.xpath('./td/text()')[0].extract().split()[0]
                    good_action['project_name'] = project_name
                except IndexError:
                    continue
            # award grade
            good_grade = t.xpath('./td/text()')[1].extract()
            if good_grade is not None and good_grade.split():
                good_action['good_grade'] = good_grade.split()[0]
            # issuing unit
            send_department = t.xpath('./td/text()')[2].extract()
            if send_department is not None:
                good_action['send_department'] = send_department.split()[0]
            # award document number; '/' is the site's "none" placeholder
            have_number = t.xpath('./td/text()')[3].extract()
            if have_number is not None and have_number != '/':
                good_action['have_number'] = have_number
            # award date
            try:
                have_date = t.xpath('./td/text()')[4].extract()
                if have_date is not None:
                    good_action['have_date'] = have_date
            except IndexError:
                continue
            print(good_action)
def company_information(self, response):
    """Parse a Zhejiang company detail page and POST the record to the API.

    Fixes:
    - ``number.split()`` / ``person_name.split()`` raised AttributeError when
      extract_first() returned None; missing fields now fall back to ''.
    - misspelled local ``adderss`` renamed; leftover 'AAAA...' debug prints
      removed.
    """
    sel = Selector(response=response)
    company_name = sel.xpath('//td[@colspan="5"]')[0].xpath('text()').extract_first()
    address = sel.xpath('//td[@colspan="5"]/text()')[1].extract()
    number = sel.xpath(
        '//div[@class="detail_list"]/table/tr[2]/td[6]/text()').extract_first()
    person_name = sel.xpath(
        '//div[@class="detail_list"]/table/tr[7]/td[2]/text()').extract_first()

    data = {}
    data['companyName'] = company_name
    # Licence number must be exactly 18 characters to be stored.
    tokens = number.split() if number is not None else []
    data['licenseNum'] = tokens[0] if tokens and len(tokens[0]) == 18 else ''
    name_tokens = person_name.split() if person_name is not None else []
    data['contactMan'] = name_tokens[0] if name_tokens else ''
    data['contactAddress'] = address.split()[0] if address is not None else ''
    data['companyArea'] = '浙江省'
    data['area'] = ''
    data['contactPhone'] = ''
    data['token'] = self.token
    print(data)
    yield scrapy.Request(
        url='https://api.maotouin.com/rest/companyInfo/addCompanyRecord.htm',
        # url='http://192.168.199.188:8080/web/rest/companyInfo/addCompanyRecord.htm',
        method="POST",
        headers={'Content-Type': 'application/json'},
        body=json.dumps(data),
        callback=self.zz,
        meta={'company_name': company_name, 'data': data},
        dont_filter=True
    )
def parse(self, response):
    """Parse an NCAA rankings table page into Ranking items.

    Pages whose URL does not contain the rankings path are skipped.
    Fix: the bare ``except`` (which even swallowed KeyboardInterrupt) is
    narrowed to ``Exception``.
    """
    ranking = Ranking()
    formatter = Data_Formatter()
    url = response.url
    link_checker = "www.ncaa.com/rankings"
    if link_checker not in url:
        print(
            "---------------------------------------------------------------------"
        )
        print("Invalid link: {}, skipping".format(url))
        print(
            "---------------------------------------------------------------------"
        )
    else:
        table_body = response.xpath(
            '//*[@id="block-bespin-content"]/div/article/table/tbody[1]')
        rows = table_body.css('tr').getall()
        for row in rows:
            data_cells = Selector(text=row).css('td').getall()
            rank = Selector(text=data_cells[0]).css('td::text').get()
            team = Selector(text=data_cells[1]).css('td::text').get()
            try:
                # Team cells look like "Name (record)"; keep the name only.
                team = team.split('(')[0].strip()
                ranking['rank'] = rank
                team = formatter.lengthen_abbreviated(team)
                ranking['team'] = team
                yield ranking
            except Exception:
                # team may be None (no text node) or the formatter may fail.
                print("Error with splitting team ranking item")
        print("Scraped Rankings @ {}".format(response.url))
def parse_comment(self, response):
    """Yield a CommentItem for every short comment on a Douban comment page."""
    movie_name = Selector(response).xpath(
        '//h1/text()').extract_first().replace('短评', '').strip()
    comments = SelectorList(
        Selector(response).xpath('//div[@class="comment"]').extract())
    for raw in comments:
        block = Selector(text=raw)
        shorts = block.xpath('//p/span/text()').extract_first()
        votes = block.xpath(
            '//h3/span[@class="comment-vote"]/span/text()').extract_first()
        stars = block.xpath(
            '//h3/span[@class="comment-info"]/span[contains(@class,"rating")]/@class'
        ).extract_first()
        # "allstar40 rating" -> "4"; no rating element -> 0.
        if stars:
            stars = stars.split()[0].replace('allstar', '').strip().replace('0', '')
        else:
            stars = 0
        comment_time = block.xpath(
            '//h3/span[@class="comment-info"]/span[@class="comment-time "]/text()'
        ).extract_first()
        item = CommentItem()
        item['movie_name'] = movie_name
        item['shorts'] = shorts
        item['stars'] = stars
        item['votes'] = votes
        item['comment_time'] = comment_time
        yield item
def GetPdfInfo(self, station_id):
    """Fetch the check-draft page for a station and return its pdf link info.

    Returns {'download_url': ..., 'file_name': ...} on success, or None
    when the HTTP status is not 200.
    """
    # Query parameters for the check-draft endpoint.
    params = {'Type': 'pdf', 'ID': station_id}
    url = self.checkdraft_url
    ret = self.session.get(url, params=params)
    if ret.status_code != 200:
        print('failed to get pdf information')
        return None
    page = Selector(text=ret.text)
    download_url = page.xpath('//a/@href').extract()[0]
    file_name = page.xpath('//a/text()').extract()[0]
    pdf_info = {
        'download_url': download_url,
        # Keep the part after the first '/' of the link text.
        'file_name': file_name.split('/')[1],
    }
    print(pdf_info)
    return pdf_info
def change_person_data(self, response):
    """Parse the change-record (变更记录) table into change_data dicts.

    Fix: the company xpaths ended in ``/text`` (an *element* test matching
    a <text> tag) instead of the ``text()`` node test, so extract_first()
    always returned None for original_company / now_z_company.
    """
    change_data = {}
    # Change-record page.
    name = '变更记录'
    grade = Selector(response=response).xpath(
        '//tbody/tr/td[1]/text()').extract_first()
    grade = grade.split(' ')[0]
    if grade == '暂未查询到已登记入库信息':
        # Site reports no registered records for this person.
        print('zzzzzzzzzzzzzzzzzzz')
    else:
        change_data['grade'] = grade
        now_company = Selector(response=response).xpath(
            '//div[@class="curQy"]/span/text()').extract_first()
        change_data['now_company'] = now_company
        change_record = Selector(
            response=response).xpath('//ul[@class="cbp_tmtimeline"]/li')
        for c in change_record:
            year = c.xpath('./div[1]/span[1]/text()').extract_first()
            month_day = c.xpath('./div[1]/span[2]/text()').extract_first()
            # Date of the change (year string + month/day string).
            date = year + month_day
            change_data['date'] = date
            # Previous employer.
            original_company = c.xpath(
                './div[@class="cbp_tmlabel"]/p/span[1]/text()').extract_first()
            change_data['original_company'] = original_company
            # Current employer.
            now_z_company = c.xpath(
                './div[@class="cbp_tmlabel"]/p/span[2]/text()').extract_first()
            change_data['now_z_company'] = now_z_company
def achivment(self, response):
    """Parse the personal credit-record table rows into merit dicts.

    Fix: ``start_date.slipt(':')`` was a typo for ``split`` and raised
    AttributeError at runtime for every 决定内容 cell.
    """
    change_data = {}
    person_name = response.meta['item']['name']
    name = '个人功绩'
    print(name, 'zz')
    change_data['person_name'] = person_name
    grade = Selector(response=response).xpath(
        '//tbody/tr/td[1]/text()').extract_first()
    grade = grade.split(' ')[0]
    if grade == '暂未查询到已登记入库信息':
        # Site reports no registered records for this person.
        print('暂时无数据')
    else:
        content = Selector(response=response).xpath('//tbody/tr')
        for c in content:
            td = c.xpath('./td')
            merit = {}
            # Each cell carries its column header in @data-header.
            for t in td:
                field_name = t.xpath('@data-header').extract_first()
                field_name = field_name.split()[0]
                if field_name == '诚信记录编号':
                    # credit record number
                    merit['serial_number'] = t.xpath('./span/text()').extract_first()
                elif field_name == '诚信记录主体':
                    # record subject (person)
                    value = t.xpath('./a/text()').extract_first()
                    merit['person_name'] = value.split()[0]
                elif field_name == '决定内容':
                    # decision content: result, action, start date, issuer
                    result = t.xpath('text()')[1].extract()
                    merit['result'] = result.split()[0]
                    what_action = t.xpath('./div/span[1]/text()').extract_first()
                    what_action = what_action.replace('【', '').replace('】', '')
                    merit['what_action'] = what_action
                    start_date = t.xpath('./div/span[2]/text()').extract_first()
                    # BUG FIX: was start_date.slipt(':') -> AttributeError.
                    start_date = start_date.split(':')[1]
                    merit['start_date'] = start_date
                    merit['department'] = t.xpath('./div/a/@data-no').extract_first()
                    merit['content'] = t.xpath('./div/a/@data-text').extract_first()
                elif field_name == '实施部门(文号)':
                    # implementing department (document number)
                    value = t.xpath('text()').extract_first()
                    merit['a_department'] = value.split()[0]
                elif field_name == '发布有效期':
                    # publication validity period
                    merit['project_type'] = t.xpath('text()').extract_first()
            merit['token'] = self.token
            merit['corporate_name'] = self.corporate_name
            print(merit)
def company_information(self, response):
    """Build the Corp_Project query URL for this company and request it."""
    sel = Selector(response=response)
    company_name = sel.xpath('//td[@class="name_level3"]/text()').extract_first()
    number = sel.xpath('//td[@id="LicenseNum"]/text()').extract_first()
    company_name = company_name.split()[0]
    # Missing licence number becomes '' in the query string.
    number = number.split()[0] if number is not None else ''
    cc = 'http://218.60.144.163/LNJGPublisher/handle/Corp_Project.ashx?' \
         'CorpCode=%s&CorpName=%s&nPageCount=0&nPageIndex=1&nRecordSetCount=0&nPageSize=%s&_=1558580207472' \
         % (number, company_name, 100)
    yield scrapy.Request(url=cc,
                         callback=self.project,
                         meta={'companyName': company_name})
def xt(cls, response):
    """Extract birth, death and occupation data from a biography block.

    Returns a dict with 'birthdate'/'deathdate' as datetime.date or None,
    and 'birthplace'/'deathplace'/'occupation' as strings ('' if absent).

    Fix: the two bare ``except`` clauses are narrowed to ``ValueError``,
    which is what strptime raises on malformed input.
    """
    bio = {
        'birthdate': None,
        'birthplace': '',
        'deathdate': None,
        'deathplace': '',
        'occupation': ''
    }
    bio_data = response.xpath(cls.XPATH).extract()
    if bio_data:
        bio_data = bio_data[0]
    else:
        return bio
    for data in bio_data.split('<br>'):
        # Birth data: "Geb. <date>, <place>"
        birth = Selector(text=data)\
            .xpath("//em[contains(text(),'Geb.')]/parent::*/text()")\
            .extract()
        if birth:
            birth = birth[0]
            bio['birthdate'] = _clean(birth.split(',')[0])
            try:
                bio['birthdate'] = datetime.datetime.strptime(
                    bio['birthdate'], "%d.%m.%Y").date()
            except ValueError:
                logger.error(
                    "Failed to parse birthdate: {}".format(
                        bio['birthdate']))
                bio['birthdate'] = None
            if len(birth.split(',')) > 1:
                bio['birthplace'] = birth.split(',')[1]
        # Death data: "Verst. <date>, <place>"
        death = Selector(text=data)\
            .xpath("//em[contains(text(),'Verst.')]/parent::*/text()")\
            .extract()
        if death:
            death = death[0]
            bio['deathdate'] = _clean(death.split(',')[0])
            try:
                bio['deathdate'] = datetime.datetime.strptime(
                    bio['deathdate'], "%d.%m.%Y").date()
            except ValueError:
                logger.error(
                    "Failed to parse deathdate: {}".format(
                        bio['deathdate']))
                bio['deathdate'] = None
            if len(death.split(',')) > 1:
                bio['deathplace'] = death.split(',')[1]
        # Occupation: "Beruf ..." — keep text before the first comma.
        occupation = Selector(text=data)\
            .xpath("//em[contains(text(),'Beruf')]/parent::*/text()")\
            .extract()
        if occupation:
            occupation = occupation[0]
            bio['occupation'] = occupation.split(',')[0]
    return bio
def change_person_data(self, response):
    """Parse the change-record (变更记录) table for one person.

    For each timeline entry, builds a synthetic integer key from the
    change date (deduplicated via ``myset``) plus the previous and current
    employer names, and prints the resulting change_data dict.
    """
    change_data = {}
    # Change-record page.
    person_name = response.meta['item']['name']
    name = '变更记录'
    print(name, 'zz')
    change_data['person_name'] = person_name
    grade = Selector(response=response).xpath(
        '//tbody/tr/td[1]/text()').extract_first()
    grade = grade.split(' ')[0]
    if grade == '暂未查询到已登记入库信息':
        # Site reports no registered records for this person.
        print('暂时无数据')
    else:
        change_data['grade'] = grade
        now_company = Selector(response=response).xpath(
            '//div[@class="curQy"]/span/text()').extract_first()
        change_data['now_company'] = now_company
        change_record = Selector(
            response=response).xpath('//ul[@class="cbp_tmtimeline"]/li')
        # print(change_record)
        # Dates already emitted — used to disambiguate same-day changes.
        myset = set()
        for c in change_record:
            year = c.xpath('./div[1]/span[1]/text()').extract_first()
            month_day = c.xpath('./div[1]/span[2]/text()').extract_first()
            # "2019年" -> "2019"; "05/21" -> month "05", day "21".
            year = year.split('年')[0]
            month_day = month_day.split('/')
            month = month_day[0]
            day = month_day[1]
            # Concatenate to YYYYMMDD, then append a sequence digit (…1);
            # a collision with an earlier entry bumps the digit to …2.
            date = year + month + day
            date = int(date)
            date = date * 10 + 1
            if date in myset:
                date += 1
            myset.add(date)
            change_data['not_data'] = date
            # Previous employer.
            original_company = c.xpath(
                './div[@class="cbp_tmlabel"]/p/span[1]/text()'
            ).extract_first()
            change_data['original_company'] = original_company
            # Current employer.
            now_z_company = c.xpath(
                './div[@class="cbp_tmlabel"]/p/span[2]/text()'
            ).extract_first()
            change_data['now_z_company'] = now_z_company
            change_data['name_company'] = self.corporate_name
            print(change_data)
            print('一条信息')
def company_information(self, response):
    """Assemble a Guangdong company record and POST it to the local API."""
    sel = Selector(response=response)
    td = sel.xpath('//div[@id="ent-info "]/div[2]/div/h5/text()').extract_first()
    company_name = sel.xpath('//div[@class="ln-title"]/text()').extract_first()
    company_name = company_name.split()[0]
    data = {
        'companyName': company_name,
        'area': '广东省',
        'companyArea': '',
        'token': self.token,
    }
    # Keep the number only when it is an 18-character credit code.
    tokens = td.split()
    if tokens and len(tokens[0]) == 18:
        data['licenseNum'] = tokens[0]
    else:
        data['licenseNum'] = ''
    div_person = sel.xpath('//div[@id="ent-into"]/div')
    # Two child divs means the page has no personnel section.
    if len(div_person) == 2:
        data['contactMan'] = ''
        data['contactAddress'] = ''
        data['contactPhone'] = ''
        print('无人员注入')
    else:
        address = div_person[3].xpath('./div/h5/text()').extract_first()
        person_name = div_person[4].xpath('./div/h5/text()').extract_first()
        phone_number = div_person[5].xpath('./div/h5/text()').extract_first()
        data['contactMan'] = person_name.split()[0]
        data['contactAddress'] = address.split()[0]
        # NOTE(review): key 'phone_number' differs from the 'contactPhone'
        # used in the empty branch — confirm which key the API expects.
        data['phone_number'] = phone_number.split()[0]
    print(data)
    return Request(url='http://192.168.199.188:8080/web/rest/companyInfo/addCompanyRecord.htm',
                   method="POST",
                   headers={'Content-Type': 'application/json'},
                   body=json.dumps(data),
                   callback=self.zz
                   )
def parse_basic_info(self, response):
    """Parse each download-list item and request its detail page.

    Relative image paths get the gamersky CDN base prefixed. The labelled
    txt fields ("更新:…", "类型:…", …) are split on ':' keeping the value.

    Fixes:
    - the old guard ``img != '' and img != None and img != []`` compared a
      str against None/[] (dead checks) — simplified to truthiness;
    - ``split(':')`` now uses maxsplit=1 so values containing ':' are not
      truncated.
    """
    contents = response.xpath(
        '//ul[@class="down_con downData"]//li').extract()
    for content in contents:
        sel = Selector(text=content)
        img = sel.xpath('//li//div[@class="img"]/a/img/@src').extract()[0]
        if img and not img.startswith('http'):
            img = 'http://img4.gamersky.com/Files/GamerSky/' + img
        name = sel.xpath('//li//div[@class="img"]/a/@title').extract()[0]
        url = sel.xpath('//li//div[@class="img"]/a/@href').extract()[0]
        update_time = sel.xpath('//li//div[@class="txt"][1]/text()').extract()[0]
        if update_time is not None:
            update_time = update_time.split(':', 1)[1]
        type = sel.xpath('//li//div[@class="txt"][2]/text()').extract()[0]
        if type is not None:
            type = type.split(':', 1)[1]
        language = sel.xpath('//li//div[@class="txt"][3]/text()').extract()[0]
        if language is not None:
            language = language.split(':', 1)[1]
        size = sel.xpath('//li//div[@class="txt"][4]/text()').extract()[0]
        if size is not None:
            size = size.split(':', 1)[1]
        yield scrapy.Request(url=url,
                             headers=self.default_headers,
                             body=self.default_data,
                             callback=self.parse_detail_info,
                             meta={
                                 'img': img,
                                 'name': name,
                                 'update_time': update_time,
                                 'type': type,
                                 'language': language,
                                 'size': size
                             },
                             dont_filter=True)
def xt(cls, response):
    """Extract birth, death and occupation data from a biography block.

    Returns a dict with 'birthdate'/'deathdate' as datetime.date or None,
    and 'birthplace'/'deathplace'/'occupation' as strings ('' if absent).

    Fix: the two bare ``except`` clauses are narrowed to ``ValueError``,
    which is what strptime raises on malformed input.
    """
    bio = {
        'birthdate': None,
        'birthplace': '',
        'deathdate': None,
        'deathplace': '',
        'occupation': ''
    }
    bio_data = response.xpath(cls.XPATH).extract()
    if bio_data:
        bio_data = bio_data[0]
    else:
        return bio
    for data in bio_data.split('<br>'):
        # Birth data: "Geb. <date>, <place>"
        birth = Selector(text=data)\
            .xpath("//em[contains(text(),'Geb.')]/parent::*/text()")\
            .extract()
        if birth:
            birth = birth[0]
            bio['birthdate'] = _clean(birth.split(',')[0])
            try:
                bio['birthdate'] = datetime.datetime.strptime(
                    bio['birthdate'], "%d.%m.%Y").date()
            except ValueError:
                logger.error("Failed to parse birthdate: {}".format(
                    bio['birthdate']))
                bio['birthdate'] = None
            if len(birth.split(',')) > 1:
                bio['birthplace'] = birth.split(',')[1]
        # Death data: "Verst. <date>, <place>"
        death = Selector(text=data)\
            .xpath("//em[contains(text(),'Verst.')]/parent::*/text()")\
            .extract()
        if death:
            death = death[0]
            bio['deathdate'] = _clean(death.split(',')[0])
            try:
                bio['deathdate'] = datetime.datetime.strptime(
                    bio['deathdate'], "%d.%m.%Y").date()
            except ValueError:
                logger.error("Failed to parse deathdate: {}".format(
                    bio['deathdate']))
                bio['deathdate'] = None
            if len(death.split(',')) > 1:
                bio['deathplace'] = death.split(',')[1]
        # Occupation: "Beruf ..." — keep text before the first comma.
        occupation = Selector(text=data)\
            .xpath("//em[contains(text(),'Beruf')]/parent::*/text()")\
            .extract()
        if occupation:
            occupation = occupation[0]
            bio['occupation'] = occupation.split(',')[0]
    return bio
def company_information(self, response):
    """Request the project list for the company identified by meta['cc']."""
    raw_name = Selector(response=response).xpath(
        '//td[@class="name_level3"]/text()').extract_first()
    # First whitespace-delimited token is the clean company name.
    company_name = raw_name.split()[0]
    basic_url = ('http://cx.jlsjsxxw.com/handle/Corp_Project.ashx?corpid=%s&_=1556177544518'
                 % response.meta['cc'])
    yield scrapy.Request(url=basic_url,
                         callback=self.project,
                         dont_filter=True,
                         meta={'companyName': company_name})
def bad_recode(self,response):
    """Parse bad-conduct (不良行为) records and POST each row to the API.

    Each table cell is dispatched on its @data-header column label; the
    assembled not_good dict is sent to self.tongnie as JSON.
    """
    content = Selector(response=response).xpath('//tbody/tr')
    # A single "no records" row means nothing is registered.
    if not content.xpath('./td/text()').extract_first() == "暂未查询到已登记入库信息":
        print(content.xpath('./td/text()').extract_first(), '不良行为相关信息')
        for c in content:
            td = c.xpath('./td')
            not_good = {}
            for t in td:
                # Column label carried in the cell's @data-header.
                h = t.xpath('@data-header').extract_first()
                h = h.split()[0]
                if h == "诚信记录编号":
                    # credit record number
                    d = t.xpath('./span/text()').extract_first()
                    print("诚信记录编号", d)
                    d = d.split()[0]
                    not_good['creditNum'] = d
                elif h == "诚信记录主体":
                    # record subject (company)
                    d = t.xpath('./a/text()').extract_first()
                    print("诚信记录主体", d)
                    d = d.split()[0]
                    not_good['companyName'] = d
                elif h == "决定内容":
                    # decision content: begin date, file text, remark
                    d = t.xpath('./div/span[2]/text()').extract_first()
                    d = d.split(':')[1]
                    print("决定内容", d)
                    not_good['beginDate'] = d
                    # NOTE(review): this rebinds the name `content` used by
                    # the outer loop; iteration is unaffected in Python but
                    # the shadowing is confusing — consider renaming.
                    content = t.xpath('./div/a/@data-text').extract_first()
                    content = content.split()[0]
                    print("决定内容", content)
                    not_good['fileContent'] = content
                    result = t.xpath('text()')[1].extract()
                    result = result.split()[0]
                    not_good['mark'] = result
                    print("决定内容", result)
                elif h == "实施部门(文号)":
                    # implementing department (document number)
                    address = t.xpath('text()').extract_first()
                    address = address.split()[0]
                    print('实施部门(文号)', address)
                    not_good['departName'] = address
                    number = t.xpath('./div/text()').extract_first()
                    number = number.split()[0]
                    print('实施部门(文号)', number)
                    not_good['fileNum'] = number
                elif h == "发布有效期":
                    # publication validity period (rebinds loop var `t`)
                    t = t.xpath('text()').extract_first()
                    t = t.split()[0]
                    not_good['endDate'] = t
                    print('发布有效期', t)
            not_good['token'] = self.token
            yield Request(url=self.tongnie,
                          method="POST",
                          body=json.dumps(not_good),
                          headers={'Content-Type': 'application/json'},
                          callback=self.zz)
            print('发送成功----', not_good)
    else:
        print(self.corporate_name, '--没有--', self.action, '这个相关的记录')
def company_information(self, response):
    """Dedupe a Liaoning company by name and POST its record to the API."""
    sel = Selector(response=response)
    company_name = sel.xpath('//td[@class="name_level3"]/text()').extract_first()
    number = sel.xpath('//td[@id="CorpCode"]/text()').extract_first()
    person = sel.xpath('//td[@id="Td4"]/text()').extract_first()
    company_name = company_name.split()[0]
    # Dedupe key is name + province; sadd returns 0 on duplicates.
    if not self.r.sadd('Company_name', company_name + '辽宁省'):
        print('此公司信息已经存在', company_name)
        return
    self.data['companyName'] = company_name
    # NOTE(review): the value from td#Td4 lands in contactPhone — confirm
    # the field mapping against the page.
    self.data['contactPhone'] = person.split()[0] if person is not None else ''
    if number is not None:
        first = number.split()[0]
        # Only an 18-character credit code is kept.
        self.data['licenseNum'] = first if len(first) == 18 else ''
    else:
        self.data['licenseNum'] = ''
    print(self.data)
    yield scrapy.Request(
        url='https://api.maotouin.com/rest/companyInfo/addCompanyRecord.htm',
        # url='http://192.168.199.188:8080/web/rest/companyInfo/addCompanyRecord.htm',
        method="POST",
        headers={'Content-Type': 'application/json'},
        body=json.dumps(self.data),
        callback=self.zz,
        meta={'company_name': company_name, 'data': self.data})
def company_information(self, response):
    """Walk a company's project list and request each project detail page.

    Fix: the licence number previously took ``number[0]`` — the first
    *character* of the raw cell — instead of the first whitespace token,
    so the 18-character check always failed and the number was dropped.
    """
    sel = Selector(response=response)
    company_name = sel.xpath('//td[@colspan="3"]/text()').extract_first()
    number = sel.xpath('//table[@class="detailTable"]')[0] \
        .xpath('./tr[2]/td[4]/text()').extract_first()
    company_name = company_name.split()[0]
    # repeat = self.r.sadd('Company_name', company_name)
    repeat = 1  # dedupe temporarily disabled
    if repeat != 0:
        if number.split():
            number = number.split()[0]
            # Only an 18-character credit code is kept.
            if len(number) != 18:
                number = ''
        project_info = sel.xpath('//table[@class="detailTable"]')[4].xpath('./tr')
        title = project_info[0].xpath('./td/text()').extract()[0]
        if title == '项目信息(0个)':
            print('没有项目的公司--%s' % company_name)
        else:
            print('当前公司%s----项目%s' % (company_name, title))
            # Skip the two header rows.
            project_info = project_info[2:]
            print(len(project_info), 'BBBBBBBBBBBBBBBBBB')
            for p in project_info:
                project_url = p.xpath('./td[2]/p/a/@onclick').extract_first()
                # The detail path is buried in a window.open(...) handler.
                xx = 'window.open\(\'/(.*)\', \'dasfddd.*|window.open\(\'/(.*)\', \'fdsafa.*'
                cc = re.findall(xx, project_url)[0]
                url = cc[0] if cc[0] else cc[1]
                yield scrapy.Request(url='http://jzscyth.shaanxi.gov.cn:7001/' + url,
                                     callback=self.company_project,
                                     meta={'company_name': company_name, 'number': number},
                                     dont_filter=True)
    else:
        print('此公司信息已经存在', company_name)
def company_information(self, response):
    """Build the Henan project-search URL for this company and request it."""
    sel = Selector(response=response)
    company_name = sel.xpath(
        '//span[@id="ctl00_ContentPlaceHolder1_FormView1_Label10"]/text()'
    ).extract_first()
    number = sel.xpath('//td[@class="inquiry_intitleb"]')[5].xpath(
        './span/text()').extract_first()
    # Keep the number only when it is an 18-character credit code.
    if number is not None:
        number = number.split()[0]
        if len(number) != 18:
            number = ''
    company_name = company_name.split()[0]
    xx = 'http://hngcjs.hnjs.gov.cn/SiKuWeb/Gcxm.aspx?CorpName=%s&CorpCode=%s' % (
        company_name, number)
    print(xx)
    yield scrapy.Request(url=xx,
                         callback=self.project,
                         meta={'company_name': company_name,
                               'number': number,
                               'page': 1})
def company_information(self, response):
    """Parse a Xinjiang company page, fill self.data and POST it to the API.

    Fix: ``!= None`` comparisons replaced with the idiomatic ``is not
    None``; the verbose licence branching collapsed to one conditional.
    """
    sel = Selector(response=response)
    company_name = sel.xpath('//span[@class="user-name"]/text()').extract_first()
    number = sel.xpath('//div[@class="bottom"]/dl/dt/text()').extract_first()
    company_name = company_name.split()[0]
    # Keep the number only when it is a well-formed 18-char credit code.
    if number is not None:
        number = number.split()[0]
        self.data['licenseNum'] = number if len(number) == 18 else ''
    else:
        self.data['licenseNum'] = ''
    self.data['companyName'] = company_name
    print(self.data)
    yield scrapy.Request(
        url='https://api.maotouin.com/rest/companyInfo/addCompanyRecord.htm',
        # url='http://192.168.199.188:8080/web/rest/companyInfo/addCompanyRecord.htm',
        method="POST",
        headers={'Content-Type': 'application/json'},
        body=json.dumps(self.data),
        callback=self.zz,
        meta={'company_name': company_name, 'data': self.data}
    )
def all_project(self, response):
    """Request the performance list for the company on this detail page."""
    detail_url = response.url
    # The company id is the path segment after .../compDetail/.
    pattern = 'http://jsy.xjjs.gov.cn/dataservice/query/comp/compDetail/(.*)'
    comp_id = re.findall(pattern, detail_url)[0]
    sel = Selector(response=response)
    company_name = sel.xpath('//span[@class="user-name"]/text()').extract_first()
    number = sel.xpath('//div[@class="bottom"]/dl/dt/text()').extract_first()
    company_name = company_name.split()[0]
    url = 'http://jsy.xjjs.gov.cn/dataservice/query/comp/compPerformanceListSys/' + comp_id
    # Paging parameters: fetch up to 100 rows on page 1.
    send_data = {'$total': '100',
                 '$pgsz': '100',
                 '$pg': '1',
                 '$reload': '0',
                 }
    yield scrapy.FormRequest(url=url,
                             formdata=send_data,
                             callback=self.project_info,
                             meta={"company_name": company_name,
                                   "number": number}
                             )
def get_case_studies_details(response: Response):
    """Return (title, summary, href, slug) tuples for each case-study card."""
    content = response.content.decode("utf-8")
    article_selector = "div.card"
    articles = Selector(text=content).css(article_selector).extract()
    result = []
    for article in articles:
        card = Selector(text=article)
        title = card.css("h3::text").extract()[0]
        summary = card.css("p.description::text").extract()[0]
        href = card.css("a::attr(href)").extract()[0]
        # The slug is the second-to-last path segment of the link.
        slug = href.split("/")[-2]
        assert slug, f"Couldn't extract case study slug from {article}"
        logging.debug("Got case study slug: %s", slug)
        result.append((title, summary, href, slug))
    assert result, f"No Case Study details extracted from {articles}"
    return result
def _get_p_info(self, **kwargs):
    """Parse the product-info <p> block into [{'p_name':…, 'p_value':…}].

    Fixes:
    - ``extract_first()`` returns None (not '') when nothing matches; the
      old ``== ''`` test let None through and re.sub then raised TypeError.
      Any falsy result now yields [].
    - items whose value contains ':' were truncated to the second segment
      by ``split(':')[1]``; ``partition`` keeps the full value, and items
      without any ':' no longer raise IndexError.
    """
    body = kwargs.get('body', '')
    tmp_p_info = Selector(
        text=body).css('div.showblock div p').extract_first()
    if not tmp_p_info:
        return []
    # Strip the <p> wrapper and the hidden-brand marker comment.
    tmp_p_info = re.compile('<p>|</p>').sub('', tmp_p_info)
    tmp_p_info = re.compile(r'<!--思源品牌,隐藏品牌-->').sub('', tmp_p_info)
    p_info = []
    for item in tmp_p_info.split('<br>'):
        if item == '':
            continue
        p_name, _, p_value = item.partition(':')
        p_info.append({'p_name': p_name, 'p_value': p_value})
    return p_info
def achivment(self, response):
    """Parse the personal achievement (个人功绩) table rows and print each."""
    change_data = {}
    person_name = response.meta['item']['name']
    name = '个人功绩'
    print(name, 'zz')
    change_data['person_name'] = person_name
    grade = Selector(response=response).xpath(
        '//tbody/tr/td[1]/text()').extract_first()
    grade = grade.split(' ')[0]
    if grade == '暂未查询到已登记入库信息':
        # Site reports no registered records for this person.
        print('暂时无数据')
        return
    # Column header -> (merit key, xpath for the cell's value).
    field_map = {
        '序号': ('serial_number', 'text()'),
        '项目编码': ('project_recode', 'text()'),
        '项目名称': ('project_name', './a/text()'),
        '项目属地': ('project_address', 'text()'),
        '项目类别': ('project_type', 'text()'),
        '建设单位': ('project_Company', 'text()'),
    }
    for row in Selector(response=response).xpath('//tbody/tr'):
        merit = {}
        for cell in row.xpath('./td'):
            header = cell.xpath('@data-header').extract_first()
            header = header.split()[0]
            if header in field_map:
                key, value_xpath = field_map[header]
                merit[key] = cell.xpath(value_xpath).extract_first()
        merit['token'] = self.token
        merit['corporate_name'] = self.corporate_name
        print(merit)
def parse_basic_info(self, response):
    """Yield one disciplinary-action item per table row (header skipped)."""
    rows = response.xpath('//table[@id="REPORTID_tab1"]//tr').extract()[1:]
    for row in rows:
        cells = Selector(text=row)
        company_name = cells.xpath('//tr/td[1]/text()').extract()[0]
        writ_no = cells.xpath('//tr/td[3]/text()').extract()[0]
        pub_type = cells.xpath('//tr/td[4]//a/text()').extract()[0]
        writ_date = cells.xpath('//tr/td[5]/text()').extract()[0]
        rel_bond = cells.xpath('//tr/td[6]/text()').extract()[0]
        # Keep only the text between the 给予 and 的决定 markers.
        start = '给予'
        end = '的决定'
        pub_type = pub_type.split(start)[1].split(end)[0]
        item = SszeLoaderItem(item=SszeZqxxJlcfResultItem(), response=response)
        item.add_value('batch_date', self.batch_date)
        item.add_value('company_name', company_name)
        item.add_value('writ_no', writ_no)
        item.add_value('pub_type', pub_type)
        item.add_value('writ_date', writ_date)
        item.add_value('rel_bond', rel_bond)
        item.add_value('table_name', 'spider.szse_zqxx_jlcf_result')
        yield item.load_item()