Python clear_lianxu_space 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: scrapySchool_England_Ben.clearSpace

메소드/함수: clear_lianxu_space

hotexamples.com에서의 예제들: 30

Python clear_lianxu_space - 30개의 예제를 찾았습니다. 오픈소스 프로젝트에서 추출한 scrapySchool_England_Ben.clearSpace.clear_lianxu_space의 실제 사용 예제 가운데 평점이 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

파일: Queen'sUniversityBelfast_U.py 프로젝트: histudent/python_spider

    def parse_data(self, response):
        """Collect the A-level and IB requirement text from a course page.

        Reads the text around the ``<b>`` labels "A level requirements" and
        "International Baccalaureate Diploma", passes the fragments through
        ``clear_lianxu_space`` and yields the populated item.

        Any exception raised during extraction is appended to a
        per-university file under ``scrapySchool_England_Ben/error/`` and
        echoed to stdout; in that case no item is yielded.

        Args:
            response: page response; ``response.meta['url']`` must be set.

        Yields:
            The item created by ``get_item(ScrapyschoolEnglandBenItem)``.
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Queen's University Belfast"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
            alevel = response.xpath(
                "//b[contains(text(),'A level requirements')]/..//text()"
            ).extract()
            # Only overwrite the field when the section is present.
            if alevel:
                item['alevel'] = clear_lianxu_space(alevel)
            print("item['alevel'] = ", item['alevel'])

            ib = response.xpath(
                "//b[contains(text(),'International Baccalaureate Diploma')]/..//text()"
            ).extract()
            item['ib'] = clear_lianxu_space(ib)
            print("item['ib'] = ", item['ib'])

            yield item
        except Exception as e:
            error_path = ("scrapySchool_England_Ben/error/" +
                          item['university'] + str(item['degree_type']) +
                          ".txt")
            # Append (not truncate) so earlier failures for this university
            # survive, and end each record with a newline so records do not
            # run together.
            with open(error_path, 'a', encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

예제 #2

파일 보기

파일: NewcastleUniversity_U.py 프로젝트: histudent/python_spider

    def parse_data(self, response):
        """Extract A-level and IB requirement text for a Newcastle course.

        Both sections are located through ``<h3 class="expandable-is-set">``
        headings. Failures are appended to a per-university error file and
        printed; no item is yielded in that case.

        Args:
            response: page response; ``response.meta['url']`` must be set.

        Yields:
            The item created by ``get_item(ScrapyschoolEnglandBenItem)``.
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Newcastle University"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
            alevel_divs = response.xpath(
                "//h3[@class='expandable-is-set'][contains(text(),'A Levels')]/../../div")
            print(alevel_divs)
            if alevel_divs:
                # Text is collected from the parent of the first matched div.
                alevel_text = alevel_divs[0].xpath("..//text()").extract()
                item['alevel'] = clear_lianxu_space(alevel_text)
            print("item['alevel'] = ", item['alevel'])

            ib_text = response.xpath(
                "//h3[@class='expandable-is-set'][contains(text(),'International Baccalaureate')]/../../div//text()").extract()
            item['ib'] = clear_lianxu_space(ib_text)
            print("item['ib'] = ", item['ib'])

            yield item
        except Exception as exc:
            error_path = "scrapySchool_England_Ben/error/" + item['university']+str(item['degree_type'])+".txt"
            with open(error_path, 'a', encoding="utf-8") as log:
                log.write(str(exc) + "\n" + response.url + "\n========================\n")
            print("异常：", str(exc))
            print("报错url：", response.url)

예제 #3

파일 보기

파일: TheUniversityOfManchester_U.py 프로젝트: histudent/python_spider

    def parse_data(self, response):
        """Extract A-level and IB requirement text for a Manchester course.

        Each section is the first sibling element following the ``<h3>``
        with id ``a-level`` / ``international-baccalaureate``. Failures are
        appended to a per-university error file and printed; no item is
        yielded in that case.

        Args:
            response: page response; ``response.meta['url']`` must be set.

        Yields:
            The item created by ``get_item(ScrapyschoolEnglandBenItem)``.
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "The University of Manchester"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])

        try:
            alevel = response.xpath(
                "//h3[@id='a-level']/following-sibling::*[1]//text()").extract()
            item['alevel'] = clear_lianxu_space(alevel)
            print("item['alevel'] = ", item['alevel'])

            ib = response.xpath(
                "//h3[@id='international-baccalaureate']/following-sibling::*[1]//text()").extract()
            item['ib'] = clear_lianxu_space(ib)
            print("item['ib'] = ", item['ib'])

            yield item
        except Exception as e:
            error_path = ("scrapySchool_England_Ben/error/" +
                          item['university'] + str(item['degree_type']) +
                          ".txt")
            with open(error_path, 'a', encoding="utf-8") as f:
                # The file is opened in append mode, so terminate the record
                # with a newline to keep successive records apart.
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

예제 #4

파일 보기

    def parse_data(self, response):
        """Extract A-level and IB requirement text for a King's College course.

        The first and eighth rows of the "further-information" table are
        used for A-level and IB respectively. When the A-level row contains
        "All candidates", the second table row and the block around the bold
        "International Baccalaureate" label are used instead.

        Failures are appended to a per-university error file and printed; no
        item is yielded in that case.

        Args:
            response: page response; ``response.meta['url']`` must be set.

        Yields:
            The item created by ``get_item(ScrapyschoolEnglandBenItem)``.
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "King's College London"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
            alevel = response.xpath(
                "//div[@class='further-information']//table//tbody/tr[1]//text()"
            ).extract()
            if alevel:
                item['alevel'] = clear_lianxu_space(alevel)

            print("item['alevel'] = ", item['alevel'])

            ib = response.xpath(
                "//div[@class='further-information']//table//tbody/tr[8]//text()"
            ).extract()
            if ib:
                item['ib'] = clear_lianxu_space(ib)
            print("item['ib'] = ", item['ib'])

            # The field is only assigned above when the table row exists, so
            # it may still hold a non-string placeholder here (presumably
            # None from get_item — confirm). Guard the substring test so such
            # pages still yield an item instead of raising TypeError.
            alevel_text = item['alevel']
            if alevel_text and "All candidates" in alevel_text:
                alevel = response.xpath(
                    "//div[@class='further-information']//table//tbody/tr[2]//text()"
                ).extract()
                item['alevel'] = clear_lianxu_space(alevel)

                ib = response.xpath(
                    "//b[contains(text(),'International Baccalaureate')]/../../..//text()"
                ).extract()
                item['ib'] = clear_lianxu_space(ib)

            yield item
        except Exception as e:
            error_path = ("scrapySchool_England_Ben/error/" +
                          item['university'] + str(item['degree_type']) +
                          ".txt")
            with open(error_path, 'a', encoding="utf-8") as f:
                # Append mode: end the record with a newline so successive
                # records stay on separate lines.
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

예제 #5

파일 보기

    def parse_data(self, response):
        """Extract tuition fee, A-level and IB text for a Surrey course.

        The fee comes from the second-to-last cell of the first row of the
        ``#fees`` table and is converted by ``getTuition_fee``. A fee equal
        to 0 is stored as ``None``; otherwise the currency prefix "£" is
        recorded. Failures are appended to a per-university error file and
        printed; no item is yielded in that case.

        Args:
            response: page response; ``response.meta['url']`` must be set.

        Yields:
            The item created by ``get_item(ScrapyschoolEnglandBenItem)``.
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Surrey"
        item['url'] = response.meta['url']
        print("===============================")
        print(response.url)
        print(response.meta['url'])

        try:
            fee_cells = response.xpath(
                "//div[@id='fees']//tbody//tr[1]/td[last()-1]//text()"
            ).extract()
            if fee_cells:
                fee_text = ''.join(fee_cells)
                item['tuition_fee'] = getTuition_fee(fee_text)

            if item['tuition_fee'] == 0:
                item['tuition_fee'] = None
            else:
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee'] = ", item['tuition_fee'])
            print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])

            alevel_parts = response.xpath(
                "//h3[contains(text(),'A-level')]/..//text()").extract()
            item['alevel'] = clear_lianxu_space(alevel_parts)

            ib_parts = response.xpath(
                "//h3[contains(text(),'International Baccalaureate')]/..//text()"
            ).extract()
            item['ib'] = clear_lianxu_space(ib_parts)

            yield item
        except Exception as exc:
            error_path = ("scrapySchool_England_Ben/error/" +
                          item['university'] + str(item['degree_type']) +
                          ".txt")
            record = (str(exc) + "\n" + response.url +
                      "\n========================\n")
            with open(error_path, 'a+', encoding="utf-8") as log:
                log.write(record)
            print("异常：", str(exc))
            print("报错url：", response.url)

예제 #6

파일 보기

    def parse_data(self, response):
        """Extract the UCAS-points text for a Bolton course as its A-level field.

        Failures are appended to a per-university error file and printed; no
        item is yielded in that case.

        Args:
            response: page response; ``response.meta['url']`` must be set.

        Yields:
            The item created by ``get_item(ScrapyschoolEnglandBenItem)``.
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Bolton"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
            points_text = response.xpath(
                "//li[@class='iconim points']//b[contains(text(),'UCAS points:')]/../span//text()"
            ).extract()
            item['alevel'] = clear_lianxu_space(points_text)
            print("item['alevel']: ", item['alevel'])

            yield item

        except Exception as exc:
            error_path = ("scrapySchool_England_Ben/error/" +
                          item['university'] + str(item['degree_type']) +
                          ".txt")
            record = (str(exc) + "\n" + response.url +
                      "\n========================\n")
            with open(error_path, 'a', encoding="utf-8") as log:
                log.write(record)
            print("异常：", str(exc))
            print("报错url：", response.url)

예제 #7

파일 보기

    def parse_data(self, response):
        """Extract the A-level / UCAS tariff text for a Solent course.

        A marker line is printed when the cleaned text is empty. Failures
        are appended to a per-university error file and printed; no item is
        yielded in that case.

        Args:
            response: page response; ``response.meta['url']`` must be set.

        Yields:
            The item created by ``get_item(ScrapyschoolEnglandBenItem)``.
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Southampton Solent University"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
            item['alevel'] = None
            alevel_parts = response.xpath(
                "//h4[contains(text(),'UCAS tariff points from A-levels')]//text()|"
                "//li[contains(text(), 'A-level')]//text()|//li[contains(text(), 'A Level')]//text()"
            ).extract()
            item['alevel'] = clear_lianxu_space(alevel_parts)
            print("item['alevel']: ", item['alevel'])
            # Debug marker for pages where nothing was found.
            if item['alevel'] == "":
                print("****111")

            yield item
        except Exception as exc:
            error_path = ("scrapySchool_England_Ben/error/" +
                          item['university'] + str(item['degree_type']) +
                          ".txt")
            record = (str(exc) + "\n" + response.url +
                      "\n========================\n")
            with open(error_path, 'a', encoding="utf-8") as log:
                log.write(record)
            print("异常：", str(exc))
            print("报错url：", response.url)

예제 #8

파일 보기

파일: UniversityOfSurrey_U.py 프로젝트: histudent/python_spider

    def parse_data(self, response):
        """Extract A-level and IB requirement text for a Surrey course.

        Each section is the text under the parent of the matching ``<h3>``
        heading. Failures are appended to a per-university error file and
        printed; no item is yielded in that case.

        Args:
            response: page response; ``response.meta['url']`` must be set.

        Yields:
            The item created by ``get_item(ScrapyschoolEnglandBenItem)``.
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Surrey"
        item['url'] = response.meta['url']
        print("===============================")
        print(response.url)
        print(response.meta['url'])

        try:
            alevel_parts = response.xpath(
                "//h3[contains(text(),'A-level')]/..//text()").extract()
            item['alevel'] = clear_lianxu_space(alevel_parts)
            print("item['alevel'] = ", item['alevel'])

            ib_parts = response.xpath(
                "//h3[contains(text(),'International Baccalaureate')]/..//text()"
            ).extract()
            item['ib'] = clear_lianxu_space(ib_parts)
            print("item['ib'] = ", item['ib'])

            yield item
        except Exception as exc:
            error_path = ("scrapySchool_England_Ben/error/" +
                          item['university'] + str(item['degree_type']) +
                          ".txt")
            record = (str(exc) + "\n" + response.url +
                      "\n========================\n")
            with open(error_path, 'a+', encoding="utf-8") as log:
                log.write(record)
            print("异常：", str(exc))
            print("报错url：", response.url)

예제 #9

파일 보기

    def parse_rntry_requirements(self, parse_entry_url):
        """Download an entry-requirements page and extract its key texts.

        The page is fetched synchronously with ``requests.get`` using
        ``self.headers`` and parsed with ``etree.HTML``.

        Args:
            parse_entry_url: URL of the entry-requirements page.

        Returns:
            dict with four string values:
              * 'rntry_requirements' - cleaned article text with every
                ``//<!-- ... //-->`` fragment removed;
              * 'ielts_desc' - cleaned text around the IELTS mentions;
              * 'alevel' - last matched A-level "Grades" cell, or "";
              * 'ib' - space-joined IB cells, or "".
        """
        # NOTE(review): no timeout is passed, so a stalled server blocks the
        # caller indefinitely - consider adding one.
        data = requests.get(parse_entry_url, headers=self.headers)
        response = etree.HTML(data.text)

        rntry_requirements = response.xpath(
            "//div[@class='col col1 article-content']/div//text()")
        rntry_requirements_str = clear_lianxu_space(rntry_requirements)
        # Remove every "//<!-- ... //-->" fragment. Joining all findall()
        # matches and replacing the joined text only worked when there was
        # at most one match.
        rntry_requirements_str = re.sub(r"//<!--.*//-->", "",
                                        rntry_requirements_str)

        ielts_desc = response.xpath(
            "//h2[contains(text(),'IELTS')]/..//text()|//strong[contains(text(),'IELTS')]/../..//text()|"
            "//td[contains(text(),'IELTS')]/following-sibling::td[1]//text()")
        ielts_desc_str = clear_lianxu_space(ielts_desc)

        alevel = response.xpath(
            "//*[contains(text(),'A Level')]/../../following-sibling::*//*[contains(text(), 'Grades')]/following-sibling::td//text()|"
            "//*[contains(text(),'A Level')]/../following-sibling::*//*[contains(text(), 'Grades')]/following-sibling::td//text()"
        )
        # Only the last matched text node is kept.
        alevel_str = alevel[-1] if alevel else ""

        ib = response.xpath(
            "//*[contains(text(),'International Baccalaureate')]/../../following-sibling::*//*[contains(text(), 'Grades')]/following-sibling::td//text()|"
            "//*[contains(text(),'International Baccalaureate')]/../following-sibling::*//*[contains(text(), 'core')]/following-sibling::td//text()"
        )
        ib_str = ' '.join(ib).strip()

        return {
            'rntry_requirements': rntry_requirements_str,
            'ielts_desc': ielts_desc_str,
            'alevel': alevel_str,
            'ib': ib_str,
        }

예제 #10

파일 보기

    def parse_data(self, response):
        """Extract A-level and IB requirement text for an Aston course.

        The IB lookup first tries ``<div>`` labels in several spellings and,
        when none match, falls back to the element following a ``<strong>``
        label. Failures are appended to a per-university error file and
        printed; no item is yielded in that case.

        Args:
            response: page response; ``response.meta['url']`` must be set.

        Yields:
            The item created by ``get_item(ScrapyschoolEnglandBenItem)``.
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Aston University"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
            alevel_parts = response.xpath(
                "//div[contains(text(),'A level')]/..//text()|//div[contains(text(),'B – B')]/..//text()|"
                "//div[@class='course-details__dt'][contains(text(),'A Level')]/..//text()|"
                "//div[contains(text(),'A-levels')]/..//text()").extract()
            item['alevel'] = clear_lianxu_space(alevel_parts)
            print("item['alevel']: ", item['alevel'])

            ib_parts = response.xpath(
                "//div[contains(text(),'International baacalaureate diploma')]/..//text()|"
                "//div[contains(text(),'International Baccalaureate')]/..//text()|"
                "//div[contains(text(),'International baccalaureate')]/..//text()"
            ).extract()
            if not ib_parts:
                # Fallback layout: label in <strong>, value in the next sibling.
                ib_parts = response.xpath(
                    "//strong[contains(text(),'International Baccalaureate')]/../following-sibling::*[1]//text()"
                ).extract()
            item['ib'] = clear_lianxu_space(ib_parts)
            print("item['ib']: ", item['ib'])

            yield item
        except Exception as exc:
            error_path = ("scrapySchool_England_Ben/error/" +
                          item['university'] + str(item['degree_type']) +
                          ".txt")
            record = (str(exc) + "\n" + response.url +
                      "\n========================\n")
            with open(error_path, 'a', encoding="utf-8") as log:
                log.write(record)
            print("异常：", str(exc))
            print("报错url：", response.url)

예제 #11

파일 보기

파일: UniversityOfSalford_U.py 프로젝트: histudent/python_spider

    def parse_data(self, response):
        """Extract A-level and IB requirement text for a Salford course.

        The A-level value is the ``<td>`` following an "A level" label; when
        that is absent the "UCAS tariff points" cell is used instead.
        Failures are appended to a per-university error file and printed; no
        item is yielded in that case.

        Args:
            response: page response; ``response.meta['url']`` must be set.

        Yields:
            The item created by ``get_item(ScrapyschoolEnglandBenItem)``.
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Salford"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
            alevel_parts = response.xpath(
                "//*[contains(text(),'A level')]/following-sibling::td//text()"
            ).extract()
            if not alevel_parts:
                alevel_parts = response.xpath(
                    "//*[contains(text(),'UCAS tariff points')]/following-sibling::td//text()"
                ).extract()
            item['alevel'] = clear_lianxu_space(alevel_parts)
            print("item['alevel']: ", item['alevel'])

            ib_parts = response.xpath(
                "//*[contains(text(),'International Baccalaureate')]/following-sibling::td//text()"
            ).extract()
            item['ib'] = clear_lianxu_space(ib_parts)
            print("item['ib']: ", item['ib'])

            yield item
        except Exception as exc:
            error_path = ("scrapySchool_England_Ben/error/" +
                          item['university'] + str(item['degree_type']) +
                          ".txt")
            record = (str(exc) + "\n" + response.url +
                      "\n========================\n")
            with open(error_path, 'a', encoding="utf-8") as log:
                log.write(record)
            print("异常：", str(exc))
            print("报错url：", response.url)

예제 #12

파일 보기

파일: CanterburyChristChurchUniversity_Alevel_Ib.py 프로젝트: histudent/python_spider

    def parse_data(self, response):
        """Extract the A-level / UCAS text for a Canterbury Christ Church course.

        Candidate text is taken from elements after the "Entry requirements"
        heading. The cleaned text then has the phrase "More entry
        requirement details" and every "." removed before being stripped.
        Failures are appended to a per-university error file and printed; no
        item is yielded in that case.

        Args:
            response: page response; ``response.meta['url']`` must be set.

        Yields:
            The item created by ``get_item(ScrapyschoolEnglandBenItem)``.
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Canterbury Christ Church University"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
            alevel_parts = response.xpath(
                "//h3[contains(text(),'Entry requirements')]/following-sibling::*//*[contains(text(), 'A level')]//text()|"
                "//h3[contains(text(),'Entry requirements')]/following-sibling::*//*[contains(text(), ' UCAS Tariff points')]//text()|"
                "//h3[contains(text(),'Entry requirements')]/following-sibling::*//*[contains(text(), 'UCAS points')]//text()|"
                "//h3[contains(text(),'Entry requirements')]/following-sibling::*//*[contains(text(), 'A typical offer')]//text()"
            ).extract()
            alevel_text = clear_lianxu_space(alevel_parts)
            alevel_text = alevel_text.replace(
                "More entry requirement details", "")
            alevel_text = alevel_text.replace(".", "")
            item['alevel'] = alevel_text.strip()
            print("item['alevel']: ", item['alevel'])

            yield item

        except Exception as exc:
            error_path = ("scrapySchool_England_Ben/error/" +
                          item['university'] + str(item['degree_type']) +
                          ".txt")
            record = (str(exc) + "\n" + response.url +
                      "\n========================\n")
            with open(error_path, 'a', encoding="utf-8") as log:
                log.write(record)
            print("异常：", str(exc))
            print("报错url：", response.url)

예제 #13

파일 보기

파일: BathSpaUniversity_U.py 프로젝트: histudent/python_spider

    def parse_data(self, response):
        """Extract A-level and IB "typical offer" text for a Bath Spa course.

        Both values are looked up relative to the ``<h3 class="title">``
        heading containing "Typical offers". The A-level text goes through
        ``clear_lianxu_space``; the IB fragments are space-joined and
        stripped. A marker line is printed when either value is empty.

        Failures are appended to a per-university error file and printed; no
        item is yielded in that case.

        Args:
            response: page response; ``response.meta['url']`` must be set.

        Yields:
            The item created by ``get_item(ScrapyschoolEnglandBenItem)``.
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Bath Spa University"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
            item['alevel'] = None
            # Every alternative must end with "|". Without it, adjacent
            # string literals fuse into a single path
            # ("...//text()//h3[...]") that can never match, silently
            # dropping two of the alternatives.
            alevel = response.xpath(
                "//h3[@class='title'][contains(text(),'Typical offers')]/following-sibling::div[1]//span[contains(text(),'A Level')]/..//text()|"
                "//h3[@class='title'][contains(text(),'Typical offers')]/../following-sibling::div[1]//li[contains(text(),'A Level')]//text()|"
                "//h3[@class='title'][contains(text(),'Typical offers')]/following-sibling::div[1]//span[contains(text(),'A-level')]/..//text()|"
                "//h3[@class='title'][contains(text(),'Typical offers')]/following-sibling::div[1]//span[contains(text(),'A-Level')]/..//text()|"
                "//h3[@class='title'][contains(text(),'Typical offers')]/following-sibling::div[1]//li[contains(text(),'A-level')]//text()|"
                "//h3[@class='title'][contains(text(),'Typical offers')]/following-sibling::div[1]//li[contains(text(),'A Level')]//text()"
            ).extract()
            item['alevel'] = clear_lianxu_space(alevel)
            print("item['alevel']: ", item['alevel'])
            # Debug marker for pages where nothing was found.
            if item['alevel'] == "":
                print("****alevel")

            item['ib'] = None
            ib = response.xpath(
                "//h3[@class='title'][contains(text(),'Typical offers')]/following-sibling::div[1]//span[contains(text(),'International Baccalaureate')]/..//text()|"
                "//h3[@class='title'][contains(text(),'Typical offers')]/following-sibling::div[1]//li[contains(text(),'International Baccalaureate')]//text()"
            ).extract()
            item['ib'] = ' '.join(ib).strip()
            print("item['ib']: ", item['ib'])
            if item['ib'] == "":
                print("****ib")

            yield item

        except Exception as e:
            error_path = ("scrapySchool_England_Ben/error/" +
                          item['university'] + str(item['degree_type']) +
                          ".txt")
            with open(error_path, 'a', encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

예제 #14

파일 보기

    def parse_data(self, response):
        """Extract the entry-requirement text for a Cardiff Met course.

        The first lookup targets the paragraph / first list item that
        follows a "Degree" label inside the "Entry Requirement" section;
        when that yields nothing, the section's first paragraph and first
        list item are used. Failures are appended to a per-university error
        file and printed; no item is yielded in that case.

        Args:
            response: page response; ``response.meta['url']`` must be set.

        Yields:
            The item created by ``get_item(ScrapyschoolEnglandBenItem)``.
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Cardiff Metropolitan University"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
            item['alevel'] = None
            alevel_parts = response.xpath(
                "//h3[contains(text(), 'Entry Requirement')]/following-sibling::div[1]//*[contains(text(),'Degree')]/../following-sibling::p[1]//text()|"
                "//h3[contains(text(), 'Entry Requirement')]/following-sibling::div[1]//*[contains(text(),'Degree')]/..//following-sibling::ul[1]/li[1]//text()"
            ).extract()
            if not alevel_parts:
                alevel_parts = response.xpath(
                    "//h3[contains(text(), 'Entry Requirement')]/following-sibling::div[1]/p[1]//text()|"
                    "//h3[contains(text(), 'Entry Requirement')]/following-sibling::div[1]//ul[1]/li[1]//text()"
                ).extract()
            item['alevel'] = clear_lianxu_space(alevel_parts)
            print("item['alevel']: ", item['alevel'])

            yield item
        except Exception as exc:
            error_path = ("scrapySchool_England_Ben/error/" +
                          item['university'] + str(item['degree_type']) +
                          ".txt")
            record = (str(exc) + "\n" + response.url +
                      "\n========================\n")
            with open(error_path, 'a', encoding="utf-8") as log:
                log.write(record)
            print("异常：", str(exc))
            print("报错url：", response.url)

예제 #15

파일 보기

파일: UniversityOfPortsmouth_U.py 프로젝트: histudent/python_spider

    def parse_data(self, response):
        """Extract the A-level requirement text for a Portsmouth course.

        Direct text of elements mentioning "A level" is preferred; when none
        exist, a regex pulls the context around "A levels" out of the
        cleaned "Entry Requirements" block. The fragments are passed to
        ``clear_space`` (presumably cleaning the list in place - confirm),
        and everything after the first fragment is stored, falling back to
        all fragments when that would be empty.

        Failures are appended to a per-university error file and printed; no
        item is yielded in that case.

        Args:
            response: page response; ``response.meta['url']`` must be set.

        Yields:
            The item created by ``get_item(ScrapyschoolEnglandBenItem)``.
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Portsmouth"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
            requirements_parts = response.xpath(
                "//div[contains(text(),'Entry Requirements')]/../../..//div[contains(text(),'2019 start')]/../../../..//text()"
            ).extract()
            requirements_text = clear_lianxu_space(requirements_parts)

            alevel_parts = response.xpath(
                "//*[contains(text(),'A level')]/text()").extract()
            print("====", alevel_parts)
            if not alevel_parts:
                alevel_parts = re.findall(r".{1,45}A\slevels.{1,85}",
                                          requirements_text)
            clear_space(alevel_parts)
            if alevel_parts:
                item['alevel'] = ''.join(alevel_parts[1:]).strip()
                if item['alevel'] == "":
                    # Nothing after the first fragment: use all fragments.
                    item['alevel'] = ''.join(alevel_parts).strip()
            print("item['alevel']: ", item['alevel'])

            yield item
        except Exception as exc:
            error_path = ("scrapySchool_England_Ben/error/" +
                          item['university'] + str(item['degree_type']) +
                          ".txt")
            record = (str(exc) + "\n" + response.url +
                      "\n========================\n")
            with open(error_path, 'a', encoding="utf-8") as log:
                log.write(record)
            print("异常：", str(exc))
            print("报错url：", response.url)

예제 #16

파일 보기

파일: UniversityofWalesTrinitySaintDavid_Alevel_Ib.py 프로젝트: histudent/python_spider

    def parse_data(self, response):
        """Extract the A-level / UCAS text for a UWTSD course.

        The ``#collapseEntryCriteria`` panel is searched first; when it
        yields nothing, page-wide "UCAS Points"/"UCAS points" elements and
        the ``<ul type="disc">`` list (with its preceding sibling) are used.
        Failures are appended to a per-university error file and printed; no
        item is yielded in that case.

        Args:
            response: page response; ``response.meta['url']`` must be set.

        Yields:
            The item created by ``get_item(ScrapyschoolEnglandBenItem)``.
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Wales Trinity Saint David"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
            alevel_parts = response.xpath(
                "//div[@id='collapseEntryCriteria']//*[contains(text(),'UCAS points')]//text()|"
                "//div[@id='collapseEntryCriteria']//*[contains(text(),'A Level')]//text()|"
                "//div[@id='collapseEntryCriteria']//*[contains(text(),'A level')]//text()"
            ).extract()
            if not alevel_parts:
                alevel_parts = response.xpath(
                    "//*[contains(text(),'UCAS Points')]//text()|"
                    "//*[contains(text(),'UCAS points')]//text()|"
                    "//ul[@type='disc']/preceding-sibling::*[1]//text()|//ul[@type='disc']//text()"
                ).extract()
            item['alevel'] = clear_lianxu_space(alevel_parts)
            print("item['alevel']: ", item['alevel'])

            yield item

        except Exception as exc:
            error_path = ("scrapySchool_England_Ben/error/" +
                          item['university'] + str(item['degree_type']) +
                          ".txt")
            record = (str(exc) + "\n" + response.url +
                      "\n========================\n")
            with open(error_path, 'a', encoding="utf-8") as log:
                log.write(record)
            print("异常：", str(exc))
            print("报错url：", response.url)

예제 #17

파일 보기

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.bathspa.ac.uk/"
        item['university'] = "Bath Spa University"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            item['location'] = 'Bath'
            # 专业、学位类型//div[@class='masthead-inner']/div/div[@class='masthead-content']/h1
            programme = response.xpath(
                "//div[@class='masthead-inner']/div/div[@class='masthead-content']/h1//text()"
            ).extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            degree_type = response.xpath(
                "//div[@class='masthead-inner']/div/div[@class='masthead-content']/p[1]//text()"
            ).extract()
            item['degree_name'] = ''.join(degree_type).replace("(Hons)",
                                                               "").strip()
            print("item['degree_name']: ", item['degree_name'])

            # //dt[contains(text(),'School')]/following-sibling::dd[1]
            department = response.xpath(
                "//dt[contains(text(),'School')]/following-sibling::dd[1]//text()"
            ).extract()
            item['department'] = ''.join(department).strip()
            # print("item['department']: ", item['department'])

            location = response.xpath(
                "//dt[contains(text(),'Campus or location')]/following-sibling::dd[1]//text()"
            ).extract()
            item['location'] = ''.join(location).strip()
            # print("item['location']: ", item['location'])

            modules = response.xpath(
                "//h3[contains(text(),'Course structure')]/..|//h3[contains(text(),'Course modules')]/..|//h2[contains(text(),'Course modules')]/.."
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            career = response.xpath(
                "//h3[contains(text(),'Career')]/..").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            feeContent = response.xpath(
                "//h3[contains(text(),'International students full time')]/../div/table[1]//td[contains(text(), 'Year')]/following-sibling::td//text()"
            ).extract()
            clear_space(feeContent)
            # print(feeContent)
            if len(feeContent) > 0:
                item['tuition_fee'] = int(feeContent[0].replace(
                    "£", "").replace(",", "").strip())
            # print("item['tuition_fee']: ", item['tuition_fee'])

            alevel = response.xpath(
                "//span[contains(text(),'A Level')]/..//text()|//li[contains(text(),'A Level')]//text()"
            ).extract()
            item['alevel'] = ''.join(alevel).strip()
            # print("item['alevel']: ", item['alevel'])
            # if item['alevel'] == "":
            #     print("****alevel")

            ib = response.xpath(
                "//span[contains(text(),'International Baccalaureate')]/..//text()|"
                "//li[contains(text(),'International Baccalaureate')]//text()"
            ).extract()
            item['ib'] = ''.join(ib).strip()
            # print("item['ib']: ", item['ib'])
            # if item['ib'] == "":
            #     print("****ib")

            # //div[@class='content']/div[@class='collapsible-content highlighted']/div[2]/div[2]

            ieltsList = response.xpath(
                "//*[contains(text(),'IELTS')]//text()").extract()
            item['ielts_desc'] = ''.join(ieltsList).strip()
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            interview_desc_en = response.xpath(
                "//h3[contains(text(),'Interview and portfolio guidance')]/..|"
                "//h3[contains(text(),'Portfolio and interview')]/..").extract(
                )
            item['interview_desc_en'] = remove_class(
                clear_lianxu_space(interview_desc_en))
            # print("item['interview_desc_en']: ", item['interview_desc_en'])
            # if item['interview_desc_en'] == "":
            #     print("****interview_desc_en")

            portfolio_desc_en = response.xpath(
                "//h3[contains(text(),'Interview and portfolio guidance')]/..|"
                "//h3[contains(text(),'Portfolio')]/..").extract()
            item['portfolio_desc_en'] = remove_class(
                clear_lianxu_space(portfolio_desc_en))
            # print("item['portfolio_desc_en']: ", item['portfolio_desc_en'])
            # if item['portfolio_desc_en'] == "":
            #     print("****portfolio_desc_en")

            # https://www.bathspa.ac.uk/international/country-advice/china/

            item[
                'require_chinese_en'] = "<p><strong>Undergraduate</strong></p><ul><li>Senior Secondary School Graduation Certificate with a grade of 70% and a Foundation Certification from a recognised institution.</li></ul><p><strong>Undergraduate - Year 2 or 3 entry</strong></p><ul><li>Students with a Dazhuan Certificate will be considered for Year 3 entry on an individual basis.&nbsp;</li></ul>"

            # https://www.bathspa.ac.uk/applicants/how-to-apply/postgraduate/
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<div class="intro-text">
	<p class="intro">We’re delighted you’re applying to study with us. The process is different based on your location and mode of study. Here’s what you need to do.</p>
</div><div class="rich-text" >
  <div data-hash-anchor='<a id="d.en.1281"></a>'></div>
    <div>
        <h2>UCAS applicants</h2>
<p>If you fit the following criteria, you’ll need to apply through the Universities and Colleges Admissions Service (UCAS):</p>
<ul>
<li>You’re applying directly out of sixth form or college;</li>
<li>You want to study full-time;</li>
<li>You don’t already hold an undergraduate qualification and are from the UK, EU or Channel Islands.</li>
</ul>
<p><strong>The official UCAS deadline for 2018/19 applications to any course: 15 January 2018.</strong></p>
<p>You’ll need some information from your course's webpage, including Bath Spa University’s institution code: BASPA B20.</p>
<p>Read more about <a href="/applicants/how-to-apply/undergraduate-and-foundation/how-to-apply-through-ucas/">how to </a><a href="/applicants/how-to-apply/undergraduate-and-foundation/how-to-apply-through-ucas/">apply through UCAS</a> or just get started. You’ll need to register or login to the UCAS site. &nbsp;</p>
<p><a href="https://www.ucas.com/ucas/undergraduate/ucas-undergraduate-apply-and-track">Apply via UCAS</a></p>
<h2>International applicants</h2>
<p>You can apply for one of our undergraduate courses online from the course’s webpage.&nbsp;You’ll be asked to create an online account.</p>
<p>Don’t have time to complete your whole application? Don’t worry, you can save your application and come back to it at anytime.</p>
<p>Alternatively, you can also <a href="https://www.ucas.com/ucas/undergraduate/ucas-undergraduate-apply-and-track">apply via UCAS</a>.</p>
<p>Entry requirements are listed on the course pages. As part of the process you will be required to provide evidence to support your application.&nbsp;Please see our <a href="/international/">international</a> webpages for more information for international students, including entry requirements and visa advice specific to your country.</p>
<p><a href="/courses/">Search for your course</a></p>
<h2>Applying for part-time study</h2>
<p>If you’d like to study part-time, you’ll need to apply online directly with us, rather than through UCAS. &nbsp;</p>
<p><strong>Click the 'apply now' button on the webpage for the course you’d like to study.</strong></p>
<h2>Already hold an undergraduate degree?</h2>
<p>If you already have a degree or higher qualification than that for which you are applying, your fee requirements may be different, due to the way government University funding is distributed. Please check the Equivalent or Lower Qualification (ELQ) policy&nbsp;for more details.<br><br>This also applies to students who progress to the third year of study, following completion of a Foundation Degree. Please note that Foundation Degrees are currently exempt from higher fees.</p>
    </div>
</div>"""
                ]))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            ucascode = response.xpath(
                "//dd[contains(text(),'Course Code:')]//text()").extract()
            clear_space(ucascode)
            print("ucascode: ", ucascode)
            item['ucascode'] = ''.join(ucascode).replace("Course Code:",
                                                         "").strip()
            print("len: ", len(ucascode))
            print("item['ucascode'] = ", item['ucascode'])

            # duration
            durationMode = response.xpath(
                "//dt[contains(text(),'Course length')]/following-sibling::dd[1]//text()"
            ).extract()
            clear_space(durationMode)
            print("durationMode: ", durationMode)
            durationMode = ''.join(durationMode)
            duration_list = getIntDuration(durationMode.strip())
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            print("item['duration']: ", item['duration'])
            print("item['duration_per']: ", item['duration_per'])
            item['other'] = durationMode
            print("item['other']: ", item['other'])

            if "or" in item['ucascode']:
                ucascode_list1 = item['ucascode'].split("or")
                print("ucascode_list1", ucascode_list1)

                # 拆分duration
                if ", or" in item['other']:
                    duration_list1 = item['other'].split(", or")
                else:
                    duration_list1 = [item['other'], item['other']]
                print("duration_list1: ", duration_list1)
                for u in range(len(ucascode_list1)):
                    item['ucascode'] = ucascode_list1[u].strip()
                    duration_list = getIntDuration(duration_list1[u].strip())
                    if len(duration_list) == 2:
                        item['duration'] = duration_list[0]
                        item['duration_per'] = duration_list[-1]
                    print("item['duration']: ", item['duration'])
                    print("item['duration_per']: ", item['duration_per'])
                    # 分为两种情况，第一种正常采集，第二种为带实习的专业
                    if u == 0:
                        overview = response.xpath(
                            """//h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'Overview')]/.."""
                        ).extract()
                        if len(overview) == 0:
                            overview = response.xpath(
                                """//h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'overview')]/.."""
                            ).extract()
                        item['overview_en'] = remove_class(
                            clear_lianxu_space(overview))
                        # print("item['overview_en']1: ", item['overview_en'])

                        assessment_en = response.xpath(
                            """//h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'How will I be assessed?')]/..|
                            //h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'How will I be taught?')]/..|
                            //h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'Assessment')]/.."""
                        ).extract()
                        item['assessment_en'] = remove_class(
                            clear_lianxu_space(assessment_en))
                        # print("item['assessment_en']1: ", item['assessment_en'])
                    elif u == 1:
                        overview = response.xpath(
                            """//h2[contains(text(),"Professional placement year")]/following-sibling::div//h3[contains(text(),'Overview')]/.."""
                        ).extract()
                        if len(overview) == 0:
                            overview = response.xpath(
                                """//h2[contains(text(),"Professional placement year")]/following-sibling::div//h3[contains(text(),'overview')]/.."""
                            ).extract()
                            if len(overview) == 0:
                                overview = response.xpath(
                                    """//h3[contains(text(),'Overview')]/..|//h3[contains(text(),'overview')]/.."""
                                ).extract()
                        item['overview_en'] = remove_class(
                            clear_lianxu_space(overview))
                        # print("item['overview_en']2: ", item['overview_en'])

                        assessment_en = response.xpath(
                            "//h2[contains(text(),'Professional placement year')]/following-sibling::div//h3[contains(text(),'How will I be assessed?')]/..|"
                            "//h2[contains(text(),'Professional placement year')]/following-sibling::div//h3[contains(text(),'How will I be taught?')]/..|"
                            "//h2[contains(text(),'Professional placement year')]/following-sibling::div//h3[contains(text(),'Assessment')]/.."
                        ).extract()
                        if len(assessment_en) == 0:
                            assessment_en = response.xpath(
                                "//h3[contains(text(),'How will I be assessed?')]/..|"
                                "//h3[contains(text(),'How will I be taught?')]/..|"
                                "//h3[contains(text(),'Assessment')]/.."
                            ).extract()
                        item['assessment_en'] = remove_class(
                            clear_lianxu_space(assessment_en))
                        # print("item['assessment_en']2: ", item['assessment_en'])
                    yield item
            else:
                overview = response.xpath(
                    """//h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'Overview')]/.."""
                ).extract()
                if len(overview) == 0:
                    overview = response.xpath(
                        """//h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'overview')]/.."""
                    ).extract()
                item['overview_en'] = remove_class(
                    clear_lianxu_space(overview))
                # print("item['overview_en']1: ", item['overview_en'])

                assessment_en = response.xpath(
                    """//h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'How will I be assessed?')]/..|
                    //h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'How will I be taught?')]/..|
                    //h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'Assessment')]/.."""
                ).extract()
                item['assessment_en'] = remove_class(
                    clear_lianxu_space(assessment_en))
                # print("item['assessment_en']1: ", item['assessment_en'])
                yield item

        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)


#            department_dict = {"arts management":"Bath Business School","accounting and finance":"Bath Business School",
# "business and management":"Bath Business School",
# "business and management (accounting)":"Bath Business School",
# "business and management (entrepreneurship)":"Bath Business School",
# "business and management (international business)":"Bath Business School",
# "business and management (marketing)":"Bath Business School",
# "curatorial practice":"Bath School of Art and Design",
# "design (ceramics)":"Bath School of Art and Design",
# "design (fashion and textiles)":"Bath School of Art and Design",
# "fine art":"Bath School of Art and Design",
# "visual communication":"Bath School of Art and Design",
# "children's publishing":"College of Liberal Arts",
# "classical acting":"College of Liberal Arts",
# "composition":"College of Liberal Arts",
# "creative producing":"College of Liberal Arts",
# "creative writing":"College of Liberal Arts",
# "creative writing phd":"College of Liberal Arts",
# "crime and gothic fictions":"College of Liberal Arts",
# "dance":"College of Liberal Arts",
# "directing":"College of Liberal Arts",
# "directing circus":"College of Liberal Arts",
# "environmental humanities":"College of Liberal Arts",
# "environmental management":"College of Liberal Arts",
# "feature filmmaking":"College of Liberal Arts",
# "heritage management":"College of Liberal Arts",
# "intercultural musicology":"College of Liberal Arts",
# "liberal arts":"College of Liberal Arts",
# "literature, landscape and environment":"College of Liberal Arts",
# "music performance":"College of Liberal Arts",
# "performing shakespeare":"College of Liberal Arts",
# "principles of applied neuropsychology":"College of Liberal Arts",
# "scriptwriting":"College of Liberal Arts",
# "songwriting (campus based)":"College of Liberal Arts",
# "songwriting (distance learning)":"College of Liberal Arts",
# "sound (arts)":"College of Liberal Arts",
# "sound (design)":"College of Liberal Arts",
# "sound (production)":"College of Liberal Arts",
# "theatre for young audiences":"College of Liberal Arts",
# "transnational writing":"College of Liberal Arts",
# "travel and nature writing":"College of Liberal Arts",
# "writing for young people":"College of Liberal Arts",
# "counselling and psychotherapy practice":"Institute for Education",
# "education (education studies)":"Institute for Education",
# "education (early childhood studies)":"Institute for Education",
# "education (international education)":"Institute for Education",
# "education (leadership and management)":"Institute for Education",
# "inclusive education":"Institute for Education",
# "professional practice":"Institute for Education",
# "professional practice in higher education":"Institute for Education",
# "teaching english to speakers of other languages":"Institute for Education",
# "specific learning difficulties / dyslexia":"Institute for Education",
# "national award for special educational needs coordination":"Institute for Education",
# "professional doctorate in education":"Institute for Education",
# }

예제 #18

파일 보기

파일: UniversityOfChester_U.py 프로젝트: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Chester"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        try:
            programme = response.xpath(
                "//h1[@id='main-content']/text()").extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            degree_type = response.xpath(
                "//h1[@id='main-content']/div//text()").extract()
            item['degree_name'] = ''.join(degree_type).replace("(Hons)",
                                                               "").strip()
            print("item['degree_name']: ", item['degree_name'])

            start_date = response.xpath(
                "//select[@id='edit-date']/option//text()|//label[@for='edit-date']/following-sibling::span//text()"
            ).extract()
            clear_space(start_date)
            # print("start_date: ", start_date)
            start_date_str = ""
            if len(start_date) > 0:
                for s in start_date:
                    # start_date_str = getStartDate(s)
                    if getStartDate(s) != None:
                        start_date_str += getStartDate(s) + ", "
            item['start_date'] = start_date_str.strip().strip(',').strip()
            # print("item['start_date']: ", item['start_date'])

            mode = response.xpath(
                "//select[@id='edit-mode']//text()").extract()
            clear_space(mode)
            # item['teach_time'] = getTeachTime(''.join(mode))
            # print("mode: ", mode)

            location = response.xpath(
                "//label[@for='edit-compulsory']/following-sibling::*//text()"
            ).extract()
            item['location'] = ''.join(location).strip()
            # print("item['location']: ", item['location'])

            ucascode = response.xpath(
                "//dt[contains(text(),'UCAS Code')]/following-sibling::*//text()"
            ).extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).strip()
            print("item['ucascode'] = ", item['ucascode'])

            duration = response.xpath(
                "//dt[contains(text(),'Duration')]/following-sibling::*//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            overview_en = response.xpath(
                "//h3[contains(text(),'Course overview')]/../*[position()<last()]|"
                "//div[@class='m-body__margin-bottom t-course__overview']"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))
            # print("item['overview_en']: ", item['overview_en'])

            entry_requirements = response.xpath(
                "//div[@id='entry-international']//form[@id='courses-international-form']/preceding-sibling::*//text()"
            ).extract()
            # item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            alevel = response.xpath(
                "//td[contains(text(),'GCE A Level')]/following-sibling::*//text()"
            ).extract()
            item['alevel'] = clear_lianxu_space(alevel)
            # print("item['alevel']: ", item['alevel'])

            ib = response.xpath(
                "//td[contains(text(),'International Baccalaureate')]/following-sibling::*//text()"
            ).extract()
            item['ib'] = clear_lianxu_space(ib)
            # print("item['ib']: ", item['ib'])

            ielts_desc = response.xpath(
                "//div[@id='entry-international']//li[contains(text(),'Undergraduate:')]//text()"
            ).extract()
            clear_space(ielts_desc)
            item['ielts_desc'] = ''.join(ielts_desc).strip()
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            assessment_en = response.xpath(
                "//h3[@class='field-label'][contains(text(),'How will I be taught?')]/..|"
                "//h3[@class='field-label'][contains(text(),'How will I be assessed?')]/.."
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            tuition_fee = response.xpath(
                "//div[@class='field-fees-international']/p//text()|"
                "//p[contains(text(),'The tuition fees for international students studyi')]//text()"
            ).extract()
            print("tuition_fee: ", tuition_fee)
            if len(tuition_fee) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee']: ", item['tuition_fee'])
            print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            career_en = response.xpath(
                "//div[@id='careers-career-services']").extract()
            item['career_en'] = remove_class(
                clear_lianxu_space(career_en)).replace("<div></div>",
                                                       "").strip()
            print("item['career_en']: ", item['career_en'])

            modules = re.findall(
                r"function\sinit_drupal_core_settings\(\)\s{jQuery\.extend\(Drupal\.settings,.*}",
                response.text)
            # print("modules: ", modules)
            modules_str = ''.join(modules).replace(
                "function init_drupal_core_settings() {jQuery.extend(Drupal.settings,",
                "").strip()
            modules_dict = json.loads(modules_str)
            # print("modules_dict: ", modules_dict)
            # groupCode     modulesNid
            # print(modules_dict.get("courses"))
            if modules_dict.get('courses').get('groupCode') is not False:
                modules_json = "https://www1.chester.ac.uk/courses/modules/ajax/" + modules_dict.get(
                    'courses').get('modulesNid') + "/" + modules_dict.get(
                        'courses').get('groupCode') + "/389"
                # print("modules_json: ", modules_json)
                mdict = json.loads(requests.get(modules_json).text)
                # print("mdict: ", len(mdict))
                m = mdict[-1].get('data')
                if m != None:
                    item['modules_en'] = remove_class(clear_lianxu_space([m]))
            # print("item['modules_en']: ", item['modules_en'])

            item[
                'apply_proces_en'] = "https://www1.chester.ac.uk/undergraduate/how-apply/applying-full-time-courses"
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<div class="field-collection-view clearfix view-mode-full">
  <h3 class="field-course-type">
    Undergraduate Study  </h3>

  <ul><li>UK foundation/pathway course with a pass mark of 50% and above.  Engineering courses require an additional mark of at least 55% in a Maths module. </li>
<li>China 3 year National Senior High School Certificate with 80% or above</li>
<li>Gaokao (College Entry Exam) with good grades </li>
<li>Dazhuan considered for entry to 3rd year UG</li>
<li>BFSU Foundation Year at 60% or above</li>
<li>Dongfang International Centre for Education Exchange Top University Foundation Course 60% or above</li>
<li>East and West International Education (EWIE)/ Wiseway Global International Foundation Certificate at 60% or above</li>
<li>Graduation Certificate from a specialised College/School (Zhongzuhan) with 80% or above</li>
</ul></div>"""
                ]))
            # print("item['require_chinese_en']: ", item['require_chinese_en'])

            # department = response.xpath("//dt[contains(text(), 'Department')]/following-sibling::dd[1]//text()").extract()
            # item['department'] = ''.join(department).strip()
            # print("item['department']: ", item['department'])

            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

예제 #19

파일 보기

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "St George's, University of London"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item['location'] = "Cranmer Terrace, London SW17 0RE"
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            programmeDegree_name = response.xpath(
                "//div[@class='inner']/h1//text()").extract()
            programmeDegree_nameStr = ''.join(programmeDegree_name).strip()
            # print("programmeDegree_nameStr: ", programmeDegree_nameStr)

            if "Foundation" not in programmeDegree_nameStr:
                degree_name = re.findall(r"\(.*\)$|\w+\s\(.*\)$|\w+$",
                                         programmeDegree_nameStr)
                degree_name_str = ''.join(degree_name).strip()
                item['degree_name'] = degree_name_str.replace("(", "").replace(
                    ")", "").replace("Hons", "").strip()
                print("item['degree_name']: ", item['degree_name'])

                programme = programmeDegree_nameStr.replace(
                    degree_name_str, "").strip()
                item['programme_en'] = programme
                print("item['programme_en']: ", item['programme_en'])

                ucascode = response.xpath(
                    "//*[contains(text(),'UCAS code')]//text()").extract()
                clear_space(ucascode)
                # print("ucascode: ", ucascode)
                if len(ucascode) > 0:
                    ucascode_re = re.findall(r"UCAS\scode\s\w{4}",
                                             ''.join(ucascode))
                    # print("ucascode_re: ", ucascode_re)
                    item['ucascode'] = ''.join(ucascode_re).replace(
                        "UCAS code", "").strip()
                # print("item['ucascode'] = ", item['ucascode'])

                other = response.xpath(
                    "//img[@alt='globe']/../..//text()").extract()
                if len(other) == 0:
                    other = response.xpath(
                        "//td[contains(text(),'Open to UK and EU students. Not currently open to ')]//text()"
                    ).extract()
                item['other'] = clear_lianxu_space(other)
                # print("item['other'] = ", item['other'])

                # start_date = response.xpath("//dt[contains(text(), 'Start date')]/following-sibling::dd[1]//text()").extract()
                # clear_space(start_date)
                # # print("start_date: ", start_date)
                # item['start_date'] = getStartDate(''.join(start_date))
                # # print("item['start_date']: ", item['start_date'])

                duration = response.xpath(
                    "//img[@alt='Calendar']/../following-sibling::td//text()"
                ).extract()
                if len(duration) == 0:
                    duration = response.xpath(
                        "//img[@alt='Calendar']/../../following-sibling::td//text()"
                    ).extract()
                clear_space(duration)
                # print("duration: ", ''.join(duration))

                duration_list = getIntDuration(''.join(duration))
                if len(duration_list) == 2:
                    item['duration'] = duration_list[0]
                    item['duration_per'] = duration_list[-1]
                # print("item['duration'] = ", item['duration'])
                # print("item['duration_per'] = ", item['duration_per'])

                # //p[contains(text(),'Non-UK/EU (International) application deadline')]
                deadline = response.xpath(
                    "//*[contains(text(),'Application deadline')]//text()|//*[contains(text(),'UCAS deadline')]//text()"
                ).extract()
                clear_space(deadline)
                # print("deadline: ", deadline)
                item['deadline'] = getStartDate(''.join(deadline).replace(
                    "Application deadline",
                    "").replace("is", "").replace("UCAS deadline",
                                                  "").replace(":", "").strip())
                if "2018" not in item['deadline'] and item[
                        'deadline'] != "" and "2019" not in item['deadline']:
                    item['deadline'] = ''.join(deadline).replace(
                        "Application deadline",
                        "").replace("is", "").replace("UCAS deadline",
                                                      "").replace(":",
                                                                  "").strip()
                # print("item['deadline']: ", item['deadline'])

                # location = response.xpath("//*[contains(text(),'Study location:')]//text()").extract()
                # item['location'] = ''.join(location).replace("Study location:", "").strip()
                # print("item['location']: ", item['location'])

                tuition_fee = response.xpath(
                    "//h3[contains(text(),'International (Non-EU) Student Fees')]/following-sibling::table//td[contains(text(),'2019/20')]/following-sibling::td[1]//text()|"
                    "//table//p[contains(text(),'2018 entry Non-EU')]//text()|"
                    "//table[2]/tbody/tr[4]/td/p[contains(text(),'2018 Non-EU')]/following-sibling::*/*[1]//text()|"
                    "//table//p[contains(text(),'2018 Non-EU')]/following-sibling::*[1]/*[1]//text()"
                ).extract()
                clear_space(tuition_fee)
                # print("tuition_fee: ", ''.join(tuition_fee))
                tuition_fee_re = re.findall(r"\d+,\d+", ''.join(tuition_fee))
                if len(tuition_fee_re) > 0:
                    item['tuition_fee'] = getTuition_fee(
                        ''.join(tuition_fee_re))
                # print("item['tuition_fee']: ", item['tuition_fee'])

                overview_en = response.xpath(
                    "//p[@class='first']|//table[1]/following-sibling::*[position()<last()-1]"
                ).extract()
                item['overview_en'] = remove_class(
                    clear_lianxu_space(overview_en)).replace(
                        "<p><button>Make an enquiry</button></p>", "").strip()
                # print("item['overview_en']: ", item['overview_en'])

                entry_url = response.xpath(
                    "//a[contains(text(),'Entry')]/@href").extract()
                # print("entry_url: ", entry_url)
                if len(entry_url) != 0:
                    parse_entry_url = "https://www.sgul.ac.uk" + entry_url[0]
                    # print("parse_entry_url: ", parse_entry_url)
                    entry_dict = self.parse_rntry_requirements(parse_entry_url)
                    # print(entry_dict)
                    # item['rntry_requirements'] = entry_dict.get('rntry_requirements')

                    item['ielts_desc'] = entry_dict.get('ielts_desc')
                    item['alevel'] = entry_dict.get('alevel')
                    item['ib'] = entry_dict.get('ib')
                # print("item['ielts_desc']: ", item['ielts_desc'])
                # print("item['alevel']: ", item['alevel'])
                # print("item['ib']: ", item['ib'])

                ielts_list = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
                if len(ielts_list) == 1:
                    item['ielts'] = ielts_list[0]
                    item['ielts_l'] = ielts_list[0]
                    item['ielts_s'] = ielts_list[0]
                    item['ielts_r'] = ielts_list[0]
                    item['ielts_w'] = ielts_list[0]
                elif len(ielts_list) == 2:
                    item['ielts'] = ielts_list[0]
                    item['ielts_l'] = ielts_list[1]
                    item['ielts_s'] = ielts_list[1]
                    item['ielts_r'] = ielts_list[1]
                    item['ielts_w'] = ielts_list[1]
                elif len(ielts_list) == 5:
                    item['ielts'] = ielts_list[0]
                    item['ielts_l'] = ielts_list[1]
                    item['ielts_s'] = ielts_list[4]
                    item['ielts_r'] = ielts_list[2]
                    item['ielts_w'] = ielts_list[3]
                # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

                modules_url = response.xpath(
                    "//a[contains(text(),'Module')]/@href").extract()
                # print("modules_url: ", modules_url)
                if len(modules_url) != 0:
                    parse_modules_url = "https://www.sgul.ac.uk" + modules_url[
                        0]
                    # print("parse_modules_url: ", parse_modules_url)
                    item['modules_en'] = remove_class(
                        clear_lianxu_space(
                            self.parse_modules(parse_modules_url))).strip()
                # print("item['modules_en']: ", item['modules_en'])

                assessment_en_url = response.xpath(
                    "//a[contains(text(),'Studying')]/@href").extract()
                # print("assessment_en_url: ", assessment_en_url)
                if len(assessment_en_url) != 0:
                    parse_assessment_en_url = "https://www.sgul.ac.uk" + assessment_en_url[
                        0]
                    # print("parse_assessment_en_url: ", parse_assessment_en_url)
                    item['assessment_en'] = remove_class(
                        clear_lianxu_space(
                            self.parse_assessment_en(
                                parse_assessment_en_url))).strip()
                # print("item['assessment_en']: ", item['assessment_en'])

                career_en_url = response.xpath(
                    "//a[contains(text(),'Career')]/@href").extract()
                # print("career_en_url: ", career_en_url)
                if len(career_en_url) != 0:
                    parse_career_en_url = "https://www.sgul.ac.uk" + career_en_url[
                        0]
                    # print("parse_career_en_url: ", parse_career_en_url)
                    item['career_en'] = remove_class(
                        clear_lianxu_space(
                            self.parse_career_en(
                                parse_career_en_url))).replace(
                                    "<p><img></p>", "").strip()
                # print("item['career_en']: ", item['career_en'])

                apply_proces_en_url = response.xpath(
                    "//a[contains(text(),'Apply')]/@href|//a[contains(text(),'Application and interview')]/@href"
                ).extract()
                # print("apply_proces_en_url: ", apply_proces_en_url)
                if len(apply_proces_en_url) != 0:
                    parse_apply_proces_en_url = "https://www.sgul.ac.uk" + apply_proces_en_url[
                        0]
                    # print("parse_apply_proces_en_url: ", parse_apply_proces_en_url)
                    item['apply_proces_en'] = remove_class(
                        clear_lianxu_space(
                            self.parse_apply_proces_en(
                                parse_apply_proces_en_url))).replace(
                                    "<p><img></p>", "").strip()
                # print("item['apply_proces_en']: ", item['apply_proces_en'])

                yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

예제 #20

파일 보기

파일: Queen'sUniversityBelfast_U.py 프로젝트: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Queen's University Belfast"
        # item['country'] = 'England'
        # item['website'] = 'http://www.qub.ac.uk/'
        item['url'] = response.url
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        try:
            degree_type = response.xpath(
                "//div[@class='columns aligned']//div[@class='column colspan-8']/h2//text()"
            ).extract()
            degree_type = ''.join(degree_type).split("|")
            print("degree_type: ", degree_type)
            if len(degree_type) != 0:
                item['degree_name'] = degree_type[0].strip()
            print("item['degree_name']: ", item['degree_name'])

            # 专业
            programme = response.xpath(
                "//div[@class='columns aligned']//div[@class='column colspan-8']/h1//text()"
            ).extract()
            clear_space(programme)
            item['programme_en'] = ''.join(programme).replace(
                item['degree_name'], '').strip()
            print("item['programme_en']: ", item['programme_en'])

            # start_date
            start_date = response.xpath(
                "//span[@class='cf-key-details key-entry-year']//text()"
            ).extract()
            clear_space(start_date)
            item['start_date'] = ''.join(start_date).strip()
            print("item['start_date']: ", item['start_date'])

            # duration
            duration = response.xpath(
                "//p[@class='cf-key-details-duration']//span[@class='cf-key-details']//text()"
            ).extract()
            clear_space(duration)
            print("duration: ", duration)
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            print("item['duration']-: ", item['duration'])
            print("item['duration_per']-: ", item['duration_per'])

            ucascode = response.xpath(
                "//span[@class='cf-key-details key-ucas-code']//text()"
            ).extract()
            clear_space(ucascode)
            if len(ucascode) > 0:
                item['ucascode'] = ''.join(ucascode[0]).strip()
            print("item['ucascode'] = ", item['ucascode'])

            # //div[@id='overview']
            overview = response.xpath("//div[@id='overview']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en']: ", item['overview_en'])

            # //div[@id='overview']
            modules = response.xpath(
                "//h3[@class='alt'][contains(text(),'Course Structure')]/following-sibling::table"
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            career = response.xpath(
                "//h3[@class='alt'][contains(text(),'Career Prospects')]/following-sibling::p[1]"
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            # //a[@id='teaching']/following-sibling::*[position()<6]
            teaching_assessment = response.xpath(
                "//a[@id='teaching']/following-sibling::*[position()<6]"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(teaching_assessment).replace("\n", ""))
            # print("item['assessment_en']: ", item['assessment_en'])

            entry_requirements = response.xpath(
                "//div[@id='entry']//text()").extract()
            rntry_requirements = remove_class(
                clear_lianxu_space(entry_requirements))
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts = re.findall(r"IELTS.{1,150}", rntry_requirements)
            item['ielts_desc'] = ''.join(ielts)
            print("item['ielts_desc']: ", item['ielts_desc'])
            ieltsDict = get_ielts(''.join(ielts))
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            print(
                "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                % (item['ielts'], item['ielts_l'], item['ielts_s'],
                   item['ielts_r'], item['ielts_w']))

            alevel = response.xpath(
                "//b[contains(text(),'Entry requirements:')]/following-sibling::span//text()"
            ).extract()
            clear_space(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[0]).strip()
            print("item['alevel'] = ", item['alevel'])

            # ib = response.xpath(
            #     "//html//div[@id='courseSummary']//tr/td[contains(text(), 'International Baccalaureate')]/following-sibling::td//text()").extract()
            # item['ib'] = ''.join(ib).strip()
            # print("item['ib'] = ", item['ib'])

            # //html//div[@id='fees']//tr[4]
            tuition_fee = response.xpath(
                "//html//div[@id='fees']//tr[4]//text()").extract()
            clear_space(tuition_fee)
            print("tuition_fee: ", tuition_fee)
            tuition_fee_str = ''.join(tuition_fee).strip().strip(
                "International")
            if "£" in tuition_fee_str:
                item['tuition_fee_pre'] = "£"
                item['tuition_fee'] = int(
                    tuition_fee_str.replace('£', '').replace(',', '').strip())
            print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])
            print("item['tuition_fee']: ", item['tuition_fee'])

            # //div[@class='panel bg--primary']//div[@class='inner']//p
            department = response.xpath(
                "//div[@class='panel bg--primary']//div[@class='inner']//p//text()"
            ).extract()
            clear_space(department)
            # print(department)
            for d in department:
                if "School" in d:
                    item['department'] = d.strip()
                elif "College" in d:
                    item['department'] = d.strip()
                elif "Campus" in d or d == "Biological Sciences" or d == "Marketing strategy" or d == "Management":
                    item['department'] = d.strip()
                elif len(d) == 4 or len(
                        d
                ) == 5 or d == "Arts, English and Languages" or d == "Global Food Security" or d == "Centre for Economic History":
                    item['department'] = d.strip()
            # print("item['department']: ", item['department'])

            department = response.xpath(
                "//html//div[@class='panel bg--grey-l']/div[@class='inner']//a//text()"
            ).extract()
            clear_space(department)
            item['department'] = ''.join(department).strip()
            print("item['department']: ", item['department'])

            # //html//div[@class='panel bg--grey-l']/div[@class='inner']/p[1]
            location = response.xpath(
                "//html//div[@class='panel bg--grey-l']/div[@class='inner']/text()"
            ).extract()
            clear_space(location)
            item['location'] = '\t'.join(location).strip()
            print("item['location']: ", item['location'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<h1 class="alt"><a name="UG"></a>Undergraduate entry requirements</h1>
<p>The following qualifications will be considered for direct entry to undergraduate programmes:</p>
<ul>
<li>Students who have completed 12 years of education in China and attained the Secondary School Leaving Certificate with good grades must complete an approved Foundation programme or GCE A Levels for progression to undergraduate degree programmes.</li>
<li>The 'Gaokao' Chinese University Entrance Examination will be considered, along with performance in the Senior High School examination for entry to Stage 1 of our undergraduate programmes.</li>
<li>Progression to Stage 1 of an undergraduate degree programme at Queen's&nbsp;(with the exception of Agricultural Technology, Medicine, Dentistry and Social Work) is guaranteed for students who successfully complete the <a title="University%20Preparation%20Courses" href="/home/International/International-students/Applying/University-Preparation-Courses/">INTO Queen's International Foundation Programme</a> at the required standard.</li>
<li>Students who have completed one or two years of university study in China may be eligible for admission to Bachelor degree programmes, if relevant subjects have been studied and strong grades have been achieved.</li>
<li>Applicants who have already completed A-Levels/a recognised Foundation programme or the first year of a relevant degree programme in China, but who do not meet the academic or English language requirements for entry, may wish to consider <a title="University%20Preparation%20Courses" href="/home/International/International-students/Applying/University-Preparation-Courses/">INTO Queen's International Year One</a>. Successful completion at the required standard offers direct entry to the second year of selected undergraduate degree programmes in Management, Economics, Finance and Engineering.</li>
<li>Between 30 and 36 points in the International Baccalaureate Diploma (IB). <a href="/home/International/International-students/Your-Country/InternationalBaccalaureateIBDiplomaEntryRequirements/">Information on required grades</a>.</li>
</ul>
<p><strong>Please note: </strong>Grades required vary depending on the programme of study. Further guidance on the entry requirements for each degree programme can be found in the Undergraduate Coursefinder.</p>"""
                ]))

            apply_proces_en = response.xpath("//div[@id='apply']").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(apply_proces_en))
            print("item['apply_proces_en']: ", item['apply_proces_en'])
            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'w',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")
            print("异常：", str(e))
            print("报错url：", response.url)

예제 #21

파일 보기

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Bristol"
        # items['country'] = "England"
        # items["website"] = "https://www.bristol.ac.uk/"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        try:
            # 专业
            course = response.xpath(
                "//span[@property='programname']//text()").extract()
            # print("course = ", course)
            item['programme_en'] = ''.join(course).replace("\n", " ").replace(
                "\r", " ").strip()
            print("item['programme_en']: ", item['programme_en'])

            # degreeaward
            degreeaward = response.xpath(
                "//span[@property='award']//text()").extract()
            # print("degreeaward = ", degreeaward)
            item['degree_name'] = clear_space_str(''.join(degreeaward))
            print("item['degree_name']: ", item['degree_name'])

            ucascode = response.xpath(
                "//th[contains(text(),'UCAS code')]/../td//text()").extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).strip()
            # print("item['ucascode']: ", item['ucascode'])
            if item['ucascode'] == "":
                ucascode = response.xpath(
                    "//th[contains(text(),'Application method')]/following-sibling::td//text()"
                ).extract()
                clear_space(ucascode)
                # print("ucascode1: ", ucascode)
                item['ucascode'] = ''.join(ucascode).replace(
                    "Entry by transfer from", "").replace(
                        "Entry by transfer after two years from",
                        "").replace("Entry by transfer after year one of",
                                    "").replace("at the end of year one",
                                                "").strip()
            # print("item['ucascode']1: ", item['ucascode'])

            # duration
            duration = response.xpath(
                "//th[contains(text(),'Course duration')]/../td//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)

            # duration_list = getIntDuration(''.join(duration))
            if len(duration) == 2:
                item['duration'] = int(duration[0])
                if 'y' in ''.join(duration).lower():
                    item['duration_per'] = 1
            # print("item['duration']: ", item['duration'])
            # print("item['duration_per']: ", item['duration_per'])

            # location
            location = response.xpath(
                "//th[contains(text(),'Location of course')]/../td//text()"
            ).extract()
            item['location'] = clear_space_str(''.join(location))
            # print("item['location']: ", item['location'])

            # startdate
            startdate = response.xpath(
                "//p[@class='year-of-entry']/text()").extract()
            clear_space(startdate)
            # print("startdate = ", startdate)
            if len(startdate) > 0:
                item['start_date'] = ''.join(startdate).replace("entry",
                                                                "").strip()
            # print("item['start_date'] = ", item['start_date'])

            tuitionFee = response.xpath(
                "//li[contains(text(),'International students: £')]//text()"
            ).extract()
            # print("tuitionFee = ", tuitionFee)
            if len(tuitionFee) > 0:
                item['tuition_fee_pre'] = "£"
                item['tuition_fee'] = getTuition_fee(''.join(tuitionFee))
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])
            # print("item['tuition_fee']: ", item['tuition_fee'])

            # deadline
            # deadline = response.xpath("//div[@id='apply']/div[@class='apply-deadline']/p[1]//text()").extract()
            # # print("deadline = ", deadline)
            # item['deadline'] = getStartDate(''.join(deadline))
            # # print("item['deadline']: ", item['deadline'])

            # department
            department = response.xpath(
                "//div[@id='contact']/p[@class='pg-contact-address']/text()"
            ).extract()
            clear_space(department)
            print("department1 = ", department)
            for d in department:
                if "School" in d or "Faculty" in d:
                    item['department'] = d
            # print("item['department']: ", item['department'])
            if item['department'] == "":
                allcontent = response.xpath(
                    "//main[@class='content']//text()").extract()
                clear_space(allcontent)
                department_re = re.findall(r"School\sof.{1,30}",
                                           ''.join(allcontent), re.I)
                # print("department_re: ", department_re)
                if len(department_re) > 0:
                    item['department'] = department_re[0].strip()
            # print("item['department']1: ", item['department'])

            overview = response.xpath(
                "//div[@id='course-description']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en']: ", item['overview_en'])

            assessment_en = response.xpath(
                "//div[@id='teaching']|//div[@id='assessment']").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            print("item['assessment_en']: ", item['assessment_en'])

            alevel = response.xpath(
                "//div[@id='typical-offer']//table//tr/th[contains(text(), 'A-level')]/../td//text()"
            ).extract()
            clear_space(alevel)
            # print(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[0]).strip()
            # print("item['alevel'] = ", item['alevel'])

            # print(len("36 points overall with 18 at Higher Level, including 6, 5 at Higher Level in two of the following subjects: Biology, Chemistry, Physics, Mathematics, Psychology"))
            if len(item['alevel']) > 160:
                item['alevel'] = ''.join(item['alevel'][:161])
            # print("item['alevel']1 = ", item['alevel'])

            ib = response.xpath(
                "//div[@id='typical-offer']//table//tr/th[contains(text(), 'International Baccalaureate ')]/../td//text()"
            ).extract()
            clear_space(ib)
            if len(ib) > 0:
                item['ib'] = ''.join(ib[0]).strip()

            if len(item['ib']) > 160:
                item['ib'] = ''.join(item['ib'][:161])
            # print("item['ib'] = ", item['ib'])

            # 课程结构
            modulesUrl = response.xpath(
                "//div[@id='course-structure']//div[@class='collapsible']//a/@href"
            ).extract()
            # print("modulesUrl: ", modulesUrl)
            modulesUrl = ''.join(modulesUrl).strip()
            if len(modulesUrl) != 0:
                item['modules_en'] = self.parse_modules_en(modulesUrl)[0]
                # print("item['modules_en']: ", item['modules_en'])
                u = self.parse_modules_en(modulesUrl)[1]
                # print(u)
                while len(u) != 0:
                    u1 = "https://www.bris.ac.uk" + ''.join(u)
                    # print("u1=", u1)
                    item['modules_en'] += self.parse_modules_en(u1)[0]
                    u = self.parse_modules_en(u1)[1]
            # print("item['modules_en']1: ", item['modules_en'])

            # 学术要求本科特殊专业要求、IELTS
            entryRequirements = response.xpath(
                "//div[@id='typical-offer']//text()").extract()
            # item['rntry_requirements'] = clear_lianxu_space(entryRequirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts = response.xpath(
                "//*[contains(text(),'Profile A')]//text()|//*[contains(text(),'Profile B')]//text()|"
                "//*[contains(text(),'Profile C')]//text()|//*[contains(text(),'Profile D')]//text()|"
                "//*[contains(text(),'Profile E')]//text()|//*[contains(text(),'Profile F')]//text()"
            ).extract()
            item['ielts_desc'] = clear_lianxu_space(ielts)
            # print("item['ielts_desc']: ", item['ielts_desc'])

            if item['ielts_desc'] == "Profile A":
                item['ielts'] = 7.5
                item['ielts_l'] = 7.0
                item['ielts_s'] = 7.0
                item['ielts_r'] = 7.0
                item['ielts_w'] = 7.0
                item['toefl'] = 109
                item['toefl_l'] = 25
                item['toefl_r'] = 25
                item['toefl_s'] = 25
                item['toefl_w'] = 29
            elif item['ielts_desc'] == "Profile B":
                item['ielts'] = 7.0
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.5
                item['ielts_w'] = 7.0
                item['toefl'] = 100
                item['toefl_l'] = 24
                item['toefl_r'] = 24
                item['toefl_s'] = 24
                item['toefl_w'] = 27
            elif item['ielts_desc'] == "Profile C":
                item['ielts'] = 6.5
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.5
                item['ielts_w'] = 6.5
                item['toefl'] = 92
                item['toefl_l'] = 23
                item['toefl_r'] = 23
                item['toefl_s'] = 23
                item['toefl_w'] = 24
            elif item['ielts_desc'] == "Profile D":
                item['ielts'] = 6.5
                item['ielts_l'] = 6.0
                item['ielts_s'] = 6.0
                item['ielts_r'] = 7.0
                item['ielts_w'] = 7.0
                item['toefl'] = 92
                item['toefl_l'] = 21
                item['toefl_r'] = 21
                item['toefl_s'] = 21
                item['toefl_w'] = 27
            elif item['ielts_desc'] == "Profile E":
                item['ielts'] = 6.5
                item['ielts_l'] = 6.0
                item['ielts_s'] = 6.0
                item['ielts_r'] = 6.0
                item['ielts_w'] = 6.0
                item['toefl'] = 90
                item['toefl_l'] = 20
                item['toefl_r'] = 20
                item['toefl_s'] = 20
                item['toefl_w'] = 20
            elif item['ielts_desc'] == "Profile F":
                item['ielts'] = 6.0
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.0
                item['ielts_w'] = 6.0
                item['toefl'] = 86
                item['toefl_l'] = 20
                item['toefl_r'] = 20
                item['toefl_s'] = 20
                item['toefl_w'] = 23
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #       item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            # assessment_en, career_en
            assessCareerUrlSplit = response.url.rsplit('/')
            assessCareerUrl = response.url.replace(
                assessCareerUrlSplit[-2] + "/", "").strip()
            print(assessCareerUrl)
            assessCareerDict = self.parseAssessCareer(assessCareerUrl)

            item['assessment_en'] = assessCareerDict.get(
                'assessment_en').strip()
            print("item['assessment_en']: ", item['assessment_en'])

            item['career_en'] = assessCareerDict.get('career_en').strip()
            print("item['career_en']: ", item['career_en'])

            # 申请要求
            apply_desc_en = response.xpath(
                "//div[@id='typical-offer']").extract()
            item['apply_desc_en'] = remove_class(
                clear_lianxu_space(apply_desc_en))
            # print("item['apply_desc_en']: ", item['apply_desc_en'])

            require_chinese_en = """<h2 id="ugentryreqs">Entry requirements for undergraduate courses</h2>
<p>You can apply for undergraduate programmes either through the&nbsp;<a href="http://www.ucas.com/">Universities and Colleges Admissions Service</a>&nbsp;(UCAS) or&nbsp;<a href="http://www.commonapp.org/">The Common Application.</a>&nbsp;Please use only&nbsp;<strong>one</strong>&nbsp;method of applying. If you are using UCAS to apply for other UK universities, please also make your University of Bristol application through UCAS and do not use the Common Application.The UCAS code name and number for this University is BRISL B78.</p>
<p>Individual course entry requirements&nbsp;are listed in our <a href="http://www.bris.ac.uk/study/undergraduate/">Undergraduate Prospectus</a>&nbsp;for each course.</p>
<ul>
<li>Applicants with the Gaozhong Biye Zhengshu (Senior High School Certificate) and Gaokao&nbsp;(Chinese University entrance exam) combined with a successfully completed appropriate <a href="http://www.bris.ac.uk/english-language/study/ifp/" target="_blank">Foundation programme</a> will be considered for admission to our Bachelor's degree courses.</li>
<li>Applicants who have successfully completed the first year of a Chinese University degree at a prestigious university will be considered for admission to the first year of our Bachelor's degree courses.</li>
<li>Applicants will be required to meet the English language requirements for the programme. The profile level requirements can be found on the&nbsp;<a href="http://www.bristol.ac.uk/study/language-requirements/">English language requirements for study</a>&nbsp;page.</li>
</ul>"""
            item["require_chinese_en"] = remove_class(require_chinese_en)
            # print("item['require_chinese_en']: ", item['require_chinese_en'])

            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<h2>How to apply</h2><p>You can apply through Universities and Colleges Admissions Service (UCAS) or the Common Application (Common App). For Engineering Design, Medicine, Dentistry or Veterinary Science courses, you must apply using UCAS.</p> </div> <!-- end: content - how to apply --> <!-- start: drop down - application options --> <div class="main-col-child">  <div class="dropdown"> <h3 class="dropdown-heading">Applying through UCAS</h3> <div class="dropdown-content"> <p>You can apply for a maximum of five courses using the UCAS form. Apply for medicine, dentistry and veterinary courses through UCAS by 15 October. You can only use four of your five UCAS choices to apply to these courses.</p> <p><a class="btn icon-arrow-right" data-tracking-click-url="http://uk.sitestat.com/bristol/bristol-ext/s?study.undergraduate.apply.international.index_html.international-ucas&amp;ns_type=clickout&amp;ns_url=https://www.ucas.com/ucas/undergraduate/getting-started/ucas-undergraduate-international-and-eu-students" href="https://www.ucas.com/ucas/undergraduate/getting-started/ucas-undergraduate-international-and-eu-students">Apply online through UCAS</a><br />Our UCAS institution code is <strong>BRISL B78</strong>.</p> <p>After you have applied, UCAS will give you a ten-digit personal ID number. 
You will need this if you contact the University about your application.</p> <h4>Entering your qualifications</h4> <p>Before you submit your UCAS application, make sure you have included:</p> <ul> <li><strong>Full details of qualifications you have already taken</strong>: include grades/marks for the academic qualifications you've achieved from age 16 (GCSE or equivalent), and any English language qualifications.</li> <li><strong>Full details of the qualifications you are taking:</strong> include current studies (name and expected date of examination and major subjects), English language qualifications, and any resits of previous qualifications you expect to take.</li> </ul> <p>If your qualification offers different levels of study, state which subjects you are studying at the higher level, and which at the standard level.</p> <p>Watch the <a href="https://www.ucas.com/connect/videos?v=/apply-education-page">UCAS how-to guide on entering qualifications</a>.</p> <h4>When to apply</h4> <p>Find the <a href="https://www.ucas.com/ucas/undergraduate/apply-track/when-apply"><span>application deadlines on the UCAS website</span></a>.</p> </div>   <h3 class="dropdown-heading" >Applying through the Common App</h3> <div class="dropdown-content"> <p>You can use Common App to apply for any full-time undergraduate course at Bristol, except Engineering Design, Medicine, Dentistry or Veterinary Science courses. The deadline for applying through Common App is 30 June 2018.</p> <p><a class="btn icon-arrow-right" data-tracking-click-url="http://uk.sitestat.com/bristol/bristol-ext/s?study.undergraduate.apply.international.index_html.international-common&amp;ns_type=clickout&amp;ns_url=https://www.commonapp.org/" href="https://www.commonapp.org/">Apply online through the Common App</a></p> <p>After you have applied, you will be given an application number. 
You will need this if you contact the University about your application.</p> </div>   <h3 class="dropdown-heading" >Applying for direct entry courses</h3> <div class="dropdown-content"> <p>These are our direct entry courses. Please apply using these links and not through UCAS:</p> <ul> <li><a href="/dental/courses/dcp/hygiene/apply/">Diploma in Dental Hygiene</a></li> <li><a href="http://www.bristol.ac.uk/english-language/study/ifp/apply/">International Foundation Programme</a></li> <li><a href="/arts/study/foundation/apply/">Foundation in Arts and Humanities</a></li></ul> """
                ]))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])
            yield item
        except Exception as e:
            print("异常：", str(e))
            print("报错链接：", response.url)
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")

예제 #22

파일 보기

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Bolton"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        print("subjectArea===: ", response.meta['subjectArea'])
        try:
            programmeDegreetype = response.xpath(
                "//div[@class='wpb_text_column wpb_content_element  vc_custom_1506499626241']/div[@class='wpb_wrapper']/h2//text()"
            ).extract()
            # print("programmeDegreetype: ", programmeDegreetype)
            programmeDegreetypeStr = ''.join(programmeDegreetype).strip()

            degree_type = response.xpath(
                "//li[@class='iconim award']//b[contains(text(),'Award:')]/..//text()"
            ).extract()
            # print("degree_type: ", degree_type)
            item['degree_name'] = ''.join(degree_type).replace(
                "Award:", "").replace("(Hons)", "").strip()
            # if item['degree_name'] == "":
            #     item['degree_name'] = "**"
            print("item['degree_name']: ", item['degree_name'])

            # if item['degree_name'].lower() == "phd":
            #     item['teach_type'] = 'phd'
            #     item['degree_type'] = 3
            # print("item['teach_type']: ", item['teach_type'])
            # print("item['degree_type']: ", item['degree_type'])

            programme = programmeDegreetypeStr.replace(
                item['degree_name'], '').replace("(Hons)",
                                                 "").replace("()", "").strip()
            item['programme_en'] = programme
            print("item['programme_en']: ", item['programme_en'])

            start_date = response.xpath(
                "//li[@class='iconim date']//b[contains(text(),'Start date:')]/..//text()"
            ).extract()
            clear_space(start_date)
            # print("start_date: ", start_date)
            start_date_str = ''.join(start_date).replace("Start date:",
                                                         "").strip()
            # print("start_date_str: ", start_date_str)
            start_date_re = re.findall(r"\d+/\d+/\d+", start_date_str)
            # print("start_date_re: ", start_date_re)
            if len(start_date_re) > 0:
                for s in start_date_re:
                    start_date_sp = s.split('/')
                    item['start_date'] += start_date_sp[
                        -1] + "-" + start_date_sp[1] + "-" + start_date_sp[
                            0] + ", "
            if item['start_date'] != None:
                item['start_date'] = item['start_date'].strip().rstrip(
                    ',').strip()
            # print("item['start_date']: ", item['start_date'])

            location = response.xpath(
                "//li[@class='iconim location']//b[contains(text(),'Location:')]/..//text()"
            ).extract()
            item['location'] = ''.join(location).replace("Location:",
                                                         "").strip()
            # print("item['location']: ", item['location'])

            duration = response.xpath(
                "//li[@class='iconim duration']//b[contains(text(),'Duration:')]/..//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            overview_en = response.xpath(
                "//div[@id='course-details']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))
            # print("item['overview_en']: ", item['overview_en'])

            # //div[@id='course-detail']
            entry_requirements = response.xpath(
                "//div[@id='entry-requirements']//text()").extract()
            # item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts_desc = response.xpath(
                "//div[@id='entry-requirements']//*[contains(text(),'IELTS')]/text()"
            ).extract()
            clear_space(ielts_desc)
            # print("ielts_desc: ", ielts_desc)
            # ielts_desc_re = re.findall(r'.{1,50}IELTS.{1,50}', ''.join(ielts_desc))
            # print("ielts_desc_re: ", ielts_desc_re)
            # if len(ielts_desc) > 0:
            item['ielts_desc'] = ''.join(ielts_desc).strip()
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            career_en = response.xpath(
                "//div[@id='careers-employment']").extract()
            item['career_en'] = remove_class(
                clear_lianxu_space(career_en)).replace("<div></div>",
                                                       "").strip()
            # print("item['career_en']: ", item['career_en'])

            how_to_apply = response.xpath(
                "//div[@id='how-to-apply']").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(how_to_apply))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            modules = response.xpath(
                "//div[@class='tab_content modules_tab_content tab__teaching-assessment__modules']"
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            assessment_en = response.xpath(
                "//div[@class='tab_content modules_tab_content tab__teaching-assessment__teaching-methods']"
                "|//div[@class='tab_content modules_tab_content tab__teaching-assessment__assessment-methods']"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            tuition_fee = response.xpath(
                "//h3[@class='table_header'][contains(text(),'International fees')]/following-sibling::div[1]/table//tr/th[contains(text(),'2018/')][1]/following-sibling::td[1]//text()"
            ).extract()
            print("tuition_fee: ", tuition_fee)
            if len(tuition_fee) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            department_dict = {
                "Art & Design and Fine Art":
                "Bolton School of the Arts",
                "Textiles & Fashion":
                "Bolton School of the Arts",
                "Media & Photography":
                "Bolton School of the Arts",
                "Theatre & Performance":
                "Bolton School of the Arts",
                "English & Creative Writing":
                "Bolton School of the Arts",
                "Graphic Design":
                "Bolton School of the Arts",
                "Animation & Illustration":
                "Bolton School of the Arts",
                "Accountancy":
                "Institute of Management Greater Manchester",
                "Business, Retail, Logistics & Supply Chain Management":
                "Institute of Management Greater Manchester",
                "Nursing":
                "Faculty of Health & Wellbeing",
                "Health & Social Care":
                "Faculty of Health & Wellbeing",
                "Dental Sciences":
                "Faculty of Health & Wellbeing",
                "Early Years & Childhood Studies":
                "Faculty of Health & Wellbeing",
                "Community Work & Youth":
                "Faculty of Health & Wellbeing",
                "School of Sport & Biological Sciences":
                "Faculty of Health & Wellbeing",
                "Automotive Design":
                "National Centre for Motorsport Engineering",
                "Chassis Dynamics & Aerodynamics":
                "National Centre for Motorsport Engineering",
                "General Engineering":
                "National Centre for Motorsport Engineering",
                "Motorsport & Trackside Technology":
                "National Centre for Motorsport Engineering",
                "Engines & Performance Modelling":
                "National Centre for Motorsport Engineering",
                "Our Partners":
                "National Centre for Motorsport Engineering",
                "Computing":
                "School of Creative Technologies",
                "Games":
                "School of Creative Technologies",
                "Special & Visual Effects":
                "School of Creative Technologies",
                "Education & Teacher Training":
                "School of Education & Psychology",
                "Psychology":
                "School of Education & Psychology",
                "Access courses":
                "School of Education & Psychology",
                "International Foundation programmes & English Pre-Sessional courses":
                "School of Education & Psychology",
                "Construction":
                "School of Engineering",
                "Civil Engineering":
                "School of Engineering",
                "Mechanical Engineering":
                "School of Engineering",
                "Motorsport & Automotive Performance Engineering":
                "School of Engineering",
                "Biomedical & Medical Engineering":
                "School of Engineering",
                "Electrical & Electronic Engineering":
                "School of Engineering",
                "Mathematics":
                "School of Engineering",
                "Law":
                "School of Law",
                "Centre for Contemporary Coronial Law":
                "School of Law",
                "Medical Biology":
                "School of Sport & Biological Sciences",
                "Sports & Sport Rehabilitation":
                "School of Sport & Biological Sciences",
            }
            item['department'] = department_dict.get(
                response.meta['subjectArea'])
            print("item['department']: ", item['department'])

            alevel = response.xpath(
                "//li[@class='iconim points']//b[contains(text(),'UCAS points:')]/../span//text()"
            ).extract()
            item['alevel'] = clear_lianxu_space(alevel)
            print("item['alevel']: ", item['alevel'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<p><strong>Undergraduate entry to year 1 </strong></p>
<p>The above qualifications and completion of a suitable foundation programme.</p>
<p>Alternatively, successful completion of one year at a recognised Chinese university in a relevant subject.</p>
<p><strong>Undergraduate entry to year 2 / 3</strong></p>
<p>2 Year Diploma in a suitable subject area.</p>
<p>University College Graduation Diploma or Graduation Diploma from recognised institutions.</p>
<p>EDEXCEL or SQA HND</p>
<p>Da Zhuan (3 Year Diploma)</p>
<p>(Year 2 &amp; 3 entry is subject to successful programme mapping)</p>"""
                ]))

            ucascode = response.xpath(
                "//li[@class='iconim code']//b[contains(text(),'UCAS code:')]/..//text()"
            ).extract()
            clear_space(ucascode)
            print("ucascode: ", ucascode)
            item['ucascode'] = ''.join(ucascode).replace("UCAS code:",
                                                         "").strip()
            print("item['ucascode'] = ", item['ucascode'])

            mode = response.xpath(
                "//b[contains(text(),'Course type:')]/..//text()").extract()
            clear_space(mode)
            teach_time = ''.join(mode)
            print("teach_time: ", teach_time)

            isup = response.xpath(
                "//a[contains(text(),'Click here for more information on')]//text()"
            ).extract()
            # print("isup: ", isup)
            isup_str = ''.join(isup)
            if len(isup) == 0:
                isup = response.xpath(
                    "//li[@class='iconim code']//b[contains(text(),'UCAS code:')]/..//text()"
                    "|//li[@class='iconim points']//b[contains(text(),'UCAS points:')]/..//text()"
                ).extract()
            print("isup_str: ", isup_str)
            print("isup: ", isup)
            if "full" in teach_time.lower():
                if "https://courses.bolton.ac.uk/course" in item['url']:
                    if "undergraduate" in isup_str or len(
                            item['ucascode']) != 0:
                        print("******存到数据库*****")
                        yield item

        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

예제 #23

파일 보기

파일: UniversityOfLincoln_U.py 프로젝트: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.lincoln.ac.uk/"
        item['university'] = "University of Lincoln"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item['location'] = 'University of Lincoln, Brayford Pool, Lincoln, LN6 7TS'
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            if "Foundation" not in item['major_type1']:
                # //table[@id='newTitle']/tbody[@id='newTitleBody']/tr/td/h1[1]/a
                programmeDegreetype = response.xpath("//div[@id='CourseTitleApms']/h1//text()").extract()
                clear_space(programmeDegreetype)
                # print("programmeDegreetype: ", programmeDegreetype)
                if len(programmeDegreetype) > 0:
                    programmeDegreetypeStr = programmeDegreetype[0].strip()

                degree_type = re.findall(r"^\w+\s\(Hons\)|^\(\w+\)|^\w+", programmeDegreetypeStr)
                # print("degree_type: ", degree_type)
                degree_type_str = ''.join(degree_type).strip()
                item['degree_name'] = ''.join(degree_type).replace("(Hons)", "").replace("(", "").replace(")", "").strip()
                print("item['degree_name']: ", item['degree_name'])

                item['programme_en'] = programmeDegreetypeStr.replace(degree_type_str, '').strip()
                print("item['programme_en']: ", item['programme_en'])

                ucascode = response.xpath("//div[@class='nd_2019-20']//span[@class='blue'][contains(text(),'UCAS Code:')]/..//text()").extract()
                if len(ucascode) == 0:
                    ucascode = response.xpath("//span[@class='blue'][contains(text(),'UCAS Code:')]/..//text()").extract()
                clear_space(ucascode)
                # print("ucascode: ", ucascode)
                item['ucascode'] = ''.join(ucascode).replace("UCAS Code:", "").strip()
                # print("item['ucascode'] = ", item['ucascode'])

                # //span[@id='durationFT']
                duration = response.xpath("//div[@class='nd_2019-20']//span[contains(text(),'Full-time Duration')]/..//text()").extract()
                if len(duration) == 0:
                    duration = response.xpath("//span[contains(text(),'Full-time Duration')]/..//text()").extract()
                clear_space(duration)
                # print("duration: ", duration)
                duration_str = ''.join(duration)

                duration_list = getIntDuration(duration_str)
                if len(duration_list) == 2:
                    item['duration'] = duration_list[0]
                    item['duration_per'] = duration_list[-1]
                # print("item['duration'] = ", item['duration'])
                # print("item['duration_per'] = ", item['duration_per'])

                department = response.xpath("//span[contains(text(),'School:')]/following-sibling::a//text()").extract()
                clear_space(department)
                if len(department) > 0:
                    item['department'] = department[0]
                # print("item['department']: ", item['department'])

                dep_dict = {"lincoln school of architecture and the built environment": "College of Arts",
    "lincoln school of design": "College of Arts",
    "lincoln school of film and media": "College of Arts",
    "school of english and journalism": "College of Arts",
    "school of fine and performing arts": "College of Arts",
    "school of history and heritage": "College of Arts",
    "school of chemistry": "College of Science",
    "school of computer science": "College of Science",
    "school of engineering": "College of Science",
    "school of geography": "College of Science",
    "school of life sciences": "College of Science",
    "school of mathematics and physics": "College of Science",
    "school of pharmacy": "College of Science",
    "national centre for food manufacturing": "College of Science",
    "lincoln institute for agri-tech": "College of Science",
    "school of education": "College of Social Science",
    "school of health and social care": "College of Social Science",
    "professional development centre": "College of Social Science",
    "lincoln law school": "College of Social Science",
    "school of psychology": "College of Social Science",
    "school of social and political sciences": "College of Social Science",
    "school of sport and exercise science": "College of Social Science",}
                if item['department'] != "Lincoln Business School":
                    item['department'] = dep_dict.get(item['department'].lower())
                # print("item['department']1: ", item['department'])

                if item['department'] == None:
                    item['department'] = ''.join(response.xpath("//div[@class='breadcrumb-list']//span//a[@href='/home/collegeofsocialscience/']//text()").extract()).strip()
                # print("item['department']2: ", item['department'])

                # //div[@id='feesTables']/table
                fee = response.xpath("//div[@class='nd_2019-20']//div[@class='panel-body']//table[2]//td[contains(text(),'Full-time')]/following-sibling::*[last()]//text()").extract()
                if len(fee) == 0:
                    fee = response.xpath(
                        "//div[@class='panel-body']//table[2]//td[contains(text(),'Full-time')]/following-sibling::*[last()]//text()").extract()
                clear_space(fee)
                # print("fee: ", fee)
                feeStr = ''.join(fee)
                tuitionfee = getTuition_fee(feeStr)
                item['tuition_fee'] = tuitionfee
                if item['tuition_fee'] == 0:
                    item['tuition_fee'] = None
                # print("item['tuition_fee']: ", item['tuition_fee'])

                # //h2[contains(text(),'The Course')]/..
                overview = response.xpath("//h2[contains(text(),'The Course')]/..").extract()
                # print("overview: ", overview)
                if len(overview) > 0:
                    item['overview_en'] = remove_class(clear_lianxu_space([overview[-1]]))
                # print("item['overview_en']: ", item['overview_en'])

                modules_en = response.xpath("//a[contains(text(),'Modules')]/../../..").extract()
                modules_en = response.xpath(
                    "//div[@id='collapse62019-20']//div[@class='tab-content clearfix']").extract()

                if len(modules_en) > 0:
                    item['modules_en'] = remove_class(clear_lianxu_space([modules_en[-1]]))
                if item['modules_en'] == "":
                    item['modules_en'] = None
                    print("*** modules_en")
                else:
                    print("===", item['modules_en'])
                    del_cont = re.findall(r"<br>Find out more</p><div><span>.*?</em></span>", item['modules_en'])
                    print("del_cont==", del_cont)
                    if len(del_cont) > 0:
                        for delc in del_cont:
                            item['modules_en'] = item['modules_en'].replace(delc, '<div>').strip()
                print("item['modules_en']: ", item['modules_en'])

                assessment_en = response.xpath(
                    "//a[contains(text(),'How You Are Assessed')]/../../..|//a[contains(text(),'How you are assessed')]/../../..").extract()
                if len(assessment_en) > 0:
                    item['assessment_en'] = remove_class(clear_lianxu_space([assessment_en[-1]]))
                # print("item['assessment_en']: ", item['assessment_en'])

                interview_desc_en = response.xpath(
                    "//a[contains(text(),'Interviews & Applicant Days')]/../../..").extract()
                if len(interview_desc_en) > 0:
                    item['interview_desc_en'] = remove_class(clear_lianxu_space([interview_desc_en[-1]]))
                # print("item['interview_desc_en']: ", item['interview_desc_en'])

                alevel = response.xpath(
                    "//*[contains(text(),'GCE Advanced Levels')]/text()|//*[contains(text(),'A Level')]/text()").extract()
                if len(alevel) > 0:
                    item['alevel'] = clear_lianxu_space([alevel[-1]])
                print("item['alevel']: ", item['alevel'])

                ib = response.xpath(
                    "//p[contains(text(),'International Baccalaureate')]").extract()
                if len(ib) > 0:
                    item['ib'] = remove_tags(clear_lianxu_space([ib[-1]]))
                # print("item['ib']: ", item['ib'])

                rntry_requirements = response.xpath(
                    "//a[contains(text(),'Entry Requirements')]/../../..|//a[contains(text(),'Entry requirements')]/../../..").extract()
                if len(rntry_requirements) > 0:
                    rntry_requirements = remove_tags(clear_lianxu_space([rntry_requirements[-1]]))
                # print("rntry_requirements: ", rntry_requirements)

                ielts = re.findall(r"IELTS.{1,80}", rntry_requirements)
                item['ielts_desc'] = ''.join(ielts).strip()
                # print("item['ielts_desc']: ", item['ielts_desc'])

                ielts_dict = get_ielts(item['ielts_desc'])
                item['ielts'] = ielts_dict.get('IELTS')
                item['ielts_l'] = ielts_dict.get('IELTS_L')
                item['ielts_s'] = ielts_dict.get('IELTS_S')
                item['ielts_r'] = ielts_dict.get('IELTS_R')
                item['ielts_w'] = ielts_dict.get('IELTS_W')
                # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

                career = response.xpath("//div[@id='CourseCareersApms']").extract()
                item['career_en'] = remove_class(clear_lianxu_space(career))
                # print("item['career_en']: ", item['career_en'])

                # if item['ielts_desc'] == "":
                #     item['ielts_desc'] = "Prospective students require IELTS 6.0 (with no less than 5.5 in each band score) or an equivalent qualification. Please note that some courses require a higher score."
                #     item['ielts'] = 6.0
                #     item['ielts_l'] = 5.5
                #     item['ielts_s'] = 5.5
                #     item['ielts_r'] = 5.5
                #     item['ielts_w'] = 5.5
                # print("******item['ielts_desc']: ", item['ielts_desc'])
                # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

                # http://www.lincoln.ac.uk/home/studywithus/internationalstudents/englishlanguagerequirementsandsupport/englishlanguagerequirements/
                if item['ielts'] == "6.5":
                    item['toefl'] = 90
                    item['toefl_l'] = 20
                    item['toefl_s'] = 22
                    item['toefl_r'] = 21
                    item['toefl_w'] = 22
                elif item['ielts'] == "7.0":
                    item['toefl'] = 100
                    item['toefl_l'] = 22
                    item['toefl_s'] = 23
                    item['toefl_r'] = 23
                    item['toefl_w'] = 23
                # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
                #         item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

                # http://www.lincoln.ac.uk/home/studywithus/internationalstudents/entryrequirementsandyourcountry/china/
                item["require_chinese_en"] = remove_class(clear_lianxu_space(["""<div class="panel">
<div class="panel-heading">
<h4 class="panel-title">
<a data-toggle="collapse" href="#countryUndergraduateTab">
<em class="more-less glyphicon glyphicon-menu-down"></em>Undergraduate Entry
</a>
</h4>
</div>
<div id="countryUndergraduateTab" class="panel-collapse collapse">
<div class="panel-body">
<p>Prospective students require one of the following qualifications for entry into year one of an undergraduate degree:</p>
<ul>
<li>Successful completion of a Foundation programme with a minimum of 50% plus an average of 70% or above in High School. Please note that some programmes may require a higher foundation score e.g. 60%.</li>
<li>Successful completion of the first year of a Chinese degree / Diploma with an average grade of 70% or above.</li>
</ul>
<p><strong>&nbsp;</strong></p>
<p><strong>HND Students (BTEC and SQA)</strong></p>
<p>Students who have successfully completed a HND BTEC or SQA qualification may be accepted directly into year two or three of a University of Lincoln undergraduate course on a case by case basis.</p>
<p><strong>Chinese Degree / Diploma</strong></p>
<p>Students who have successfully completed the second or third year of a Chinese Degree or Diploma may be considered for direct entry into year two or three of a University of Lincoln undergraduate course on a case by case basis. For more information, please contact the International Admissions team:&nbsp;<a href="mailto:intadmissions&#64;lincoln&#46;ac&#46;uk">intadmissions&#64;lincoln&#46;ac&#46;uk</a>.</p>
<p>&nbsp;</p>	
<!-- START ADVANCED ENTRY (UNDERGRADUATE) -->
<p><strong>Advanced Entry (Undergraduate)</strong></p>
<p>Depending on your academic background and intended course of study, it may be possible to apply for advanced entry into year 2 or 3 of a University of Lincoln undergraduate course.</p>

<!-- START COUNTRY SPECIFIC ADVANCED ENTRY (UNDERGRADUATE) -->


<!-- END COUNTRY SPECIFIC ADVANCED ENTRY (UNDERGRADUATE) -->

<p id="advEntryUgEu">For more information, please contact the Student Administration Team: <a href="mailto:[email protected]">[email protected]</a>.</p>
<p id="advEntryUgInternational">For more information, please contact the International Admissions Team: <a href="mailto:[email protected]">[email protected]</a>.</p>
<!-- END ADVANCED ENTRY (UNDERGRADUATE) -->
</div>
</div>					
</div>
"""]))
                # print("item['require_chinese_en']: ", item['require_chinese_en'])

                item['apply_proces_en'] = "http://www.lincoln.ac.uk/home/studywithus/undergraduatestudy/howtoapply/"
                # print("item['apply_proces_en']: ", item['apply_proces_en'])
                yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] + str(item['degree_type']) + ".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

예제 #24

파일 보기

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Surrey"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item[
            'location'] = '01SE01, Senate House, University of Surrey, Guildford, Surrey GU2 7XH'
        # print("item['location'] = ", item['location'])
        print("===============================")
        print(response.url)
        try:
            overview = response.xpath(
                "//h3[contains(text(),'Course facts')]/../preceding-sibling::*"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en'] = ", item['overview_en'])

            career = response.xpath(
                "//h2[contains(text(),'Careers')]/preceding-sibling::*[1]/following-sibling::*[position()<last()-3]"
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # if item['career_en'] == "":
            #     print("***career_en")
            # print("item['career_en'] = ", item['career_en'])

            modules = response.xpath(
                "//div[@class='module-list']/following-sibling::*[1]/preceding-sibling::*"
            ).extract()
            # modules1 = response.xpath("//div[@id='modules-ft']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # if item['modules_en'] == "":
            #     print("***modules_en")
            # print("item['modules_en'] = ", item['modules_en'])

            assessment_en = response.xpath(
                "//h2[contains(text(),'Teaching')]/preceding-sibling::*[1]/following-sibling::*[position()<7]|"
                "//h2[contains(text(),'Assessment')]/preceding-sibling::*[1]/following-sibling::*[position()<3]"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # if item['assessment_en'] == "":
            #     print("***assessment_en")
            # print("item['assessment_en'] = ", item['assessment_en'])

            # //a[contains(text(), 'Faculty of')]|//a[contains(text(), 'School of')]
            department = response.xpath(
                "//a[contains(text(), 'Faculty of')]//text()|"
                "//a[contains(text(), 'School of')]//text()").extract()
            item['department'] = remove_class(
                clear_lianxu_space(department)).replace(
                    "academic staff in the", "").strip()
            # if item['department'] == "":
            #     print("***department")
            # print("item['department'] = ", item['department'])

            entry_requirements = response.xpath(
                "//div[@id='entry-collapse']").extract()
            item['apply_desc_en'] = remove_class(
                clear_lianxu_space(entry_requirements))
            # print("item['apply_desc_en'] = ", item['apply_desc_en'])

            alevel = response.xpath(
                "//h3[contains(text(),'A-level')]/following-sibling::*[1]//text()"
            ).extract()
            alevel_str = ''.join(alevel).strip()
            if alevel_str == "Overall:" or alevel_str == "Overall":
                alevel = response.xpath(
                    "//h3[contains(text(),'A-level')]/following-sibling::*[position()<4]//text()"
                ).extract()
                alevel_str = ''.join(alevel).replace(
                    "Overall", "").strip().strip(":").strip()
                # print("***alevel")
            item['alevel'] = clear_space_str(alevel_str)
            # print("item['alevel'] = ", item['alevel'])

            ib = response.xpath(
                "//h3[contains(text(),'International Baccalaureate')]/following-sibling::*[1]//text()"
            ).extract()
            ib_str = ''.join(ib).strip()
            if ib_str == "Overall:":
                ib = response.xpath(
                    "//h3[contains(text(),'International Baccalaureate')]/following-sibling::*[2]//text()"
                ).extract()
                ib_str = ''.join(ib).strip()
                # print("***ib")
            item['ib'] = ib_str
            # print("item['ib'] = ", item['ib'])

            ielts_str = response.xpath(
                "//div[@id='entry-collapse']//h2[contains(text(),'English')]/following-sibling::p[position()<4]//text()"
            ).extract()
            ielts_re = re.findall(r"^IELTS.{1,80}", ''.join(ielts_str))
            item['ielts_desc'] = ''.join(ielts_re).strip()
            # print("item['ielts_desc'] = ", item['ielts_desc'])

            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['IELTS'] = %sitem['IELTS_L'] = %sitem['IELTS_S'] = %sitem['IELTS_R'] = %sitem['IELTS_W'] = %s==" % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            application_open_date = response.xpath(
                "//div[@class='p-3 p-xl-4 text-center text-light']//text()"
            ).extract()
            clear_space(application_open_date)
            # print("application_open_date: ", ''.join(application_open_date))
            item['application_open_date'] = getStartDate(
                ''.join(application_open_date))
            # if item['application_open_date'] == "":
            #     print("***application_open_date")
            # print("item['application_open_date'] = ", item['application_open_date'])

            tuition_fee = response.xpath(
                "//div[@id='fees']//tbody//tr[1]/td[last()-1]//text()"
            ).extract()
            # print("tuition_fee: ", tuition_fee)
            if len(tuition_fee) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))

            if item['tuition_fee'] == 0:
                item['tuition_fee'] = None
            else:
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee'] = ", item['tuition_fee'])
            print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])

            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """ <h2>Process</h2>
<ol><li>Choose the programmes you want to study. Still undecided? Search our <a href="/undergraduate">undergraduate degrees</a></li>
<li>Find out <a href="/apply/undergraduate/how-to-apply-through-ucas">how to apply through UCAS</a></li>
<li>Wait for universities to make their decisions, <a href="/apply/undergraduate/after-you-apply">learn what happens after you apply</a></li>
<li>Reply to your <a href="/apply/undergraduate/your-offer">university offers</a></li>
<li><a href="/apply/undergraduate/your-offer">Confirm your university place</a></li>
</ol>"""
                ]))
            # print("item['apply_proces_en'] = ", item['apply_proces_en'])

            # https://www.surrey.ac.uk/china/entry-requirements
            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<h2>Undergraduate</h2>
<p>We do not accept the Chinese National University Entrance Examination. However, you can apply to study for an <a href="http://isc.surrey.ac.uk/programmes/international-foundation-year?ch=uniweb&amp;cc=uniweb&amp;cid=uniweb&amp;utm_source=signposting&amp;utm_medium=signposting&amp;utm_campaign=uniweb&amp;_ga=2.246594701.825790074.1509959240-87246970.1500115796">International Foundation Year</a> at our <a href="http://isc.surrey.ac.uk/">International Study Centre</a>, which will prepare you for a full undergraduate degree course.</p>"""
                ]))

            # 专业、学位类型
            programme_en = response.xpath(
                "//h1[@class='text-center my-0']//text()").extract()
            programme_en_str = (''.join(programme_en).split("–"))[0].strip()
            print(programme_en_str)

            if "2019" in ''.join(programme_en):
                item['start_date'] = '2019'
            # print("item['start_date'] = ", item['start_date'])

            # 判断可以拆分几条数据，ucascode、duration、degree_name
            is_degree_name = response.xpath(
                "//tbody[@class='w-100']/tr").extract()
            # print("is_degree_name: ", is_degree_name)
            print(len(is_degree_name))
            for i in range(len(is_degree_name)):
                print("****************" + str(i + 1) + "***************")
                degree_name_re = re.findall(r"\w+\s\(Hons\).*|\w+$",
                                            programme_en_str)
                if len(degree_name_re) > 0:
                    item['degree_name'] = ''.join(degree_name_re).strip()
                    item['programme_en'] = programme_en_str.replace(
                        item['degree_name'], '').strip()
                else:
                    item['programme_en'] = programme_en_str
                print("item['programme_en'] = ", item['programme_en'])

                degree_name_xpath = response.xpath(
                    "//tbody[@class='w-100']//tr[" + str(i + 1) +
                    "]/td[1]//text()").extract()
                clear_space(degree_name_xpath)
                item['degree_name'] = ''.join(degree_name_xpath).strip()
                print("item['degree_name'] = ", item['degree_name'])

                duration = response.xpath("//tbody[@class='w-100']//tr[" +
                                          str(i + 1) +
                                          "]/td[2]//text()").extract()
                clear_space(duration)
                # print("duration: ", duration)
                if len(duration) != 0:
                    duration_list = getIntDuration(''.join(duration))
                    # print("duration_list: ", duration_list)
                    if len(duration_list) == 2:
                        item['duration'] = duration_list[0]
                        item['duration_per'] = duration_list[-1]
                print("item['duration'] = ", item['duration'])
                print("item['duration_per'] = ", item['duration_per'])

                ucascode = response.xpath("//tbody[@class='w-100']//tr[" +
                                          str(i + 1) +
                                          "]/td[4]//text()").extract()
                clear_space(ucascode)
                item['ucascode'] = ''.join(ucascode).strip()
                print("item['ucascode']: ", item['ucascode'])

                tick = response.xpath(
                    "//tbody[@class='w-100']//tr[" + str(i + 1) +
                    "]/td[3]//i[@class='icon icon-tick']").extract()
                clear_space(tick)
                print("tick: ", tick)
                print(len(tick))
                if len(tick) == 1:
                    item['other'] = 'Professional Training'
                print("item['other']: ", item['other'])
                yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a+',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

예제 #25

파일 보기

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Reading"
        # item['country'] = 'England'
        # item['website'] = 'http://www.reading.ac.uk/'
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item['location'] = "Whiteknights,PO Box 217,Reading, Berkshire,RG6 6AH"
        print("===========================")
        print(response.url)
        try:
            # 专业、学位类型、ucas_code
            programmeDegree_typeUcascode = response.xpath(
                "//span[@class='text-bg-standout text-nice-wrap']/text() | //h1[@id='heading']//text() | //h1[@class='hero-heading']//text() | //h1[@class='block-heading block-heading-l5 block-heading-b5 block-heading-md-l-reset cell-md-t0']//text()"
            ).extract()
            clear_space(programmeDegree_typeUcascode)
            programmeDegree_typeUcascode = ''.join(
                programmeDegree_typeUcascode).strip()
            # print("programmeDegree_typeUcascode: ", programmeDegree_typeUcascode)

            degree_type = re.findall(r"^\w+/\w+", programmeDegree_typeUcascode)
            if len(degree_type) == 0:
                degree_type = re.findall(r"^\w+", programmeDegree_typeUcascode)
            # print("degree_type: ", degree_type)
            item['degree_name'] = ''.join(degree_type)
            print("item['degree_name']: ", item['degree_name'])

            ucascode = re.findall(r"\w{4}$", programmeDegree_typeUcascode)
            item['ucascode'] = ''.join(ucascode).strip()
            # print("item['ucascode']: ", item['ucascode'])

            programme = programmeDegree_typeUcascode.replace(
                item['degree_name'], '').replace(item['ucascode'], "").strip()
            item['programme_en'] = programme.title()
            print("item['programme_en']: ", item['programme_en'])

            # duration
            durationMode = response.xpath(
                "//h2[@class='row-margin-small text-weight-medium text-size-25']/text() | //strong[contains(text(),'Duration')]/../text() | //h3[contains(text(),'Programme length:')]/following-sibling::p[1]//text()"
            ).extract()
            clear_space(durationMode)
            # print("durationMode: ", durationMode)
            durationMode = ''.join(durationMode)
            duration_list = getIntDuration(''.join(durationMode))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration']: ", item['duration'])
            # print("item['duration_per']: ", item['duration_per'])

            # start_date = response.xpath("//p[@class='headline'][contains(text(), 'Start date')]//text()").extract()
            # # print(start_date)
            # item['start_date'] = getStartDate(''.join(start_date))
            # # print("item['start_date']: ", item['start_date'])

            overview2 = response.xpath(
                "//div[@class='m-bg-white m-pad-around m-pull-left-normal m-pull-up']//div[@class='theme-editor'] | //div[@id='top-courseOverview'] | //html//div[@id='top-programmeOverview']/h2[1]/following-sibling::div[1] | //div[@id='tc1']"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview2))
            # if item['overview_en'] == "":
            #     print("***overview_en")
            # print("item['overview_en']: ", item['overview_en'])

            # department
            item['department'] = response.meta['department']
            # print("item['department']: ", item['department'])

            if item['department'] == "":
                department = response.xpath(
                    "//aside[contains(@class,'pane base4 m-margin-bottom')]//div[contains(@class,'row-small')]//p[contains(text(), 'School')]/following-sibling::*//text()"
                ).extract()
                clear_space(department)
                item['department'] = ''.join(department).strip()
                item['department'] = item['department'].replace(
                    "How to apply",
                    "").replace("Visit the",
                                "").replace("website",
                                            "").strip().strip('.').strip()
                # print("item['department']1: ", item['department'])
            # if item['department'] == "":
            #     print("***department")

            # //h2[@id='Panel1Trigger']/../..
            entry_requirements = response.xpath(
                "//span[contains(text(),'entry requirements')]/../../.."
            ).extract()
            entry = ''.join(entry_requirements).strip()
            item['apply_desc_en'] = remove_class(
                clear_lianxu_space(entry_requirements))
            # if item['apply_desc_en'] == "":
            #     print("apply_desc_en 为空")
            # print("item['apply_desc_en']: ", item['apply_desc_en'])

            alevel = response.xpath(
                "//h4[contains(text(),'Typical')]/following-sibling::*[1]//text()|//h4[contains(text(),'A level')]/following-sibling::*[1]//text()"
            ).extract()
            item['alevel'] = ''.join(alevel).strip()
            # if item['alevel'] == "":
            #     print("alevel 为空")
            # print("item['alevel']: ", item['alevel'])

            ib = response.xpath(
                "//h4[contains(text(),'International Baccalaureate')]/following-sibling::*[1]//text()"
            ).extract()
            item['ib'] = ''.join(ib).strip()
            # if item['ib'] == "":
            #     print("ib 为空")
            # print("item['ib']: ", item['ib'])

            ielts = re.findall(r"IELT.{1,100}", entry)
            ielts = response.xpath(
                "//*[contains(text(),'IELT')]//text()").extract()
            if ''.join(ielts).strip() == "IELTS":
                ielts = response.xpath(
                    "//*[contains(text(),'IELT')]/following-sibling::*[1]//text()"
                ).extract()
            clear_space(ielts)
            item['ielts_desc'] = ''.join(ielts).strip()
            # if item['ielts_desc'] == "":
            #     print("ielts_desc 为空")
            # print("item['ielts_desc']: ", item['ielts_desc'])
            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            # toefl = re.findall(r"TOEFL[\s\(\)\w:\.]{1,300}", entry)
            # if item['toefl_desc'] == "":
            #     item['toefl_desc'] = ''.join(toefl)
            # print("item['toefl_desc']: ", item['toefl_desc'])
            # toeflDict = get_toefl(item['toefl_desc'])
            # item['toefl'] = toeflDict.get("TOEFL")
            # item['toefl_l'] = toeflDict.get("TOEFL_L")
            # item['toefl_s'] = toeflDict.get("TOEFL_S")
            # item['toefl_r'] = toeflDict.get("TOEFL_R")
            # item['toefl_w'] = toeflDict.get("TOEFL_W")
            # # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #         item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            modules = response.xpath(
                "//h2[@id='Panel2Trigger']/../..|//div[@id='bottom-courseContent']/..|//div[@id='page_content_wrap']/following-sibling::div[position()<3]|//strong[contains(text(),'Programme structure')]/../following-sibling::*"
            ).extract()
            if len(modules) == 0:
                modules = response.xpath(
                    "//h4[contains(text(),'Programme structure and content')]/preceding-sibling::*[1]/following-sibling::*[position()<11]"
                ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            # //h2[@id='Panel1Trigger']/../..
            career = response.xpath(
                "//h2[@id='Panel4Trigger']/../following-sibling::div[1]|//div[@id='bottom-careers']/..|//div[@id='careers']|//h3[contains(text(),'Careers')]/.."
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            # //h3[@class='row-margin-small text-weight-medium'][contains(text(),'How much will it cost?')]/following-sibling::p[2]
            tuition_fee = response.xpath(
                "//p[contains(text(),'New international students')]//text()"
            ).extract()
            clear_space(tuition_fee)
            # print("tuition_fee: ", tuition_fee)
            tuition_fee_re = re.findall(r"£\d+,\d+|£\d+", ''.join(tuition_fee))
            if len(tuition_fee_re) > 0:
                item['tuition_fee'] = int(''.join(tuition_fee_re).replace(
                    "£", "").replace(",", "").strip())

            if item['tuition_fee'] == 0:
                item[tuition_fee] = None
            else:
                item['tuition_fee_pre'] = "£"
            # if item['tuition_fee'] is None:
            #     print("tuition_fee 为空")
            # print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            # //div[@id='top-howWeTeachYou']
            assessment_en = response.xpath(
                "//div[@id='top-howWeTeachYou']").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<div class="row row-margin-small row-margin-title-10">

                        <h1 class="text-transform-uppercase text-size-30 m-text-size-25 text-weight-medium display-inline text-bg-standout text-nice-wrap">
                            <span class="text-bg-standout">How to apply for undergraduate courses</span>
                        </h1>

                    </div>
                                    <div class="theme-editor theme-editor-break-word">
                        You can apply online for all of our courses via the national admissions service, <a href="http://www.ucas.com">UCAS</a>. You can choose to apply for up to five courses in total, including more than one course at the same institution. <br />
<h4>When to apply&nbsp;</h4>
<p>UK or EU students: You should aim to apply via UCAS between 1 September and 15 January for admission in September 2018. If you have missed the 15 January deadline, there is still the opportunity to apply (via UCAS), and we are happy to consider late applications until 30 June 2018 (all applications received after 30 June are entered into Clearing). Please be aware that some of our courses may be full after the UCAS deadline, so we do recommend early applications where possible.</p>
<p>
International students: You should aim to apply via UCAS between 1 September 2017 and 15 January 2018 for admission in September 2018, though applying before 15 January is encouraged in order to ensure you have time to prepare for studying in the UK. However, if you have missed the 15 January deadline, you are still welcome to apply (via UCAS), and we are happy to consider late applications until 30 June 2018 (all applications received after 30 June are entered into Clearing). Please be aware that some of our courses may be full after the UCAS deadline, so we do recommend early applications where possible.</p>
<h4>UCAS code</h4>
<p>Our UCAS code is R12. The University does not have a campus code.&nbsp;</p>
<h4>UCAS costs</h4>
<p>There is a small charge made by UCAS for applying to university. The application fee is &pound;13 if you&rsquo;re applying to just one course, or &pound;24 for multiple courses and for late applications sent after 30 June.</p>
<h4>Entry requirements</h4>
<p>Please read our <a href="/ready-to-study/study/how-to-apply/entry-requirements-ug.aspx">entry requirements page</a> for more information on accepted qualifications.</p>
<h4>English language requirements</h4>
<p>If English is not your first language, you can find out more information on our <a href="/ready-to-study/international-and-eu/english-language-requirements.aspx">English language requirements</a> page.</p>
                    </div>
                                    <div class="row-large paddingtop-small pad-sides border-top-light">
                        <div class="visuallyhidden" id="show-more-094422b2-b9da-4602-9594-80e05dba925c" aria-hidden="true">
                            <div class="theme-editor">
                                <h4>The application process&nbsp;
</h4>
<p>Once UCAS receives your application, it sends it to our Admissions Office, who assess it and decide whether to offer you a place. The way we assess your application will differ from course to course, but we will use the information supplied in your application form including your personal statement, predicted and achieved grades and the reference supplied by your school or college.&nbsp;</p>
<p>We carefully consider every application so please don’t worry if you don’t hear back from us straight away. We aim to make a decision on all applications within four weeks, and you will be able to track the progress of your application on <a href="https://www.ucas.com/ucas/undergraduate/login">UCAS Track</a>.&nbsp;</p>
<p>We will email you with the outcome of your application and confirm this with UCAS so that you can see the decision online using UCAS Track. If we offer you a place, we will explain any conditions attached to that offer (for example, the need to achieve certain grades in your examinations).&nbsp;</p>
<h4>Interviews</h4>
<p> For some courses, we invite prospective students for an interview before making an offer. These are:&nbsp;</p>
<ul><li>Accounting and Business (assessment centre run in conjunction with PwC)&nbsp;</li>
    <li>Archaeology&nbsp;</li>
    <li>Art</li>
    <li>Chemistry&nbsp;</li>
    <li>Film, Theatre &amp; Television&nbsp;</li>
    <li>Food and Nutritional Sciences&nbsp;</li>
    <li>Graphic Communication&nbsp;</li>
    <li>Pharmacy&nbsp;</li>
    <li>Primary Education&nbsp;</li>
    <li>Psychology (MSci courses)&nbsp;</li>
    <li>Meteorology and Climate (MMet course)&nbsp;</li>
    <li>Theatre Arts, Education and Deaf Studies (TAEDS)&nbsp;</li>
</ul>
<h4>Visit Days</h4>
<p> If you are offered a place to study at the University of Reading without an interview, we will invite you to attend a Visit Day in your department of choice. Visit Days take place between November and March and will usually include a tour of our campus and facilities, a visit to a hall of residence, and the chance to meet academic staff and current students.&nbsp;</p>
<h4>Choosing offers&nbsp;</h4>
<p>Once you have heard from all of the universities that you applied to, UCAS will ask you which offer you want to accept. Most people choose two: one as your ‘firm’ or first choice, the other as your ‘insurance’ or second choice. If you meet the conditions of your offer, you will automatically be accepted onto your firm choice course.&nbsp;</p>
<h4>Confirmation of your place&nbsp;</h4>
<p>Most offers are conditional on exam results. If you meet the conditions set out in our offer, your place is assured and you will see this on <a href="https://www.ucas.com/ucas/undergraduate/login">UCAS Track</a> . If you do not meet the conditions set out in your offer, you may still be able to get on the course. We will let you know as soon as possible after we have received your results.&nbsp;</p>
<h4>Gap year/deferred entry&nbsp;</h4>
<p>We welcome deferred entry applications. You need to apply at the same time as if you were planning to go straight to university, but you should state in your UCAS application that you wish to be considered for deferred admission.</p>
                            </div></div></div>"""
                ]))
            print("item['apply_proces_en']: ", item['apply_proces_en'])

            item['interview_desc_en'] = remove_class(
                clear_lianxu_space([
                    """<h4>Interviews</h4>
<p> For some courses, we invite prospective students for an interview before making an offer. These are:&nbsp;</p>
<ul><li>Accounting and Business (assessment centre run in conjunction with PwC)&nbsp;</li>
    <li>Archaeology&nbsp;</li>
    <li>Art</li>
    <li>Chemistry&nbsp;</li>
    <li>Film, Theatre &amp; Television&nbsp;</li>
    <li>Food and Nutritional Sciences&nbsp;</li>
    <li>Graphic Communication&nbsp;</li>
    <li>Pharmacy&nbsp;</li>
    <li>Primary Education&nbsp;</li>
    <li>Psychology (MSci courses)&nbsp;</li>
    <li>Meteorology and Climate (MMet course)&nbsp;</li>
    <li>Theatre Arts, Education and Deaf Studies (TAEDS)&nbsp;</li>
</ul>"""
                ]))
            print("item['interview_desc_en']: ", item['interview_desc_en'])

            item["require_chinese_en"] = remove_class(
                clear_lianxu_space([
                    """<h2 class="trigger">Entry requirements</h2>
<table summary="A table outlining the basic entry requirements for courses at the University of Reading based on the qualifications offered in your country">
<tbody>
<tr><!-- HEADINGS-->
<td class="top-head"><strong>Your highest qualification</strong></td>
<td class="top-head"><strong>Likely entry level</strong></td></tr>
<tr><!-- FIRST ROW -->
<td>
<p><!-- EG FIRST ROW FIRST COLUMN INFO -->High School year 2 (Year 11) with leaving certificate: GPA 85%<br/>High School year 3 (Year 12) with graduation certificate: GPA 80%</p></td>
<td><a href="http://www.reading.ac.uk/foundation" name="ifp" >International Foundation Programme</a> </td></tr>
<tr class="even"><!-- SECOND ROW -->
<td>Gao Kao (Chinese University Entrance Exam) 80%</td>
<td><a href="http://www.reading.ac.uk/foundation" name="ifp" >International Foundation Programme</a> </td></tr>
<tr><!-- THIRD ROW -->
<td>Gau Cau (Chinese University Entrance Exam) combined with a successfully completed appropriate foundation/bridging programme. (Visit our <a href="http://www.reading.ac.uk/foundation" name="ifp" >International Foundation Programme</a>) </td>
<td>Undergraduate Degree (Bachelors Degree) </td></tr>
<tr class="even"><!-- FOURTH ROW -->
<td>International Baccalaureate (IB) Diploma </td>
<td>Undergraduate Degree (Bachelors Degree) </td></tr>
<tr><!-- FIFTH ROW -->
<td>British/International A Levels </td>
<td>Undergraduate Degree (Bachelors Degree) </td></tr>
<tr class="even"><!-- SIXTH ROW -->
<td>Chinese-medium A Levels in Mathematics and Sciences (Cambridge Examinations Board) </td>
<td>Undergraduate Degree (Bachelors Degree) in a relevant subject </td></tr>
<tr>
<td>Ameson: High school results of 85% if 11 years completed, 80% if 12 years (with similar grades in relevant subjects), AST Maths: 165 and AST English: 150</td>
<td>Undergraduate Degree (Bachelors Degree)</td></tr>
<tr class="even"><!-- EIGHTH ROW -->
<td>Other international qualifications such as Australian HSC, US SAT or AP Certificates</td>
<td>Undergraduate Degree (Bachelors Degree) </td></tr>
<tr><!-- NINTH ROW -->
<td>Successfully completed first year of a Chinese University degree </td>
<td>Undergraduate Degree (Bachelors Degree) </td></tr>
<tr class="even"><!-- TENTH ROW -->
<td>4-year Bachelor degree </td>
<td>Taught Postgraduate (Masters and Doctoral Degree) </td></tr>
<tr>
<td>&nbsp;Masters degree study </td>
<td>&nbsp;Research Postgraduate (Doctoral Degree) </td></tr></tbody></table>"""
                ]))
            print("item['require_chinese_en']: ", item['require_chinese_en'])
            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a+',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

예제 #26

파일 보기

파일: UniversityOfChichester_U.py 프로젝트: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Chichester"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item[
            'location'] = 'University of Chichester, College Lane, Chichester, West Sussex, PO19 6PE'
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            department = response.xpath(
                "//div[@class='breadcrumb']//a[2]//text()").extract()
            item['department'] = ''.join(department).strip()
            # print("item['department']: ", item['department'])

            programmeDegreetype = response.xpath(
                "//div[@class='field-items accordion-content']/h2//text()"
            ).extract()
            # print("programmeDegreetype: ", programmeDegreetype)
            programmeDegreetypeStr = ''
            if len(programmeDegreetype) > 0:
                programmeDegreetypeStr = programmeDegreetype[0].strip()
            # print("programmeDegreetypeStr: ", programmeDegreetypeStr)

            degree_type = re.findall(
                r"^\w+/\w+|^\w+.*/\s\w+|^\w+\s\(Hons\)|^\w+/\w+\s\(Hons\)|^\w+",
                programmeDegreetypeStr, re.I)
            degree_name_str = ''.join(degree_type).strip()
            item['degree_name'] = degree_name_str.replace(
                "(Hons)", "").replace("(HONS)", "").strip()
            print("item['degree_name']: ", item['degree_name'])

            programme = programmeDegreetypeStr.replace(degree_name_str, '')
            item['programme_en'] = ''.join(programme).replace(
                "(Hons)", "").title().strip().strip('-').strip()
            print("item['programme_en']: ", item['programme_en'])

            ucascode = response.xpath(
                "//p[contains(text(),'UCAS ')]//text()").extract()
            clear_space(ucascode)
            # print("ucascode: ", ucascode)
            if len(ucascode) > 0:
                item['ucascode'] = ''.join(ucascode[0]).replace("UCAS",
                                                                "").strip()
            print("item['ucascode'] = ", item['ucascode'])

            alevel = response.xpath(
                "//*[contains(text(),'A levels')]//text()").extract()
            item['alevel'] = clear_lianxu_space(alevel)
            # print("item['alevel']: ", item['alevel'])

            ib = response.xpath(
                "//*[contains(text(),'International Baccalaureate')]//text()"
            ).extract()
            item['ib'] = clear_lianxu_space(ib)
            # print("item['ib']: ", item['ib'])

            entry_requirements = response.xpath(
                "//section//div[@class='field field-name-field-main-content field-type-text-long field-label-hidden']//div[@class='field-items accordion-content']//h2[contains(text(), 'Entry')]/..//text()|"
                "//section//div[@class='field field-name-field-main-content field-type-text-long field-label-hidden']//div[@class='field-items accordion-content']//h2[contains(text(), 'ENTRY')]/..//text()"
            ).extract()
            # print("==", entry_requirements)
            rntry_requirements = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts_desc = response.xpath(
                "//*[contains(text(),'IELTS')]/text()").extract()
            clear_space(ielts_desc)
            # print("ielts_desc: ", ielts_desc)
            item['ielts_desc'] = ''.join(ielts_desc)
            # print("item['ielts_desc']: ", item['ielts_desc'])

            overview_en = response.xpath(
                "//span[contains(text(),'Course content')]/../..").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))
            # print("item['overview_en']: ", item['overview_en'])

            career_en = response.xpath(
                "//span[contains(text(),'Where this can take you')]/../.."
            ).extract()
            item['career_en'] = remove_class(
                clear_lianxu_space(career_en)).replace("<div></div>",
                                                       "").strip()
            # print("item['career_en']: ", item['career_en'])

            modules = response.xpath(
                "//span[contains(text(),'Indicative modules')]/../..").extract(
                )
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            assessment_en = response.xpath(
                "//span[contains(text(),'Teaching and assessment')]/../.."
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            # interview_desc_en = response.xpath("//span[contains(text(),'Teaching and assessment')]/../..").extract()
            # item['interview_desc_en'] = remove_class(clear_lianxu_space(interview_desc_en))
            # print("item['interview_desc_en']: ", item['interview_desc_en'])

            duration_url = response.xpath(
                "//iframe[@id='unistats-widget-frame']/@src").extract()
            clear_space(duration_url)
            print("duration_url: ", duration_url)
            if len(duration_url) > 0:
                data = etree.HTML(
                    requests.get(duration_url[0], headers=self.headers).text)
                duration = data.xpath(
                    "//p[contains(text(),'Full time')]//text()")
                clear_space(duration)
                print("duration: ", duration)
                duration_list = getIntDuration(''.join(duration))
                if len(duration_list) == 2:
                    item['duration'] = duration_list[0]
                    item['duration_per'] = duration_list[-1]
            print("item['duration'] = ", item['duration'])
            print("item['duration_per'] = ", item['duration_per'])

            item['ielts'] = 6.0
            item['ielts_l'] = 5.5
            item['ielts_s'] = 5.5
            item['ielts_r'] = 5.5
            item['ielts_w'] = 5.5

            # https://www.chi.ac.uk/study-us/fees-finance/tuition-fees
            item['tuition_fee'] = 13000
            item[
                'require_chinese_en'] = """<p>Senior Secondary School Certificate PLUS an International Foundation</p>
<p>Year OR Senior Secondary School Certificate 80% +</p>"""
            # https://www.chi.ac.uk/international/how-apply/undergraduate-applications
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<div class="col-2-3">
      
              <h1 class="title" id="page-title">Undergraduate Applications</h1>
      
      
        <div class="region region-content">
    <div id="block-system-main" class="block block-system">

    
  <div class="content">
    <div id="node-2130" class="node node-content-page node-readydeploy clearfix" about="/international/how-apply/undergraduate-applications" typeof="sioc:Item foaf:Document">
  
  <div class="content">
    <div class="field field-name-field-serif-intro field-type-text-long field-label-hidden"><div class="field-items"><div class="field-item even"><p>We recommend you apply online through  <a target="_blank" href="http://www.ucas.com/apply">UCAS</a>. International students may also apply directly to the University using the University of Chichester <a target="_blank" href="https://d3mcbia3evjswv.cloudfront.net/files/International%20Application%202017-2018_1_0.doc?dt7VFSaSnVZlYb1a1vvKmEfvuuqVmsqE">International Application Form.</a></p>
</div></div></div><div class="field field-name-body field-type-text-with-summary field-label-hidden"><div class="field-items"><div class="field-item even" property="content:encoded"><h3><span class="rangySelectionBoundary" id="selectionBoundary_1424440432166_7907926958422292" style="line-height: 0; display: none;"></span><strong>Applying via UCAS</strong></h3>
<p>You can apply for up to five different degree courses at up to five different institutions through UCAS (the national Universities and Colleges Admissions Service). Your application is sent to all five  universities which you have applied to at the same time.  There is no need to choose a first choice university at this stage.</p>
<p>Each course has a UCAS code that you will find on our <a href="/search/course-search">Course pages </a>or in our prospectus. You will need to know the UCAS code for the course you want to apply for when you make your application.</p>
<p><strong>Deadlines and important dates</strong></p>
<ul><li>1 September UCAS opens for applications for courses starting in September/October the following year.</li>
<li>15 January - Recommended application date for UK and other EU applicants.</li>
<li>30 June - Closing date for international (non-EU) applicants. (We do advise you to apply earlier if possible though.)</li>
<li>July / August - applications can still be submitted via UCAS but you can only apply to one university at a time in July and August (known as "Clearing")</li>
</ul><p>When you are applying to UCAS you will also need the UCAS institution code for the university. The UCAS code for the University of Chichester is <strong>(CHICH) C58</strong>.</p>
<p>Need further information or guidance on applying?</p>
<p>Then please either contact Admissions on +44 (0)1243 816002 or email <a href="mailto:[email protected]?subject=International%20application">[email protected]</a></p>
<h4><strong>Accepting an offer of a place</strong></h4>
<p>Your university offer(s) will be notified to you via your UCAS account and you can select a first ("firm") choice and, if you wish, a second ("insurance") choice via UCAS who will then inform the universities of your decision.</p>
<p><strong>Tuition fee deposit</strong></p>
<p>If you wish to accept an offer from the University of Chichester, you will be expected to pay a deposit of £2,000 before a UKVI Certificate of Acceptance for Studies (CAS) will be issued to you.</p>
<p>The deposit will be refunded, in full, if the University withdraws the programme.</p>
<p>Otherwise, the deposit will only be refunded, minus a £250 administration charge, if the applicant provides written evidence of being refused a visa to join the programme, through no fault of his or her own. Where the applicant has not disclosed relevant previous study, or does not have sufficient funds in the bank account for the relevant period, are examples of where it would be deemed the applicant's responsibility for not securing a visa.</p>
</div></div></div></div></div></div></div></div></div>"""
                ]))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            if "/" in item['ucascode']:
                if len(item['ucascode']) > 20:
                    item['ucascode'] = ""
                print("///////////////")
                print("item['ucascode']1: ", item['ucascode'])
                ucascode_0 = item['ucascode'].split("/")
                if "/" in item['degree_name']:
                    degree_name_0 = item['degree_name'].split("/")
                else:
                    degree_name_0 = [item['degree_name'], item['degree_name']]
                for u in range(len(ucascode_0)):
                    item['ucascode'] = ucascode_0[u]
                    item['degree_name'] = degree_name_0[u]
                    yield item
            else:
                yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

예제 #27

파일 보기

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "The University of Edinburgh"
        # item['country'] = 'England'
        # item['website'] = 'https://www.ed.ac.uk/'
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        try:
            # 专业
            programme = response.xpath(
                "//h1[@itemprop='headline']//text()").extract()
            clear_space(programme)
            programme_en = ''.join(programme).strip()

            degree_name = re.findall(r"^.*?\s", programme_en)
            if len(degree_name) > 0:
                item['degree_name'] = degree_name[0].strip()
            print("item['degree_name']: ", item['degree_name'])

            item['programme_en'] = programme_en.replace(
                item['degree_name'], '').strip()
            print("item['programme_en']: ", item['programme_en'])

            department = response.xpath(
                "//div[@id='proxy_rightSummary']//p//span[contains(text(),'College:')]/../text()"
            ).extract()
            clear_space(department)
            item['department'] = ''.join(department).strip()
            # print("item['department']: ", item['department'])

            ucascode = response.xpath(
                "//span[contains(text(),'UCAS code:')]/../text()").extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).strip()
            # print("item['ucascode']: ", item['ucascode'])

            duration = response.xpath(
                "//span[contains(text(),'Duration:')]/../text()").extract()
            clear_space(duration)
            # print("duration: ", duration)

            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration']: ", item['duration'])
            # print("item['duration_per']: ", item['duration_per'])

            # //div[@class='col-xs-12']//div[@class='row']//div[@class='col-xs-12']//ul[@class='addressList']//li[@class='contactCampus']
            # location = response.xpath(
            #     "//div[@class='col-xs-12']//div[@class='row']//div[@class='col-xs-12']//ul[@class='addressList']//li[@class='contactCampus']/text()").extract()
            # clear_space(location)
            item[
                'location'] = '33 Buccleuch Place, City, Edinburgh, Post Code. EH8 9JS'
            # print("item['location']: ", item['location'])

            # # //option[@value='0010']
            # start_date = response.xpath(
            #     "//select[@name='code2']//option//text()").extract()
            # clear_space(start_date)
            # if len(start_date) > 1:
            #     item['start_date'] = start_date[0].strip()
            # # print("item['start_date']: ", item['start_date'])
            # item['start_date'] = getStartDate(item['start_date'])
            # print("item['start_date'] = ", item['start_date'])

            overview = response.xpath(
                "//div[@id='proxy_introduction']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en']: ", item['overview_en'])

            # //div[@id='proxy_collapseprogramme']
            modules = response.xpath(
                "//div[@id='proxy_collapseWhatStudy']/..").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(
                list(modules)))
            # print("item['modules_en']: ", item['modules_en'])

            assessment_en = response.xpath(
                "//div[@id='proxy_collapseLearning']/..").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            career = response.xpath(
                "//div[@id='proxy_collapseCareers']/..").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            # //div[@id='proxy_collapseentry_req']
            # entry_requirements = response.xpath(
            #     "//div[@id='proxy_collapseentry_req']/..//text()").extract()
            # item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            alevel = response.xpath(
                "//li[contains(text(),'A Levels:')]//text()|//p[contains(text(),'A levels:')]//text()"
            ).extract()
            clear_space(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[-1]).strip()
            print("item['alevel'] = ", item['alevel'])

            # ib = response.xpath(
            #     "//html//ul[1]/li[3]/abbr[contains(text(),'IB')]/..//text()|//p[contains(text(),'IB:')]//text()").extract()
            ib = response.xpath(
                "//html//ul[3]/li[3]/abbr[contains(text(),'IB')]/..//text()|//p[contains(text(),'IB:')]//text()"
            ).extract()
            clear_space(ib)
            if len(ib) > 0:
                item['ib'] = ''.join(ib).strip()
            print("item['ib'] = ", item['ib'])

            IELTS = response.xpath(
                "//abbr[contains(text(),'IELTS')]/..//text()").extract()
            item['ielts_desc'] = ''.join(IELTS)
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            TOEFL = response.xpath(
                "//abbr[contains(text(),'TOEFL')]/..//text()").extract()
            if len(TOEFL) == 0:
                TOEFL = response.xpath(
                    "//*[contains(text(),'TOEFL')]//text()").extract()
            item['toefl_desc'] = ''.join(TOEFL)
            # print("item['toefl_desc']: ", item['toefl_desc'])

            toeflDict = get_toefl(item['toefl_desc'])
            item['toefl'] = toeflDict.get("TOEFL")
            item['toefl_l'] = toeflDict.get("TOEFL_L")
            item['toefl_s'] = toeflDict.get("TOEFL_S")
            item['toefl_r'] = toeflDict.get("TOEFL_R")
            item['toefl_w'] = toeflDict.get("TOEFL_W")
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #         item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            tuition_feeDict = {}
            tuition_fee_url = response.xpath(
                "//html//div[@id='proxy_collapseFees']//p[1]/a/@href").extract(
                )
            print("tuition_fee_url: ", tuition_fee_url)
            if len(tuition_fee_url) > 0:
                tuition_fee_url_str = tuition_fee_url[0]
                fee = self.parse_tuition_fee(tuition_fee_url_str)
                clear_space(fee)
                fee_re = re.findall(r"£\d+,\d+", ''.join(fee))
                print("fee_re: ", fee_re)
                item['tuition_fee'] = getTuition_fee(''.join(fee_re))
                item['tuition_fee_pre'] = "£"
            if item['tuition_fee'] == 0:
                item['tuition_fee'] = None
                item['tuition_fee_pre'] = ""
            print("item['tuition_fee']: ", item['tuition_fee'])

            # https://www.ed.ac.uk/studying/international/country/asia/east-asia/china
            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<p class="lead">Undergraduate entry requirements for students from China.</p>


  <h2>Senior High School Certificate</h2>

<p>Students who have completed the Chinese Senior High School Certificate are required to undertake further study for entry to most subjects as this qualification does not normally meet our minimum entry requirements.</p>

<p>We accept the following qualifications for direct entry to our undergraduate degree programmes:</p>

<ul>
	<li><a class="uoe-node-link uoe-published" href="/studying/undergraduate/entry-requirements/ruk/a-levels" title="A Levels"><abbr title="General Certificate of Education">GCE</abbr> <abbr title="Advanced Level">A Levels</abbr></a></li>
	<li><a class="uoe-node-link uoe-published" href="/studying/undergraduate/entry-requirements/international/ib" >International Baccalaureate</a></li>
	<li><a class="uoe-node-link uoe-published" href="/studying/undergraduate/entry-requirements/scottish-qualifications/highers" title="SQA Highers and Advanced Highers">Scottish qualifications</a></li>
	<li><a class="uoe-node-link uoe-published" href="/studying/international/country/americas/united-states-of-america" title="United States of America"><abbr title="United States">US</abbr> qualifications</a></li>
</ul>

<p>Applicants with qualifications other than those listed above will usually be required to complete a Foundation Year before entering the University.</p>

<p><a class="uoe-node-link uoe-published" href="/studying/international/applying/foundation" title="International Foundation Programme">Foundation year</a></p>

<h2>Science and Engineering</h2>

<p>For degree programmes in Science and Engineering, applicants who have completed a year of study at a leading Chinese University may be eligible to apply.</p>

<p>The College of Science &amp; Engineering will also give consideration to applicants who have achieved excellent results in the Chinese National University Entrance Examination (Gaokao) on an individual basis.</p>

<h2>Further guidance on academic entry requirements</h2>

<p>Each course may have further specific entry requirements. All applicants must meet these requirements. Staff in the Admissions Offices will be able to provide further guidance.</p>

<p><a class="uoe-node-link uoe-published" href="/studying/undergraduate/contacts" title="Contact us with an enquiry about undergraduate study">Undergraduate admissions contacts</a></p>

<h2>English Language requirements</h2>

<p>If your first language is not English, you will also have to meet English Language requirements to apply. These requirements are listed by programme.</p>

<p><a class="uoe-node-link uoe-published" href="/studying/international/english" title="English language requirements">English Language advice</a></p>

<p><a class="uoe-node-link uoe-published" href="/studying/undergraduate/degrees" title="Degree finder">Specific English language requirement by programme</a></p>

<h2>Contact us</h2>

<p>Edinburgh Global's representative for China is Esther Sum.</p>

<p>Esther will help you with admissions advice and support.</p>

<p><a href="mailto:[email protected]">Contact us by email - [email protected]</a></p>

<h2>Support in your country</h2>

<p><a class="uoe-node-link uoe-published" href="/studying/international/application/our-visits-overseas" title="Our visits overseas">View a list of our overseas visits</a></p>

<p><a class="uoe-node-link uoe-published" href="/studying/international/agents/list/china" title="China">Our agents in your country</a></p>

<h2>Chat to us</h2>

<p>Talk to a member of staff online and view a presentation about study in Edinburgh.</p>

<p><a class="uoe-node-link uoe-published" href="/studying/international/application/chat-to-us-online" title="Online information sessions">Chat to us</a></p>

<h2>Join our mailing list</h2>

<p>We will send you further useful information about the University, admissions and entry.</p>

<p><a href="http://r1.dotmailer-surveys.com/0127judf-2e1gig1f">Join our mailing list </a></p>

<h2>About Edinburgh</h2>

<p><a class="uoe-node-link uoe-published" href="/about" title="About">More information about Edinburgh</a></p>

<p><a class="uoe-node-link uoe-published" href="/global/immigration/applying-for-visa/visa-requirements" >Do I need a visa?</a></p>

<h2>Student numbers</h2>

<p>There are almost 3,000 students students from China currently studying at the University of Edinburgh.</p>
"""
                ]))
            # print("item['require_chinese_en']: ", item['require_chinese_en'])

            item[
                'apply_proces_en'] = "https://www.ed.ac.uk/studying/undergraduate/applying"

            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")
            print("异常：", str(e))
            print("报错url：", response.url)

예제 #28

파일 보기

    def parse_data(self, response):
        """Scrape one University of Sheffield undergraduate course page.

        Builds a ``ScrapyschoolEnglandBenItem`` from XPath queries against
        ``response`` and yields it. If anything raises while extracting, the
        error text and the page URL are appended to a per-university error
        file and printed, and no item is yielded for that page.

        Args:
            response: the course page response; ``response.xpath`` and
                ``response.url`` are used.

        Yields:
            The populated item.
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "The University of Sheffield"
        item['url'] = response.url
        # Degree type
        item['degree_type'] = 1
        item['location'] = "Western Bank, Sheffield, S10 2TN, UK"
        print("===========================")
        print(response.url)
        try:
            # Programme name and degree name share one heading.
            title_parts = response.xpath(
                "//div[@class='titles']/h2//text()").extract()
            if not title_parts:
                title_parts = response.xpath(
                    "//main[@class='main content']/h2[1]//text()").extract()
            programmeDegree_type = ''.join(title_parts).strip()
            print("programmeDegree_type: ", programmeDegree_type)
            # The pattern can match the empty string, so findall always
            # returns at least one element; the first one is the trailing run
            # of letters, slashes and parentheses (the degree name).
            degree_typeList = re.findall(r"[A-Za-z/\(\)]*$",
                                         programmeDegree_type)
            programme = programmeDegree_type
            if degree_typeList:
                degree_type = degree_typeList[0]
                item['degree_name'] = degree_type
                # Cut only the trailing degree token (str.replace would also
                # remove earlier occurrences) and drop the separator left
                # behind. Computed end index, because s[:-0] would be empty.
                cut = len(programmeDegree_type) - len(degree_type)
                programme = programmeDegree_type[:cut].strip()
            print("item['degree_name']: ", item['degree_name'])
            item['programme_en'] = programme
            print("item['programme_en']: ", item['programme_en'])

            # Department
            department = response.xpath(
                "//div[@class='titles']//h3//text()").extract()
            clear_space(department)
            item['department'] = ''.join(department).strip()
            print("item['department']: ", item['department'])

            ucascode = response.xpath("//span[@id='adCode']//text()").extract()
            clear_space(ucascode)
            if ucascode:
                item['ucascode'] = ucascode[0].strip()
            print("item['ucascode'] = ", item['ucascode'])

            # Course duration
            durationContent = response.xpath(
                "//h3[contains(text(),'Course details')]/following-sibling::text()"
            ).extract()
            clear_space(durationContent)

            duration_list = getIntDuration(''.join(durationContent))
            # Only a two-element result is used: first -> duration,
            # last -> duration_per.
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]

            # Programme overview
            overview = response.xpath("//div[@class='descHold']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))

            alevel = response.xpath(
                "//html//div[@id='courseSummary']//tr/td[contains(text(), 'A Levels')]/following-sibling::td//text()"
            ).extract()
            clear_space(alevel)
            if alevel:
                item['alevel'] = alevel[0].strip()

            ib = response.xpath(
                "//html//div[@id='courseSummary']//tr/td[contains(text(), 'International Baccalaureate')]/following-sibling::td//text()"
            ).extract()
            item['ib'] = ''.join(ib).strip()

            ielts_desc = response.xpath(
                "//*[contains(text(),'IELTS')]//text()").extract()
            clear_space(ielts_desc)
            item['ielts_desc'] = ''.join(ielts_desc)

            ieltDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltDict.get('IELTS')
            item["ielts_l"] = ieltDict.get('IELTS_L')
            item["ielts_s"] = ieltDict.get('IELTS_S')
            item["ielts_r"] = ieltDict.get('IELTS_R')
            item["ielts_w"] = ieltDict.get('IELTS_W')

            modules_en = response.xpath("//div[@id='modules']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules_en))

            assessment_en = response.xpath("//div[@id='ltam']").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))

            career_en = response.xpath("//div[@id='graduates']").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))

            # NOTE: start-date extraction is disabled; kept for reference.
            # https://www.sheffield.ac.uk/prospectus/courseDetails.do?id=N1202019
            # # start_date //a[@href='#tab00']
            # start_date = response.xpath(
            #     "//table[@class='cms-tabs']/tbody/tr[last()]/th[1]//text()").extract()
            # clear_space(start_date)
            # start_date_str = ''.join(start_date).replace('start', '').replace('entry', '').strip()
            # start_date_1 = getStartDate(start_date_str)
            # item['start_date'] = start_date_1

            # //div[@id='tab00']
            # modules / assessment method

            # Application guidance is a fixed HTML snippet, not scraped from
            # the course page.
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<h1>How to apply: applying essentials</h1>
    <p><img class="imgRight" alt="Undergraduates in a tutorial"   src="/polopoly_fs/1.550384!/image/undergraduates320.jpg" />This page provides key information about applying to study on an undergraduate course at Sheffield, and contains links to all of our procedures and Admissions policies. Please take the time to read this information before completing your application.</p>
    <h3>Before you apply</h3>
    <p>We normally expect applicants to offer three full A Levels or an accepted equivalent qualification. You can check the University's general entry requirements, including which UK and International qualifications we accept and our English language and mathematics requirements, on our Admissions requirements webpage:</p>
    <p><a  href="/undergraduate/apply/requirements">Admissions requirements</a></p>
    <p>You can find details of the entry requirements for particular courses in our online prospectus. Please note that these represent our typical offer conditions only – we may make different offers in some cases.</p>
    <p><a href="http://www.sheffield.ac.uk/prospectus">Online prospectus</a></p>
    <p>A full list of our formal policies relating to Admissions is available on our Policies webpages. This includes our Student Admissions Policy as well as policies on A Level subject combinations, resits, and qualifications taken early.</p>
    <p><a  href="/undergraduate/policies">Our policies</a></p>
    <h3>Applying</h3>
    <p>You can apply for an undergraduate course at Sheffield via UCAS (the Universities and Colleges Admissions Service):</p>
    <p><a href="http://www.ucas.com/apply">UCAS website – Apply</a></p>
    <p>Applications for places on courses starting the following September (except Medicine and Dentistry) should be submitted to UCAS between:</p>
    <ul>
        <li>1 September and 15 January to be guaranteed equal consideration with other applicants</li>
        <li>16 January and 30 June for further consideration, although we may not be able to consider your application if all the places on the course you have applied for have been filled</li>
    </ul>
    <p>Applications for places on Medicine and Dentistry courses must be submitted between 1 September and 15 October.</p>
    <p>You can find more information about how and when to apply on our Applying webpage. This also contains information about deferred entry, direct entry to year/level 2 and our foundation year courses.</p>
    <p><a  href="/undergraduate/apply/applying">Applying</a></p>
    <p>Our Education For All webpage provides information on the support we provide for Care Leavers, estranged students, carers, mature students and students with a disability or learning difficulty. You can also find information about our outreach activities, our use of contextual data and our Disrupted Studies scheme.</p>
    <p><a  href="/undergraduate/apply/wp">Education for all: Widening Participation and Disrupted Studies</a></p>
    <h3>After you apply</h3>
    <p>You can find out what happens after you have submitted your application on our <a  href="/undergraduate/apply/after">After You Apply</a> webpages. If we make you a Conditional offer and you accept us as either your Firm or Insurance choice, we will also send you an email containing information about what happens when you get your exam results.</p>
    <p><a  href="/undergraduate/apply/after">After you apply</a></p>
    <p>If at any time you find that your studies are&#160;affected by personal, social or domestic issues, please let us know by using our Disrupted Studies form:</p>
    <p><a  href="/undergraduate/apply/applying/disrupted">Disrupted Studies</a></p>
    <h3>Further information</h3>
    <p>If you have any further questions about the University and applying to study with us, please <a href="http://ask.sheffield.ac.uk">Ask Sheffield</a>.</p>
    <p>If you still need help, our Applicant Information Desk (AiD) provides a first point of contact for people who have applied to the University. AiD can help with any questions you have about the process of applying to us and the current status of your application.</p>
    <p><a  href="/aid">Applicant Information Desk</a></p>
    <p>We wish you the best of luck with your application.</p>"""
                ]))
            item['require_chinese_en'] = ''

            # Fee lookup keyed by the course code scraped above. Some codes
            # are listed more than once; in a dict literal the last entry for
            # a key wins.
            tuition_feeDict = {
                "C180": "21450",
                "C200": "21450",
                "C300": "21450",
                "C100": "21450",
                "C109": "21450",
                "C189": "21450",
                "C209": "21450",
                "C309": "21450",
                "C1C9": "21450",
                "C1CX": "21450",
                "C1R9": "21450",
                "C101": "21450",
                "F400": "18900",
                "FV41": "18900",
                "VV46": "18900",
                "VR47": "18900",
                "VR41": "18900",
                "VR42": "18900",
                "F410": "18900",
                "VR44": "18900",
                "QV84": "18900",
                "F401": "18900",
                "KK13": "21450",
                "K100": "21450",
                "ARCU123": "21450",
                "ARCU124": "21450",
                "ARCU13": "21450",
                "ARCU129": "21450",
                "Y001": "16800",
                "H130": "21450",
                "G500": "21450",
                "H690": "21450",
                "H660": "21450",
                "H310": "21450",
                "H360": "21450",
                "H361": "21450",
                "H1NF": "21450",
                "H1NF": "21450",
                "HN62": "21450",
                "OG31": "21450",
                "8L16": "21450",
                "57": "21450",
                "2G36": "21450",
                "8M74": "21450",
                "2A47": "21450",
                "H653": "21450",
                "H659": "21450",
                "B900": "21450",
                "B909": "21450",
                "H810": "21450",
                "H800": "21450",
                "H840": "21450",
                "H8T9": "21450",
                "H8F1": "21450",
                "H8J7": "21450",
                "H801": "21450",
                "F100": "21450",
                "F105": "21450",
                "F107": "21450",
                "F106": "21450",
                "F335": "21450",
                "F109": "21450",
                "F108": "21450",
                "C720": "21450",
                "H210": "21450",
                "HK21": "21450",
                "H2T9": "21450",
                "H200": "21450",
                "H202": "21450",
                "HK2D": "21450",
                "H2N2": "21450",
                "2H26": "21450",
                "8T63": "21450",
                "8L55": "21450",
                "2G91": "21450",
                "H201": "21450",
                "A200": "21450",
                "G600": "21450",
                "G650": "21450",
                "G402": "21450",
                "G400": "21450",
                "GG41": "21450",
                "GG74": "21450",
                "G4G1": "21450",
                "G700": "21450",
                "G490": "21450",
                "G495": "21450",
                "G401": "21450",
                "G651": "21450",
                "GN52": "21450",
                "GN53": "21450",
                "X301": "16800",
                "F401": "18900",
                "Q305": "16800",
                "Q310": "16800",
                "F901": "18900",
                "L701": "18900",
                "V101": "16800",
                "Q307": "16800",
                "V501": "16800",
                "L301": "16800",
                "L401": "16800",
                "K401": "16800",
                "K441": "16800",
                "L790": "16800",
                "QC19": "21450",
                "B990": "21450",
                "C801": "21450",
                "V642": "16800",
                "L432": "16800",
                "T210": "16800",
                "T300": "18900",
                "TN42": "18900",
                "T110": "16800",
                "T415": "16800",
                "TN12": "18900",
                "T1T2": "16800",
                "T4T2": "16800",
                "T1R2": "16800",
                "T2R2": "16800",
                "T1R4": "16800",
                "T2R4": "16800",
                "T1R7": "16800",
                "T2R7": "16800",
                "T1R1": "16800",
                "TV11": "16800",
                "TV21": "16800",
                "L100": "16800",
                "LV15": "16800",
                "LL12": "16800",
                "L101": "16800",
                "LG11": "16800",
                "L1N3": "16800",
                "LIN3": "16800",
                "X300": "16800",
                "X301": "16800",
                "H620": "21450",
                "H621": "21450",
                "H610": "21450",
                "H613": "21450",
                "H614": "21450",
                "H651": "21450",
                "H647": "21450",
                "H645": "21450",
                "H6T9": "21450",
                "H623": "21450",
                "H615": "21450",
                "H616": "21450",
                "H652": "21450",
                "H649": "21450",
                "H622": "21450",
                "H611": "21450",
                "H648": "21450",
                "H629": "21450",
                "H628": "21450",
                "H602": "21450",
                "H603": "21450",
                "H100": "21450",
                "H104": "21450",
                "H675": "21450",
                "H673": "21450",
                "H67I": "21450",
                "H67H": "21450",
                "Q3Q1": "16800",
                "QL33": "16800",
                "QR14": "16800",
                "QR17": "16800",
                "QR32": "16800",
                "QR37": "16800",
                "QV15": "16800",
                "QT12": "16800",
                "Q304": "16800",
                "Q310": "16800",
                "Q305": "16800",
                "Q306": "16800",
                "QR31": "16800",
                "QV31": "16800",
                "QW33": "16800",
                "QV35": "16800",
                "QR34": "16800",
                "QW34": "16800",
                "Q307": "16800",
                "F309": "21450",
                "G109": "21450",
                "QR11": "16800",
                "R120": "16800",
                "RL11": "16800",
                "RL12": "16800",
                "RN12": "16800",
                "RR12": "16800",
                "RR14": "16800",
                "RR17": "16800",
                "RV11": "16800",
                "RV15": "16800",
                "RW13": "16800",
                "R1R9": "16800",
                "R1T2": "16800",
                "R1R7": "16800",
                "R1RR": "16800",
                "R1RO": "16800",
                "L700": "18900",
                "F800": "18900",
                "F902": "18900",
                "F900": "18900",
                "F901": "18900",
                "QR12": "16800",
                "R220": "16800",
                "RL21": "16800",
                "RL22": "16800",
                "RN22": "16800",
                "RR24": "16800",
                "RR27": "16800",
                "RV21": "16800",
                "RV25": "16800",
                "RW23": "18900",
                "R2R9": "16800",
                "R2T2": "16800",
                "R2R7": "16800",
                "R2RR": "16800",
                "R2R3": "16800",
                "R410": "16800",
                "RL42": "16800",
                "RN42": "16800",
                "RL41": "16800",
                "R4T2": "16800",
                "R4R7": "16800",
                "R4RR": "16800",
                "V100": "16800",
                "RV71": "16800",
                "RV41": "16800",
                "VV15": "16800",
                "VL12": "16800",
                "VL13": "16800",
                "V1R9": "16800",
                "V101": "16800",
                "B620": "21450",
                "QC18": "21450",
                "QC19": "21450",
                "P110": "18900",
                "P500": "18900",
                "K3K4": "18900",
                "KC39": "18900",
                "M100": "16800",
                "ML94": "16800",
                "M1R4": "16800",
                "M1R2": "16800",
                "M1R1": "16800",
                "M930": "16800",
                "M120": "16800",
                "N200": "16800",
                "N420": "16800",
                "NG21": "16800",
                "NG41": "16800",
                "NL21": "16800",
                "NL41": "16800",
                "NP21": "16800",
                "NP41": "16800",
                "NT22": "16800",
                "N120": "16800",
                "JH51": "21450",
                "J500": "21450",
                "J5R9": "21450",
                "FH21": "21450",
                "J200": "21450",
                "FHF1": "21450",
                "H403": "21450",
                "H401": "21450",
                "JH5P": "21450",
                "JH56": "21450",
                "J501": "21450",
                "G100": "18900",
                "G103": "18900",
                "GN13": "18900",
                "G102": "18900",
                "G1R4": "18900",
                "G1R1": "18900",
                "G1R2": "18900",
                "G106": "18900",
                "VG51": "18900",
                "A100": "21450",
                "T900": "16800",
                "C400": "21450",
                "C500": "21450",
                "C440": "21450",
                "C700": "21450",
                "C741": "21450",
                "CC45": "21450",
                "CC74": "21450",
                "CC75": "21450",
                "C709": "21450",
                "CC7C": "21450",
                "CC79": "21450",
                "C409": "21450",
                "CC4C": "21450",
                "C749": "21450",
                "C509": "21450",
                "C449": "21450",
                "C790": "21450",
                "C791": "21450",
                "CC47": "21450",
                "CC4R": "21450",
                "C431": "21450",
                "C433": "21450",
                "C521": "21450",
                "C523": "21450",
                "W302": "18900",
                "RW43": "18900",
                "VW53": "18900",
                "WT34": "18900",
                "WT31": "18900",
                "WTH4": "18900",
                "B991": "21450",
                "B740": "21450",
                "B990": "21450",
                "B520": "21450",
                "QV36": "16800",
                "RV26": "16800",
                "QV16": "16800",
                "VW63": "16800",
                "VV56": "16800",
                "VR61": "16800",
                "BIBU08": "16800",
                "V641": "16800",
                "V500": "16800",
                "RV45": "16800",
                "V501": "16800",
                "F300": "21450",
                "F301": "21450",
                "F344": "21450",
                "F350": "21450",
                "FF35": "21450",
                "F371": "21450",
                "F3F5": "21450",
                "FV35": "21450",
                "F321": "21450",
                "F3G4": "21450",
                "F3GK": "21450",
                "F305": "21450",
                "F304": "21450",
                "F3F5": "21450",
                "L210": "16800",
                "LL23": "16800",
                "LV25": "16800",
                "L201": "16800",
                "LL24": "16800",
                "C800": "21450",
                "C802": "21450",
                "C801": "21450",
                "R710": "16800",
                "RL71": "16800",
                "RL72": "16800",
                "RN72": "16800",
                "RR47": "16800",
                "R7R7": "16800",
                "R7RR": "16800",
                "RV75": "16800",
                "RW73": "18900",
                "R7T2": "16800",
                "L300": "16800",
                "LL43": "16800",
                "NL2K": "16800",
                "NL24": "16800",
                "L391": "16800",
                "L301": "16800",
                "L401": "16800",
                "L722": "16800",
                "TRPU105": "16800",
                "LK74": "18900",
                "K401": "16800",
                "K441": "16800",
                "L790": "16800",
            }
            tuition_fee = tuition_feeDict.get(item['ucascode'])
            print("tuition_fee: ", tuition_fee)
            # Codes missing from the table leave the fee fields untouched.
            if tuition_fee is not None:
                item['tuition_fee'] = int(tuition_fee)
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee']: ", item['tuition_fee'])
            print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])
            yield item
        except Exception as e:
            # Best-effort boundary: append the failure to the error file for
            # this university/degree type and carry on with the crawl.
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

예제 #29

파일 보기

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.port.ac.uk/"
        item['university'] = "University of Portsmouth"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item[
            'location'] = 'University House, Winston Churchill Avenue, Portsmouth PO1 2UP'
        print("===========================")
        print(response.url)
        try:
            # //div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1
            programme = response.xpath("//h1[@class='Title']/text()").extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            # //div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1
            degree_type = response.xpath(
                "//h1[@class='Title']/small//text()").extract()
            item['degree_name'] = ''.join(degree_type).replace("(Hons)",
                                                               "").strip()
            print("item['degree_name']: ", item['degree_name'])

            ucascode = response.xpath(
                "//nobr[contains(text(),'UCAS Code')]/../following-sibling::*//text()"
            ).extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).replace("UCAS Code:",
                                                         "").strip()
            # print("item['ucascode'] = ", item['ucascode'])

            item['start_date'] = response.meta.get(response.url)
            # print("item['start_date'] = ", item['start_date'])

            # //div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1
            # department = response.xpath(
            #     "//dt[contains(text(), 'Department')]/following-sibling::dd[1]//text()").extract()
            # item['department'] = ''.join(department)
            # print("item['department']: ", item['department'])

            # //div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1
            duration = response.xpath(
                "//div[contains(text(),'Duration')]/following-sibling::*//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_str = ''.join(duration)
            item['other'] = duration_str

            duration_list = getIntDuration(duration_str)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            location = response.xpath(
                "//div[contains(text(),'Location')]/following-sibling::*//text()"
            ).extract()
            item['location'] = ''.join(location)
            # print("item['location']: ", item['location'])

            # //strong[contains(text(),'International students')]/../following-sibling::p[1]
            tuition_fee = response.xpath(
                "//h3[contains(text(),'Tuition fees')]/..//*[contains(text(),'International students')]//text()"
            ).extract()
            clear_space(tuition_fee)
            # print("tuition_fee: ", tuition_fee)
            tuition_fee_re = re.findall(r"£\d+,\d+", ''.join(tuition_fee))
            # print("tuition_fee_re: ", tuition_fee_re)
            if len(tuition_fee_re) > 0:
                item['tuition_fee'] = int(tuition_fee_re[0].replace(
                    ",", "").replace("£", "").strip())
            # print("item['tuition_fee']: ", item['tuition_fee'])

            overview = response.xpath(
                """//h2[@id='overview']/..|//h3[contains(text(),"What you'll experience")]/..|
            //h3[contains(text(),'What you’ll experience')]/..|//*[contains(text(),"What you'll experience")]/../.."""
            ).extract()
            item['overview_en'] = remove_class(
                clear_lianxu_space(overview)).replace(
                    "<h3>What you'll experience</h3>", "").strip()
            print("item['overview_en']: ", item['overview_en'])

            career = response.xpath(
                "//h3[contains(text(),'Careers and opportunities')]/.."
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            rntry_requirements_content = response.xpath(
                "//div[contains(text(),'Entry Requirements')]/../../..//div[contains(text(),'2019 start')]/../../../..//text()"
            ).extract()
            rntry_requirements_str = clear_lianxu_space(
                rntry_requirements_content)

            ieltsList = response.xpath(
                "//*[contains(text(),'English language proficiency')]/text()|"
                "//*[contains(text(),'English Language proficiency')]/text()"
            ).extract()
            # print(ieltsList)
            if len(ieltsList) == 0:
                ieltsList = re.findall(r".{1,45}IELTS.{1,85}",
                                       rntry_requirements_str)
            clear_space(ieltsList)
            if len(ieltsList) > 0:
                item['ielts_desc'] = ''.join(ieltsList[1:]).strip()
                if item['ielts_desc'] == "":
                    item['ielts_desc'] = ''.join(ieltsList).strip()
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            alevel = response.xpath(
                "//*[contains(text(),'A levels')]/text()").extract()
            # print(ieltsList)
            if len(alevel) == 0:
                alevel = re.findall(r".{1,45}A\slevels.{1,85}",
                                    rntry_requirements_str)
            clear_space(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[1:]).strip()
                if item['alevel'] == "":
                    item['alevel'] = ''.join(alevel).strip()
            print("item['alevel']: ", item['alevel'])

            modules = response.xpath(
                "//h2[@id='What youll study']/..|//h2[@id='What youll study']/../following-sibling::div[1]|//div[contains(text(),'Units currently being studied')]/../../.."
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            teaching_assessment = response.xpath(
                "//h2[@id='Teaching']/..|//h2[@id='Teaching']/../following-sibling::*[1]|//h2[@id='How youre assessed']/..|//h2[@id='How youre assessed']/../following-sibling::*[1]"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(teaching_assessment))
            # print("item['assessment_en']: ", item['assessment_en'])

            apply_proces_en = response.xpath(
                "//h2[@id='Apply']/..|//h2[@id='Apply']/../following-sibling::*"
            ).extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(apply_proces_en))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            item['apply_documents_en'] = remove_class(
                clear_lianxu_space([
                    """<h2 style="color: #384047; margin: 33px 0px 0.7em; padding: 0px;">What you'll need to send us</h2>
<p style="color: #384047; margin: 0px 0px 25px; border: none;">When you apply to join us, we'll need to see the following documents:</p>
<ul style="color: #384047; margin: 0px 0px 25px; padding-left: 35px; border: none; list-style-image: initial;">
    <li style="margin-top: 0px;">A completed application form</li>
    <li>A Personal Statement or Statement of Purpose</li>
    <li>Officially certified and translated copies of your high school or college qualification and grades (for undergraduate courses)</li>
    <li>Officially certified and translated copies of your degree qualification and grades (for Postgraduate courses)</li>
    <li>Proof of your English language level (such as an IELTS Certificate)</li>
    <li style="margin-bottom: 0px;">One academic reference on official headed paper for undergraduate courses or two references for postgraduate courses</li>
</ul>"""
                ]))
            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<h3>Undergraduate courses</h3>
<p>If you've completed the Chinese Senior High School Diploma plus one year at a recognised university in China, we'll consider you for admission onto an undergraduate course such as a Bachelor's degree. You must have studied relevant subjects and achieved strong grades.</p>
<p>If you don't have a Chinese Senior High School Diploma, you can apply with:</p>
<h4>A levels</h4>
<ul>
    <li>Most courses will require 120 UCAS points. Your A level grades should equal or exceed the total points required. You can use the&nbsp;<a rel="noopener noreferrer" rel="noopener noreferrer" href="https://www.ucas.com/ucas/tariff-calculator"></a><a rel="noopener noreferrer" href="https://www.ucas.com/ucas/tariff-calculator" target="_blank">UCAS Tariff Calculator</a>&nbsp;to work out your total points. Please check your specific course page to find the exact number of points.</li>
    <li>Some courses will require you to have studied specific subjects at A level. For example, to study a science course you will usually need to have achieved passing grades in scientific subjects at A level.</li>
    <li>A level points: A* = 56 A = 48 B = 40 C = 32 D = 24.</li>
</ul>
<h4>International Baccalaureate</h4>
<ul>
    <li>Most courses will require between 24 and 31 points in the International Baccalaureate (IB), depending on the degree you apply for.</li>
</ul>
<p>You may also be considered for advanced entry onto a relevant undergraduate degree programme if you have a College Graduation Diploma (Dazhuan) from a recognised university or college on completion of two to three years of study, or a BTEC HND or SQA HND Higher National Diploma in a relevant subject.</p>
<p>You may be able to join an undergraduate course with other qualifications. We do consider qualifications from a range of sources. Contact us to find out more.</p>"""
                ]))
            item[
                "ib"] = "Most courses will require between 24 and 31 points in the International Baccalaureate (IB), depending on the degree you apply for."
            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

예제 #30

파일 보기

파일: NottinghamTrentUniversity_U.py 프로젝트: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.ntu.ac.uk/"
        item['university'] = "Nottingham Trent University"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        print("===============================")
        # print(response.url)
        print(item['url'])
        try:
            # 专业、学位类型
            programme = response.xpath(
                "//h1[@class='course-heading page-heading']//text()").extract(
                )
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en'] = ", item['programme_en'])

            degree_type = response.xpath(
                "//h2[@class='js_qualification']/strong//text()").extract()
            item['degree_name'] = ''.join(degree_type)
            print("item['degree_name'] = ", item['degree_name'])

            # //div[@id='tabs-key-info']/div[@class='tab tab-1 active-tab']/p[3]/span
            location = response.xpath(
                "//span[@class='location save']//text()").extract()
            item['location'] = ''.join(location).strip()
            # print("item['location'] = ", item['location'])

            start_date = response.xpath(
                "//strong[contains(text(),'Starting:')]/following-sibling::span//text()"
            ).extract()
            # print(start_date)
            item['start_date'] = ''.join(start_date)
            # print("item['start_date'] = ", item['start_date'])
            item['start_date'] = getStartDate(item['start_date'])
            # print("item['start_date']1 = ", item['start_date'])

            # //html//div[@class='content']/div[1]/div  专业描述
            overview = response.xpath(
                "//div[@id='what-you-will-study']/preceding-sibling::*"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en'] = ", item['overview_en'])

            # modules   课程设置
            modules = response.xpath(
                "//div[@id='what-you-will-study']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en'] = ", item['modules_en'])

            # teaching_assessment   评估方式
            teaching_assessment = response.xpath(
                "//div[@id='how-youre-taught']").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(teaching_assessment))
            # print("item['assessment_en'] = ", item['assessment_en'])

            # career   评估方式
            career = response.xpath(
                "//div[@id='careers-and-employability']").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en'] = ", item['career_en'])

            # //div[@id='entry-requirements-1']
            entry_requirements = response.xpath(
                "//div[@id='entry-requirements-0']").extract()
            item['apply_desc_en'] = remove_class(
                clear_lianxu_space(entry_requirements))
            print("item['apply_desc_en'] = ", item['apply_desc_en'])

            # //div[@id='entry-requirements-1']
            how_to_apply = response.xpath(
                "//div[@id='how-to-apply-1']").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(how_to_apply))
            # print("item['apply_proces_en'] = ", item['apply_proces_en'])

            # //div[@id='how-to-apply-1']//h3[contains(text(),'Interview')]/following-sibling::p[position()<3]
            interview_desc_en = response.xpath(
                "//div[@id='how-to-apply-1']//h3[contains(text(),'Interview')]/following-sibling::p[position()<3]"
            ).extract()
            item['interview_desc_en'] = remove_class(
                clear_lianxu_space(interview_desc_en))
            # print("item['interview_desc_en'] = ", item['interview_desc_en'])

            # deadline
            deadline = response.xpath(
                "//div[@id='how-to-apply-1']//p//strong[contains(text(),'Application closing date')]/../following-sibling::p[1]//text()|//div[@id='how-to-apply-1']//h3[contains(text(),'Application deadline')]/following-sibling::p[1]//text()"
            ).extract()
            clear_space(deadline)
            # print("deadline: ", deadline)
            deadline_str = ''.join(deadline)
            item['deadline'] = getStartDate(deadline_str)
            # print("item['deadline'] = ", item['deadline'])

            alevel = response.xpath(
                "//div[@id='entry-requirements-0']//li[contains(text(),'A-levels')]//text()"
            ).extract()
            clear_space(alevel)
            # print(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[0]).strip()
            # print("item['alevel'] = ", item['alevel'])

            if len(item['alevel']) > 160:
                item['alevel'] = ''.join(item['alevel'][:161])
            # print("item['alevel']1 = ", item['alevel'])

            # https://www.ntu.ac.uk/international/scholarships-and-fees/tuition-fees
            tuition_fee = response.xpath(
                "//html//div[@id='fees-and-funding-1']//text()").extract()
            clear_space(tuition_fee)
            # print(tuition_fee)
            tuition_fee = getTuition_fee(''.join(tuition_fee))
            item['tuition_fee'] = tuition_fee
            if item['tuition_fee'] == 0:
                item['tuition_fee'] = None
            else:
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee'] = ", item['tuition_fee'])
            # print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])

            departmentDict = {
                "Economics with Business":
                "Nottingham Business School",
                "Animal Health and Welfare":
                "School of Animal, Rural and Environmental Sciences",
                "Applied Anthrozoology":
                "School of Animal, Rural and Environmental Sciences",
                "Biodiversity Conservation":
                "School of Animal, Rural and Environmental Sciences",
                "Endangered Species Recovery and Conservation":
                "School of Animal, Rural and Environmental Sciences",
                "Equine Health and Welfare":
                "School of Animal, Rural and Environmental Sciences",
                "Equine Performance":
                "School of Animal, Rural and Environmental Sciences",
                "Equine Performance, Health and Welfare":
                "School of Animal, Rural and Environmental Sciences",
                "Global Food Security and Development":
                "School of Animal, Rural and Environmental Sciences",
                "Architecture":
                "School of Architecture, Design and the Built Environment",
                "Architecture (ARB/RIBA Part 2)rch":
                "School of Architecture, Design and the Built Environment",
                "Building Surveying":
                "School of Architecture, Design and the Built Environment",
                "Civil Engineering":
                "School of Architecture, Design and the Built Environment",
                "Construction Management":
                "School of Architecture, Design and the Built Environment",
                "Construction Project Management (Online)":
                "School of Architecture, Design and the Built Environment",
                "Interior Architecture and Design":
                "School of Architecture, Design and the Built Environment",
                "International Real Estate Investment and Finance":
                "School of Architecture, Design and the Built Environment",
                "Planning and Development":
                "School of Architecture, Design and the Built Environment",
                "Project Management (Construction)":
                "School of Architecture, Design and the Built Environment",
                "Quantity Surveying":
                "School of Architecture, Design and the Built Environment",
                "Real Estate":
                "School of Architecture, Design and the Built Environment",
                "Structural Engineering with Management":
                "School of Architecture, Design and the Built Environment",
                "Structural Engineering with Materials":
                "School of Architecture, Design and the Built Environment",
                "Animation":
                "School of Art & Design",
                "Commercial Photography":
                "School of Art & Design",
                "Culture, Style and Fashion":
                "School of Art & Design",
                "Branding and Identity":
                "School of Art & Design",
                "Fashion Communications":
                "School of Art & Design",
                "Fashion Design":
                "School of Art & Design",
                "Fashion Knitwear Design":
                "School of Art & Design",
                "Fashion Marketing":
                "School of Art & Design",
                "Fine Art":
                "School of Art & Design",
                "Graphic Design":
                "School of Art & Design",
                "Illustration":
                "School of Art & Design",
                "International Fashion Management":
                "School of Art & Design",
                "Luxury Fashion Brand Management":
                "School of Art & Design",
                "Photography":
                "School of Art & Design",
                "Textile Design Innovation":
                "School of Art & Design",
                "Culture, Style and Fashion":
                "School of Art & Design",
                "Fashion Communications":
                "School of Art & Design",
                "Fashion Marketing":
                "School of Art & Design",
                "Fashion and Textile Design":
                "School of Art & Design",
                "Fine Art":
                "School of Art & Design",
                "Graphic Design Theory and Practice":
                "School of Art & Design",
                "International Fashion Management":
                "School of Art & Design",
                "Luxury Fashion Brand Management":
                "School of Art & Design",
                "Photography":
                "School of Art & Design",
                "PG Cert Creative Pattern Cutting (15 weeks)":
                "School of Art & Design",
                "Art and Design Professional Doctorate":
                "School of Art & Design",
                "Art and Design":
                "School of Art & Design",
                "Broadcast Journalism":
                "School of Arts and Humanities",
                "Digital and Newspaper Journalism":
                "School of Arts and Humanities",
                "Magazine Journalism":
                "School of Arts and Humanities",
                "Documentary Journalism":
                "School of Arts and Humanities",
                "Media and Globalisation":
                "School of Arts and Humanities",
                "Creative Writing":
                "School of Arts and Humanities",
                "English Literary Research":
                "School of Arts and Humanities",
                "Linguistics":
                "School of Arts and Humanities",
                "Philosophy":
                "School of Arts and Humanities",
                "History":
                "School of Arts and Humanities",
                "PGCert Museum and Heritage Development":
                "School of Arts and Humanities",
                "Holocaust and Genocide":
                "School of Arts and Humanities",
                "International Development":
                "School of Arts and Humanities",
                "English Language Teaching":
                "School of Arts and Humanities",
                "TESOL (Teaching English to Speakers of Other Languages)":
                "School of Arts and Humanities",
                "Management":
                "Nottingham Business School",
                "Management and Finance":
                "Nottingham Business School",
                "Management and Global Supply Chain Management":
                "Nottingham Business School",
                "Management and Innovation and Enterprise":
                "Nottingham Business School",
                "Management and International Business":
                "Nottingham Business School",
                "Management and Marketing":
                "Nottingham Business School",
                "Marketing":
                "Nottingham Business School",
                "Branding and Advertising":
                "Nottingham Business School",
                "Digital Marketing":
                "Nottingham Business School",
                "Management and Marketing":
                "Nottingham Business School",
                "fees, funding and scholarships":
                "Nottingham Business School",
                "Return to all courses":
                "Nottingham Business School",
                "Human resource Management":
                "Nottingham Business School",
                "Economics":
                "Nottingham Business School",
                "Economics and Investment Banking":
                "Nottingham Business School",
                "International Business":
                "Nottingham Business School",
                "International Business (Dual Award) ":
                "Nottingham Business School",
                "Management and International Business":
                "Nottingham Business School",
                "Management and International Publishing":
                "Nottingham Business School",
                "Management and Global Supply Chain Management":
                "Nottingham Business School",
                "Finance":
                "Nottingham Business School",
                "Finance and Accounting":
                "Nottingham Business School",
                "Finance and Investment Banking":
                "Nottingham Business School",
                "Management and Finance":
                "Nottingham Business School",
                "Economics and Investment Banking":
                "Nottingham Business School",
                "Entrepreneurship":
                "Nottingham Business School",
                "Project Management":
                "Nottingham Business School",
                "Management":
                "Nottingham Business School",
                "Management and International Business":
                "Nottingham Business School",
                "Marketing":
                "Nottingham Business School",
                "Branding and Advertising":
                "Nottingham Business School",
                "Finance":
                "Nottingham Business School",
                "International Business":
                "Nottingham Business School",
                "Assessment Only Route to QTS (Primary) - Non-NTU Award":
                "Nottingham Institute of Education",
                "Assessment Only Route to QTS (Secondary) - Non-NTU Award":
                "Nottingham Institute of Education",
                "Early Years Initial Teacher Training":
                "Nottingham Institute of Education",
                "Early Years Initial Teacher Training (Assessment Only) - Non-NTU Award":
                "Nottingham Institute of Education",
                "Education":
                "Nottingham Institute of Education",
                "English Language Teaching":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training (with English and Literacy)":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training (with Mathematics and Numeracy)":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training (with Science, Engineering and Technology)":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training (with Special and Inclusive Practice)":
                "Nottingham Institute of Education",
                "Primary Education":
                "Nottingham Institute of Education",
                "Primary: School-Centred Initial Teacher Training (SCITT)":
                "Nottingham Institute of Education",
                "School Direct Training Programme (Primary salaried)":
                "Nottingham Institute of Education",
                "School Direct Training Programme (Primary)":
                "Nottingham Institute of Education",
                "School Direct Training Programme (Secondary salaried)":
                "Nottingham Institute of Education",
                "School Direct Training Programme (Secondary)":
                "Nottingham Institute of Education",
                "Secondary Biology":
                "Nottingham Institute of Education",
                "Secondary Business Education":
                "Nottingham Institute of Education",
                "Secondary Chemistry":
                "Nottingham Institute of Education",
                "Secondary Computer Science with ICT":
                "Nottingham Institute of Education",
                "Secondary Education (Design and Technology)":
                "Nottingham Institute of Education",
                "Secondary Education (Physics)":
                "Nottingham Institute of Education",
                "Secondary English":
                "Nottingham Institute of Education",
                "Secondary Mathematics":
                "Nottingham Institute of Education",
                "Secondary Music":
                "Nottingham Institute of Education",
                "Special Educational Needs Coordination - National Award":
                "Nottingham Institute of Education",
                "Teaching English to Speakers of Other Languages (TESOL)":
                "Nottingham Institute of Education",
                "Corporate and Insolvency Law":
                "Nottingham Law School",
                "Dual LLM in Corporate and Insolvency Law / European and Insolvency Law":
                "Nottingham Law School",
                "General Law":
                "Nottingham Law School",
                "Health Law and Ethics":
                "Nottingham Law School",
                "Human Rights and Justice":
                "Nottingham Law School",
                "Intellectual Property Law":
                "Nottingham Law School",
                "International Financial Law":
                "Nottingham Law School",
                "International Trade and Commercial Law":
                "Nottingham Law School",
                "Oil, Gas and Mining Law":
                "Nottingham Law School",
                "Sports Law":
                "Nottingham Law School",
                "Corporate and Insolvency Law":
                "Nottingham Law School",
                "International Trade and Commercial Law":
                "Nottingham Law School",
                "Legal Practice":
                "Nottingham Law School",
                "Oil, Gas and Mining Law":
                "Nottingham Law School",
                "Biomedical Science":
                "School of Science and Technology",
                "Biomedical Science (Flexible Learning)":
                "School of Science and Technology",
                "Neuropharmacology":
                "School of Science and Technology",
                "Pharmacology":
                "School of Science and Technology",
                "Molecular Microbiology":
                "School of Science and Technology",
                "Biotechnology":
                "School of Science and Technology",
                "Molecular Cell Biology":
                "School of Science and Technology",
                "Environmental Management":
                "School of Science and Technology",
                "Biotechnology":
                "School of Science and Technology",
                "Cancer Biology":
                "School of Science and Technology",
                "Cell Biology":
                "School of Science and Technology",
                "Molecular Biology":
                "School of Science and Technology",
                "Molecular Microbiology":
                "School of Science and Technology",
                "Neuropharmacology":
                "School of Science and Technology",
                "Pharmacology":
                "School of Science and Technology",
                "Environmental Management":
                "School of Science and Technology",
                "Biomedical Science (Flexible Learning)":
                "School of Science and Technology",
                "Environmental Management":
                "School of Science and Technology",
                "Chemistry / Chemistry (Professional Practice)":
                "School of Science and Technology",
                "Pharmaceutical and Medicinal Science":
                "School of Science and Technology",
                "Pharmaceutical Analysis":
                "School of Science and Technology",
                "Analytical Chemistry":
                "School of Science and Technology",
                "Chemistry":
                "School of Science and Technology",
                "Advanced Materials Engineering":
                "School of Science and Technology",
                "Forensic Science":
                "School of Science and Technology",
                "Computer Science":
                "School of Science and Technology",
                "Cloud and Enterprise Computing":
                "School of Science and Technology",
                "IT Security":
                "School of Science and Technology",
                "Engineering (Electronics)":
                "School of Science and Technology",
                "Engineering (Cybernetics and Communications)":
                "School of Science and Technology",
                "Engineering Management":
                "School of Science and Technology",
                "Computing Systems":
                "School of Science and Technology",
                "Data Analytics for Business":
                "School of Science and Technology",
                "Computer Science":
                "School of Science and Technology",
                "Electronic Systems":
                "School of Science and Technology",
                "Online MBA with Data Analytics":
                "School of Science and Technology",
                "Mathematical Sciences":
                "School of Science and Technology",
                "Data Analytics for Business":
                "School of Science and Technology",
                "Online MBA with Data Analytics":
                "School of Science and Technology",
                "Medical and Materials Imaging":
                "School of Science and Technology",
                "Medical Imaging":
                "School of Science and Technology",
                "Physics":
                "School of Science and Technology",
                "Physics":
                "School of Science and Technology",
                "Sport Science":
                "School of Science and Technology",
                "Exercise Physiology":
                "School of Science and Technology",
                "Performance Nutrition":
                "School of Science and Technology",
                "Performance Analysis":
                "School of Science and Technology",
                "Biomechanics":
                "School of Science and Technology",
                "Sport and Exercise Psychology":
                "School of Science and Technology",
                "Psychology":
                "School of Social Sciences",
                "Applied Child Psychology":
                "School of Social Sciences",
                "sychological Research Methods":
                "School of Social Sciences",
                "Forensic Mental Health":
                "School of Social Sciences",
                "Forensic Psychology (BPS accredited)":
                "School of Social Sciences",
                "Cyberpsychology":
                "School of Social Sciences",
                "Psychology in Clinical Practice":
                "School of Social Sciences",
                "Psychological Wellbeing and Mental Health":
                "School of Social Sciences",
                "Criminology":
                "School of Social Sciences",
                "Sociology":
                "School of Social Sciences",
                "Politics":
                "School of Social Sciences",
                "International Relations":
                "School of Social Sciences",
                "Online International Relations (Distance learning)":
                "School of Social Sciences",
                "Public Health":
                "School of Social Sciences",
                "Career Development":
                "School of Social Sciences",
                "Social Work (January 2019 entry)":
                "School of Social Sciences",
            }
            item['department'] = departmentDict.get(item['programme_en'])
            if item['department'] == None:
                item['department'] = departmentDict.get(item['programme_en'])
                if item['department'] == None:
                    item['department'] = departmentDict.get(
                        item['programme_en'])
                    if item['department'] == None:
                        item['department'] = departmentDict.get(
                            item['programme_en'].replace(" ", " "))
            print("item['department'] = ", item['department'])

            # School of Animal, Rural and Environmental Sciences
            # School of Architecture, Design and the Built Environment
            # School of Art &amp; Design
            # School of Arts and Humanities
            # Nottingham Business School
            # Nottingham Institute of Education
            # Nottingham Law School
            # School of Science and Technology
            # School of Social Sciences
            if item['department'] is None:
                if "/animal-rural-environmental-sciences" in item['url']:
                    item[
                        'department'] = "School of Animal, Rural and Environmental Sciences"
                elif "/architecture-design-built-environment" in item['url']:
                    item[
                        'department'] = "School of Architecture, Design and the Built Environment"
                elif "/art-design" in item['url']:
                    item['department'] = "School of Art & Design"
                elif "/arts-humanities" in item['url']:
                    item['department'] = "School of Arts and Humanities"
                elif "/business" in item['url']:
                    item['department'] = "Nottingham Business School"
                elif "/education" in item['url']:
                    item['department'] = "Nottingham Institute of Education"
                elif "/law" in item['url']:
                    item['department'] = "Nottingham Law School"
                elif "/science-technology" in item['url']:
                    item['department'] = "School of Science and Technology"
                elif "/social-sciences" in item['url']:
                    item['department'] = "School of Social Sciences"
            print("item['department']1 = ", item['department'])

            if item['degree_name'] == "BA (Hons)":
                item['ielts'] = 7.0
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.5
                item['ielts_w'] = 6.5
            elif item['department'] == "School of Art & Design" or item[
                    'department'] == "School of Animal, Rural and Environmental Sciences" or item[
                        'department'] == "School of Science and Technology":
                item['ielts'] = 6.0
                item['ielts_l'] = 5.5
                item['ielts_s'] = 5.5
                item['ielts_r'] = 5.5
                item['ielts_w'] = 5.5
            elif item['department'] == "Nottingham Business School":
                item['ielts'] = 6.5
                item['ielts_l'] = 5.5
                item['ielts_s'] = 5.5
                item['ielts_r'] = 5.5
                item['ielts_w'] = 5.5
            elif item['department'] == "School of Architecture, Design and the Built Environment" or item[
                    'department'] == "School of Arts and Humanities" or item[
                        'department'] == "Nottingham Institute of Education" or item[
                            'department'] == "Nottingham Law School" or item[
                                'department'] == "School of Social Sciences" or item[
                                    'department'] == "School of Art & Design":
                item['ielts'] = 6.5
                item['ielts_l'] = 5.5
                item['ielts_s'] = 5.5
                item['ielts_r'] = 5.5
                item['ielts_w'] = 5.5
            # print("item['IELTS'] = %s item['IELTS_L'] = %s item['IELTS_S'] = %s item['IELTS_R'] = %s item['IELTS_W'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            entry_requirements = response.xpath(
                "//div[@id='entry-requirements-1']//text()").extract()
            entry_requirementsStr = ''.join(entry_requirements)
            ielts = re.findall(r"IELTS.{1,200}", entry_requirementsStr)
            item['ielts_desc'] = ''.join(ielts)
            # print("item['ielts_desc']: ", item['ielts_desc'])

            if item['ielts'] == None:
                ieltsDict = get_ielts(''.join(ielts))
                item['ielts'] = ieltsDict.get("IELTS")
                item['ielts_l'] = ieltsDict.get("IELTS_L")
                item['ielts_s'] = ieltsDict.get("IELTS_S")
                item['ielts_r'] = ieltsDict.get("IELTS_R")
                item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['IELTS'] = %sitem['IELTS_L'] = %sitem['IELTS_S'] = %sitem['IELTS_R'] = %sitem['IELTS_W'] = %s==" % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            item["require_chinese_en"] = remove_class(
                clear_lianxu_space([
                    """<h2>Entry requirements</h2>
                        <table id="table76765" style="width: 100%;"><thead><tr><th id="table76765r1c1"> Your qualification</th><th id="table76765r1c2"> You could study</th></tr></thead><tbody><tr><td headers="table76765r1c1">         High School Year 2<br />Grades of 70% and above       </td><td headers="table76765r1c2">         Foundation courses at <a href="https://www.kaplanpathways.com/colleges/nottingham-trent-international-college/courses/">Nottingham Trent International College (NTIC) </a></td></tr><tr><td headers="table76765r1c1">         High School Year 3<br />Grades of 80% and above       </td><td headers="table76765r1c2">         International Year One courses at NTIC       </td></tr><tr><td headers="table76765r1c1">         Completion of first year of Chinese university degree       </td><td headers="table76765r1c2">         First year bachelors degrees       </td></tr><tr><td headers="table76765r1c1">         Three year diploma or higher national diploma       </td><td headers="table76765r1c2">         Considered for final year entry to selected bachelors degrees or for Pre-Masters courses at <a href="https://www.kaplanpathways.com/colleges/nottingham-trent-international-college/courses/">Nottingham Trent International College</a></td></tr><tr><td headers="table76765r1c1">         Bachelors degree (four years or six years in medicine / dentistry) from recognised institution in China. <br />Grades of 75% or above<br />Grades of 70% or above from 211 universities       </td><td headers="table76765r1c2">         Postgraduate (Masters) courses       </td></tr><tr><td headers="table76765r1c1">         Masters degree from a recognised institution in China.<br />Grades of 70% or above       </td><td headers="table76765r1c2">         Postgraduate research       </td></tr></tbody></table><p>If you have questions about your qualification and it is not listed here, please <a href="mailto:[email protected]">contact us</a> for advice.</p>
"""
                ]))

            ucascode = response.xpath(
                "//strong[contains(text(),'UCAS code(s):')]/following-sibling::*//text()"
            ).extract()
            clear_space(ucascode)
            if len(ucascode) > 0:
                item['ucascode'] = ''.join(ucascode[0]).replace(" / ",
                                                                "/").strip()
            print("item['ucascode']: ", item['ucascode'])

            duration = response.xpath(
                "//strong[contains(text(),'Course duration:')]/following-sibling::span//text()"
            ).extract()
            print("duration: ", duration)
            duration_str = ''.join(duration).replace("/ sandwich", "").strip()
            duration_list = getIntDuration(''.join(duration))
            # print("duration_list: ", duration_list)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            duration_per = item['duration_per']
            print("item['duration'] = ", item['duration'])
            print("item['duration_per'] = ", item['duration_per'])

            if "/" in item['ucascode']:
                ucascode_sp = item['ucascode'].split("/")
                if "/" in duration_str:
                    duration_sp = duration_str.split("/")
                elif " or" in duration_str:
                    duration_sp = duration_str.split(" or")
                elif "," in duration_str:
                    duration_sp = duration_str.split(" or")
                else:
                    duration_sp = [duration_str, duration_str]
                print("ucascode_sp: ", ucascode_sp)
                print("duration_sp: ", duration_sp)
                if len(ucascode_sp) == 2:
                    item['ucascode'] = ucascode_sp[0]
                    duration_list = getIntDuration(duration_sp[0])
                    if len(duration_list) == 2:
                        item['duration'] = duration_list[0]
                        item['duration_per'] = duration_list[-1]
                    if item['duration'] == None:
                        item['duration'] = int(duration_sp[0].strip())
                        item['duration_per'] = duration_per
                    print("item['ucascode']1: ", item['ucascode'])
                    print("item['duration']1 = ", item['duration'])
                    print("item['duration_per']1 = ", item['duration_per'])
                    yield item

                    item['ucascode'] = ucascode_sp[-1]
                    duration_list = getIntDuration(duration_sp[-1].strip())
                    if len(duration_list) == 2:
                        item['duration'] = duration_list[0]
                        item['duration_per'] = duration_list[-1]
                    if item['duration'] == None:
                        item['duration'] = int(duration_sp[-1].replace(
                            "year", "").replace("(s)", "").strip())
                        item['duration_per'] = 1
                    print("item['ucascode']2: ", item['ucascode'])
                    print("item['duration']2 = ", item['duration'])
                    print("item['duration_per']2 = ", item['duration_per'])
                    yield item
            else:
                yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")
            print("异常：", str(e))
            print("报错url：", response.url)