Example #1
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Southampton Solent University"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
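            # Default to None, then collect any text under the 'UCAS tariff points from
            # A-levels' heading or list items mentioning 'A-level' / 'A Level'.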
            item['alevel'] = None
            alevel = response.xpath(
                "//h4[contains(text(),'UCAS tariff points from A-levels')]//text()|"
                "//li[contains(text(), 'A-level')]//text()|//li[contains(text(), 'A Level')]//text()"
            ).extract()
            item['alevel'] = clear_lianxu_space(alevel)
            print("item['alevel']: ", item['alevel'])
            if item['alevel'] == "":
                print("****111")

            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "St George's, University of London"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
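            # Entry requirements sit on a separate 'Entry' page: follow its link, parse it
            # with a helper, and copy the A-level / IB values from the returned dict.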
            entry_url = response.xpath("//a[contains(text(),'Entry')]/@href").extract()
            # print("entry_url: ", entry_url)
            if len(entry_url) != 0:
                parse_entry_url = "https://www.sgul.ac.uk" + entry_url[0]
                # print("parse_entry_url: ", parse_entry_url)
                entry_dict = self.parse_rntry_requirements(parse_entry_url)
                # print(entry_dict)
                # item['rntry_requirements'] = entry_dict.get('rntry_requirements')

                item['alevel'] = entry_dict.get('alevel')
                item['ib'] = entry_dict.get('ib')
            print("item['alevel']: ", item['alevel'])
            print("item['ib']: ", item['ib'])

            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] + str(item['degree_type']) + ".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
Example #3
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Bolton"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
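            # This site lists a UCAS points figure rather than explicit A-level grades,
            # so that value is stored in item['alevel'].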
            # ucas_point = response.xpath("//li[@class='iconim points']//b[contains(text(),'UCAS points:')]/../span//text()").extract()
            # print("ucas_point: ", ucas_point)

            alevel = response.xpath(
                "//li[@class='iconim points']//b[contains(text(),'UCAS points:')]/../span//text()"
            ).extract()
            item['alevel'] = clear_lianxu_space(alevel)
            print("item['alevel']: ", item['alevel'])

            yield item

        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Queen's University Belfast"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
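            # A-level and IB offers sit next to bold labels; take all text under each
            # label's parent element.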

            alevel = response.xpath(
                "//b[contains(text(),'A level requirements')]/..//text()"
            ).extract()
            if len(alevel) > 0:
                item['alevel'] = clear_lianxu_space(alevel)
            print("item['alevel'] = ", item['alevel'])

            ib = response.xpath(
                "//b[contains(text(),'International Baccalaureate Diploma')]/..//text()"
            ).extract()
            item['ib'] = clear_lianxu_space(ib)
            print("item['ib'] = ", item['ib'])

            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")
            print("异常:", str(e))
            print("报错url:", response.url)
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Newcastle University"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
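            # Requirements are inside expandable panels: find the 'A Levels' /
            # 'International Baccalaureate' headings and take the surrounding panel text.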
            alevel = response.xpath(
                "//h3[@class='expandable-is-set'][contains(text(),'A Levels')]/../../div")
            print(alevel)
            if len(alevel) > 0:
                item['alevel'] = clear_lianxu_space(alevel[0].xpath("..//text()").extract())
            print("item['alevel'] = ", item['alevel'])

            ib = response.xpath(
                "//h3[@class='expandable-is-set'][contains(text(),'International Baccalaureate')]/../../div//text()").extract()
            item['ib'] = clear_lianxu_space(ib)
            print("item['ib'] = ", item['ib'])

            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university']+str(item['degree_type'])+".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "The University of Manchester"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])

        try:
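            # Each requirement has an id'd <h3> anchor; the value is the text of the
            # element immediately following it.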
            # alevel = response.xpath(
            #     "//h3[@id='a-level']/following-sibling::*[1]//text()").extract()
            alevel = response.xpath(
                "//h3[@id='a-level']/following-sibling::*[1]//text()").extract()
            item['alevel'] = clear_lianxu_space(alevel)
            print("item['alevel'] = ", item['alevel'])

            ib = response.xpath(
                "//h3[@id='international-baccalaureate']/following-sibling::*[1]//text()").extract()
            item['ib'] = clear_lianxu_space(ib)
            print("item['ib'] = ", item['ib'])

            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/"+item['university']+str(item['degree_type'])+".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================")
            print("异常:", str(e))
            print("报错url:", response.url)
Example #7
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "King's College London"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
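            # Requirements are in the 'further-information' table: row 1 usually holds the
            # A-level offer and row 8 the IB offer; if row 1 only says 'All candidates',
            # fall back to row 2 and to the bold 'International Baccalaureate' label.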

            alevel = response.xpath(
                # "//table//*[contains(text(),'A-level')]/../..//text()|//table//*[contains(text(),'A-Level')]/../..//text()"
                "//div[@class='further-information']//table//tbody/tr[1]//text()"
            ).extract()
            # if len(alevel) == 0:
            #     alevel = response.xpath(
            #         "//strong[contains(text(),'A-Level')]/../following-sibling::td[1]//text()").extract()
            #     if len(alevel) == 0:
            #         alevel = response.xpath(
            #             "//div[@class='requirements EntryReqs_UKALevel clearfix']//div[@class='required-grades']//text()//text()").extract()
            # clear_space(alevel)
            if len(alevel) > 0:
                item['alevel'] = clear_lianxu_space(alevel)
                # print("item['alevel']1 = ", item['alevel'])

            print("item['alevel'] = ", item['alevel'])

            ib = response.xpath(
                # "//*[contains(text(),'International Baccalaureate')]/../..//text()"
                "//div[@class='further-information']//table//tbody/tr[8]//text()"
            ).extract()
            # if len(ib) == 0:
            #     ib = response.xpath(
            #         "//*[contains(text(),'International Baccalaureate')]/../../..//text()").extract()
            if len(ib) > 0:
                item['ib'] = clear_lianxu_space(ib)
            print("item['ib'] = ", item['ib'])

            if "All candidates" in item['alevel']:
                alevel = response.xpath(
                    "//div[@class='further-information']//table//tbody/tr[2]//text()"
                ).extract()
                item['alevel'] = clear_lianxu_space(alevel)

                ib = response.xpath(
                    "//b[contains(text(),'International Baccalaureate')]/../../..//text()"
                ).extract()
                item['ib'] = clear_lianxu_space(ib)

            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")
            print("异常:", str(e))
            print("报错url:", response.url)
Example #8
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Surrey"
        item['url'] = response.meta['url']
        print("===============================")
        print(response.url)
        print(response.meta['url'])

        try:
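            # Fee: first row of the fees table, second-to-last cell; a parsed value of 0
            # means no usable figure, so it is reset to None.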
            tuition_fee = response.xpath(
                "//div[@id='fees']//tbody//tr[1]/td[last()-1]//text()"
            ).extract()
            # print("tuition_fee: ", tuition_fee)
            if len(tuition_fee) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))

            if item['tuition_fee'] == 0:
                item['tuition_fee'] = None
            else:
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee'] = ", item['tuition_fee'])
            print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])

            alevel = response.xpath(
                "//h3[contains(text(),'A-level')]/..//text()").extract()
            # alevel_str = ''.join(alevel).strip()
            # if alevel_str == "Overall:" or alevel_str == "Overall":
            #     alevel = response.xpath("//h3[contains(text(),'A-level')]/following-sibling::*[position()<4]//text()").extract()
            #     alevel_str = ''.join(alevel).replace("Overall", "").strip().strip(":").strip()
            # print("***alevel")
            item['alevel'] = clear_lianxu_space(alevel)
            # print("item['alevel'] = ", item['alevel'])

            ib = response.xpath(
                "//h3[contains(text(),'International Baccalaureate')]/..//text()"
            ).extract()
            # ib_str = ''.join(ib).strip()
            # if ib_str == "Overall:":
            #     ib = response.xpath("//h3[contains(text(),'International Baccalaureate')]/following-sibling::*[2]//text()").extract()
            #     ib_str = ''.join(ib).strip()
            #     # print("***ib")
            item['ib'] = clear_lianxu_space(ib)
            # print("item['ib'] = ", item['ib'])

            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a+',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Surrey"
        item['url'] = response.meta['url']
        # Degree type
        # item['degree_type'] = 1
        # item['location'] = '01SE01, Senate House, University of Surrey, Guildford, Surrey GU2 7XH'
        # print("item['location'] = ", item['location'])
        print("===============================")
        print(response.url)
        print(response.meta['url'])

        try:

            alevel = response.xpath(
                "//h3[contains(text(),'A-level')]/..//text()").extract()
            # alevel_str = ''.join(alevel).strip()
            # if alevel_str == "Overall:" or alevel_str == "Overall":
            #     alevel = response.xpath("//h3[contains(text(),'A-level')]/following-sibling::*[position()<4]//text()").extract()
            #     alevel_str = ''.join(alevel).replace("Overall", "").strip().strip(":").strip()
            # print("***alevel")
            item['alevel'] = clear_lianxu_space(alevel)
            print("item['alevel'] = ", item['alevel'])

            ib = response.xpath(
                "//h3[contains(text(),'International Baccalaureate')]/..//text()"
            ).extract()
            # ib_str = ''.join(ib).strip()
            # if ib_str == "Overall:":
            #     ib = response.xpath("//h3[contains(text(),'International Baccalaureate')]/following-sibling::*[2]//text()").extract()
            #     ib_str = ''.join(ib).strip()
            #     # print("***ib")
            item['ib'] = clear_lianxu_space(ib)
            print("item['ib'] = ", item['ib'])

            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a+',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Canterbury Christ Church University"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
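            # The offer is whatever element under the 'Entry requirements' heading mentions
            # A level, UCAS points or 'A typical offer'; the trailing 'More entry
            # requirement details' text is stripped afterwards.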
            # ucas_point = response.xpath("//li[@class='iconim points']//b[contains(text(),'UCAS points:')]/../span//text()").extract()
            # print("ucas_point: ", ucas_point)

            alevel = response.xpath(
                "//h3[contains(text(),'Entry requirements')]/following-sibling::*//*[contains(text(), 'A level')]//text()|"
                "//h3[contains(text(),'Entry requirements')]/following-sibling::*//*[contains(text(), ' UCAS Tariff points')]//text()|"
                "//h3[contains(text(),'Entry requirements')]/following-sibling::*//*[contains(text(), 'UCAS points')]//text()|"
                "//h3[contains(text(),'Entry requirements')]/following-sibling::*//*[contains(text(), 'A typical offer')]//text()"
            ).extract()
            # del_re = re.findall(r"More entry requirement details.*", ''.join(alevel))
            # print("del_re: ", del_re)
            item['alevel'] = clear_lianxu_space(alevel).replace(
                "More entry requirement details", "").replace(".", "").strip()
            print("item['alevel']: ", item['alevel'])

            # ib = response.xpath(
            #     "//h5[contains(text(),'EU/International students')]/following-sibling::table//td[contains(text(),'International Baccalaureate')]/following-sibling::td//text()|"
            #     "//p[contains(text(),'International Baccalaureate')]//text()|"
            #     "//strong[contains(text(),'International Baccalaureate:')]/../span//text()").extract()
            # if len(ib) == 0:
            #     ib = response.xpath(
            #         "//td[contains(text(),'International Baccalaureate')]/following-sibling::td//text()").extract()
            # item['ib'] = clear_lianxu_space(ib)
            # print("item['ib']: ", item['ib'])

            yield item

        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Bath Spa University"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
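            # Match the 'Typical offers' panel, then any span/li inside it that mentions
            # A Level (several spelling variants) for the A-level offer, and the
            # International Baccalaureate line for the IB offer.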
            item['alevel'] = None
            alevel = response.xpath(
                "//h3[@class='title'][contains(text(),'Typical offers')]/following-sibling::div[1]//span[contains(text(),'A Level')]/..//text()|"
                "//h3[@class='title'][contains(text(),'Typical offers')]/../following-sibling::div[1]//li[contains(text(),'A Level')]//text()|"
                "//h3[@class='title'][contains(text(),'Typical offers')]/following-sibling::div[1]//span[contains(text(),'A-level')]/..//text()|"
                "//h3[@class='title'][contains(text(),'Typical offers')]/following-sibling::div[1]//span[contains(text(),'A-Level')]/..//text()"
                "//h3[@class='title'][contains(text(),'Typical offers')]/following-sibling::div[1]//li[contains(text(),'A-level')]//text()|"
                "//h3[@class='title'][contains(text(),'Typical offers')]/following-sibling::div[1]//li[contains(text(),'A Level')]//text()"
            ).extract()
            item['alevel'] = clear_lianxu_space(alevel)
            print("item['alevel']: ", item['alevel'])
            if item['alevel'] == "":
                print("****alevel")

            item['ib'] = None
            ib = response.xpath(
                "//h3[@class='title'][contains(text(),'Typical offers')]/following-sibling::div[1]//span[contains(text(),'International Baccalaureate')]/..//text()|"
                "//h3[@class='title'][contains(text(),'Typical offers')]/following-sibling::div[1]//li[contains(text(),'International Baccalaureate')]//text()"
            ).extract()
            item['ib'] = ' '.join(ib).strip()
            print("item['ib']: ", item['ib'])
            if item['ib'] == "":
                print("****ib")

            yield item

        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Reading"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
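            # A-level text is taken from the elements preceding the 'International
            # Baccalaureate' label; if that yields nothing, fall back to the block after a
            # 'Typical' or 'A level' heading.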
            alevel = response.xpath(
                "//strong[contains(text(),'International Baccalaureate')]/../preceding-sibling::*[position()<last()]//text()"
            ).extract()
            if len(alevel) == 0:
                alevel = response.xpath(
                    "//h4[contains(text(),'Typical')]/following-sibling::*[1]//text()|//h4[contains(text(),'A level')]/following-sibling::*[1]//text()"
                ).extract()
            item['alevel'] = ''.join(alevel).strip()
            if item['alevel'] == "":
                print("alevel 为空")
            print("item['alevel']: ", item['alevel'])

            ib = response.xpath(
                "//h4[contains(text(),'International Baccalaureate')]/following-sibling::*[1]//text()|"
                "//strong[contains(text(),'International Baccalaureate')]/../following-sibling::*[1]//text()"
            ).extract()
            item['ib'] = ''.join(ib).strip()
            # if item['ib'] == "":
            #     print("ib 为空")
            print("item['ib']: ", item['ib'])

            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a+',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
Example #13
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Cardiff Metropolitan University"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
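            # Look inside the block after the 'Entry Requirement' heading: first the
            # paragraph/list item following a 'Degree' label, otherwise the first
            # paragraph or list item of the block.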
            item['alevel'] = None
            # alevel = response.xpath(
            #     "//*[contains(text(),'A levels')]//text()|"
            #     "//*[contains(text(),'A Levels')]//text()").extract()
            alevel = response.xpath(
                "//h3[contains(text(), 'Entry Requirement')]/following-sibling::div[1]//*[contains(text(),'Degree')]/../following-sibling::p[1]//text()|"
                "//h3[contains(text(), 'Entry Requirement')]/following-sibling::div[1]//*[contains(text(),'Degree')]/..//following-sibling::ul[1]/li[1]//text()"
            ).extract()
            if len(alevel) == 0:
                alevel = response.xpath(
                    "//h3[contains(text(), 'Entry Requirement')]/following-sibling::div[1]/p[1]//text()|"
                    "//h3[contains(text(), 'Entry Requirement')]/following-sibling::div[1]//ul[1]/li[1]//text()"
                ).extract()
            item['alevel'] = clear_lianxu_space(alevel)
            print("item['alevel']: ", item['alevel'])

            # ib = response.xpath(
            #     "//strong[contains(text(),'International Baccalaureate:')]/../text()").extract()
            # item['ib'] = clear_lianxu_space(ib)
            # print("item['ib']: ", item['ib'])

            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
Example #14
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Loughborough University"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
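            # Fee and requirements are definition lists: the <dd> following each matching
            # <dt>/label holds the value.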

            tuition_fee = response.xpath(
                "//span[@class='list__text'][contains(text(),'International fee')]/../following-sibling::dd//text()").extract()
            clear_space(tuition_fee)
            # print('tuition_fee: ', tuition_fee)
            if len(tuition_fee) > 0:
                item['tuition_fee_pre'] = '£'
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
            print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])
            print("item['tuition_fee']: ", item['tuition_fee'])

            alevel = response.xpath(
                "//dt[@class='list__item list__item--term'][contains(text(),'A-Level')]/following-sibling::dd//text()").extract()
            # alevel = response.xpath(
            #     "//span[@class='list__text'][contains(text(),'Typical offer')]/../following-sibling::dd//text()").extract()
            clear_space(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[0]).strip()
            print("item['alevel'] = ", item['alevel'])

            ib = response.xpath(
                "//dt[@class='list__item list__item--term'][contains(text(),'IB')]/following-sibling::dd//text()").extract()
            item['ib'] = ''.join(ib).strip()
            print("item['ib'] = ", item['ib'])

            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/"+item['university']+str(item['degree_type'])+".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Portsmouth"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
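            # Entry requirements sit in a '2019 start' block; if no element literally
            # mentions 'A level', fall back to a regex over the flattened block text.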
            rntry_requirements_content = response.xpath(
                "//div[contains(text(),'Entry Requirements')]/../../..//div[contains(text(),'2019 start')]/../../../..//text()"
            ).extract()
            rntry_requirements_str = clear_lianxu_space(
                rntry_requirements_content)

            alevel = response.xpath(
                "//*[contains(text(),'A level')]/text()").extract()
            print("====", alevel)
            if len(alevel) == 0:
                alevel = re.findall(r".{1,45}A\slevels.{1,85}",
                                    rntry_requirements_str)
            clear_space(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[1:]).strip()
                if item['alevel'] == "":
                    item['alevel'] = ''.join(alevel).strip()
            print("item['alevel']: ", item['alevel'])

            # item["ib"] = "Most courses will require between 24 and 31 points in the International Baccalaureate (IB), depending on the degree you apply for."
            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
Example #16
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Aston University"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
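            # A-level and IB values live in labelled divs; several spelling and
            # capitalisation variants are matched, presumably because the labels vary
            # across course pages.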

            alevel = response.xpath(
                "//div[contains(text(),'A level')]/..//text()|//div[contains(text(),'B – B')]/..//text()|"
                "//div[@class='course-details__dt'][contains(text(),'A Level')]/..//text()|"
                "//div[contains(text(),'A-levels')]/..//text()").extract()
            item['alevel'] = clear_lianxu_space(alevel)
            print("item['alevel']: ", item['alevel'])

            ib = response.xpath(
                "//div[contains(text(),'International baacalaureate diploma')]/..//text()|"
                "//div[contains(text(),'International Baccalaureate')]/..//text()|"
                "//div[contains(text(),'International baccalaureate')]/..//text()"
            ).extract()
            if len(ib) == 0:
                ib = response.xpath(
                    "//strong[contains(text(),'International Baccalaureate')]/../following-sibling::*[1]//text()"
                ).extract()
            item['ib'] = clear_lianxu_space(ib)
            print("item['ib']: ", item['ib'])

            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "The University of Edinburgh"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])

        try:
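            # Requirements are in the collapsible entry panel; take the last text node of
            # the 'A Levels:' item and the text of the 'IB' / 'IB:' entries.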
            alevel = response.xpath(
                "//div[@id='proxy_collapseEntry']//li[contains(text(),'A Levels:')]//text()|//div[@id='proxy_collapseEntry']//p[contains(text(),'A levels:')]//text()"
            ).extract()
            clear_space(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[-1]).strip()
            print("item['alevel'] = ", item['alevel'])

            # ib = response.xpath(
            #     "//html//ul[1]/li[3]/abbr[contains(text(),'IB')]/..//text()|//p[contains(text(),'IB:')]//text()").extract()
            ib = response.xpath(
                "//html//ul[3]/li[3]/abbr[contains(text(),'IB')]/..//text()|//div[@id='proxy_collapseEntry']//p[contains(text(),'IB:')]//text()"
            ).extract()
            clear_space(ib)
            if len(ib) > 0:
                item['ib'] = ''.join(ib).strip()
            print("item['ib'] = ", item['ib'])

            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")
            print("异常:", str(e))
            print("报错url:", response.url)
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Wales Trinity Saint David"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
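            # Prefer text inside the #collapseEntryCriteria panel that mentions UCAS
            # points or A Level; otherwise fall back to a page-wide search.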
            # ucas_point = response.xpath("//li[@class='iconim points']//b[contains(text(),'UCAS points:')]/../span//text()").extract()
            # print("ucas_point: ", ucas_point)

            alevel = response.xpath(
                "//div[@id='collapseEntryCriteria']//*[contains(text(),'UCAS points')]//text()|"
                "//div[@id='collapseEntryCriteria']//*[contains(text(),'A Level')]//text()|"
                "//div[@id='collapseEntryCriteria']//*[contains(text(),'A level')]//text()"
            ).extract()
            if len(alevel) == 0:
                alevel = response.xpath(
                    "//*[contains(text(),'UCAS Points')]//text()|"
                    "//*[contains(text(),'UCAS points')]//text()|"
                    "//ul[@type='disc']/preceding-sibling::*[1]//text()|//ul[@type='disc']//text()"
                ).extract()
            item['alevel'] = clear_lianxu_space(alevel)
            print("item['alevel']: ", item['alevel'])

            yield item

        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Salford"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
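            # Requirements are in a table: the <td> following the 'A level' cell holds the
            # grades, with the 'UCAS tariff points' row as a fallback.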

            alevel = response.xpath(
                "//*[contains(text(),'A level')]/following-sibling::td//text()"
            ).extract()
            if len(alevel) == 0:
                alevel = response.xpath(
                    "//*[contains(text(),'UCAS tariff points')]/following-sibling::td//text()"
                ).extract()
            item['alevel'] = clear_lianxu_space(alevel)
            print("item['alevel']: ", item['alevel'])

            ib = response.xpath(
                "//*[contains(text(),'International Baccalaureate')]/following-sibling::td//text()"
            ).extract()
            item['ib'] = clear_lianxu_space(ib)
            print("item['ib']: ", item['ib'])

            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
Example #20
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.port.ac.uk/"
        item['university'] = "University of Portsmouth"
        item['url'] = response.url
        # Degree type
        item['degree_type'] = 1
        item['location'] = 'University House, Winston Churchill Avenue, Portsmouth PO1 2UP'
        print("===========================")
        print(response.url)
        try:
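            # Full course-page parse: programme, degree name, UCAS code, start date,
            # duration, location, fees, overview, careers, entry requirements (IELTS and
            # A levels), modules, assessment and application details.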
            # //div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1
            programme = response.xpath("//h1[@class='Title']/text()").extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            # //div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1
            degree_type = response.xpath(
                "//h1[@class='Title']/small//text()").extract()
            item['degree_name'] = ''.join(degree_type).replace("(Hons)",
                                                               "").strip()
            print("item['degree_name']: ", item['degree_name'])

            ucascode = response.xpath(
                "//nobr[contains(text(),'UCAS Code')]/../following-sibling::*//text()"
            ).extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).replace("UCAS Code:",
                                                         "").strip()
            # print("item['ucascode'] = ", item['ucascode'])

            item['start_date'] = response.meta.get(response.url)
            # print("item['start_date'] = ", item['start_date'])

            # //div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1
            # department = response.xpath(
            #     "//dt[contains(text(), 'Department')]/following-sibling::dd[1]//text()").extract()
            # item['department'] = ''.join(department)
            # print("item['department']: ", item['department'])

            # //div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1
            duration = response.xpath(
                "//div[contains(text(),'Duration')]/following-sibling::*//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_str = ''.join(duration)
            item['other'] = duration_str

            duration_list = getIntDuration(duration_str)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            location = response.xpath(
                "//div[contains(text(),'Location')]/following-sibling::*//text()"
            ).extract()
            item['location'] = ''.join(location)
            # print("item['location']: ", item['location'])

            # //strong[contains(text(),'International students')]/../following-sibling::p[1]
            tuition_fee = response.xpath(
                "//h3[contains(text(),'Tuition fees')]/..//*[contains(text(),'International students')]//text()"
            ).extract()
            clear_space(tuition_fee)
            # print("tuition_fee: ", tuition_fee)
            tuition_fee_re = re.findall(r"£\d+,\d+", ''.join(tuition_fee))
            # print("tuition_fee_re: ", tuition_fee_re)
            if len(tuition_fee_re) > 0:
                item['tuition_fee'] = int(tuition_fee_re[0].replace(
                    ",", "").replace("£", "").strip())
            # print("item['tuition_fee']: ", item['tuition_fee'])

            overview = response.xpath(
                """//h2[@id='overview']/..|//h3[contains(text(),"What you'll experience")]/..|
            //h3[contains(text(),'What you’ll experience')]/..|//*[contains(text(),"What you'll experience")]/../.."""
            ).extract()
            item['overview_en'] = remove_class(
                clear_lianxu_space(overview)).replace(
                    "<h3>What you'll experience</h3>", "").strip()
            print("item['overview_en']: ", item['overview_en'])

            career = response.xpath(
                "//h3[contains(text(),'Careers and opportunities')]/.."
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            rntry_requirements_content = response.xpath(
                "//div[contains(text(),'Entry Requirements')]/../../..//div[contains(text(),'2019 start')]/../../../..//text()"
            ).extract()
            rntry_requirements_str = clear_lianxu_space(
                rntry_requirements_content)

            ieltsList = response.xpath(
                "//*[contains(text(),'English language proficiency')]/text()|"
                "//*[contains(text(),'English Language proficiency')]/text()"
            ).extract()
            # print(ieltsList)
            if len(ieltsList) == 0:
                ieltsList = re.findall(r".{1,45}IELTS.{1,85}",
                                       rntry_requirements_str)
            clear_space(ieltsList)
            if len(ieltsList) > 0:
                item['ielts_desc'] = ''.join(ieltsList[1:]).strip()
                if item['ielts_desc'] == "":
                    item['ielts_desc'] = ''.join(ieltsList).strip()
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            alevel = response.xpath(
                "//*[contains(text(),'A levels')]/text()").extract()
            # print(ieltsList)
            if len(alevel) == 0:
                alevel = re.findall(r".{1,45}A\slevels.{1,85}",
                                    rntry_requirements_str)
            clear_space(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[1:]).strip()
                if item['alevel'] == "":
                    item['alevel'] = ''.join(alevel).strip()
            print("item['alevel']: ", item['alevel'])

            modules = response.xpath(
                "//h2[@id='What youll study']/..|//h2[@id='What youll study']/../following-sibling::div[1]|//div[contains(text(),'Units currently being studied')]/../../.."
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            teaching_assessment = response.xpath(
                "//h2[@id='Teaching']/..|//h2[@id='Teaching']/../following-sibling::*[1]|//h2[@id='How youre assessed']/..|//h2[@id='How youre assessed']/../following-sibling::*[1]"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(teaching_assessment))
            # print("item['assessment_en']: ", item['assessment_en'])

            apply_proces_en = response.xpath(
                "//h2[@id='Apply']/..|//h2[@id='Apply']/../following-sibling::*"
            ).extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(apply_proces_en))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            item['apply_documents_en'] = remove_class(
                clear_lianxu_space([
                    """<h2 style="color: #384047; margin: 33px 0px 0.7em; padding: 0px;">What you'll need to send us</h2>
<p style="color: #384047; margin: 0px 0px 25px; border: none;">When you apply to join us, we'll need to see the following documents:</p>
<ul style="color: #384047; margin: 0px 0px 25px; padding-left: 35px; border: none; list-style-image: initial;">
    <li style="margin-top: 0px;">A completed application form</li>
    <li>A Personal Statement or Statement of Purpose</li>
    <li>Officially certified and translated copies of your high school or college qualification and grades (for undergraduate courses)</li>
    <li>Officially certified and translated copies of your degree qualification and grades (for Postgraduate courses)</li>
    <li>Proof of your English language level (such as an IELTS Certificate)</li>
    <li style="margin-bottom: 0px;">One academic reference on official headed paper for undergraduate courses or two references for postgraduate courses</li>
</ul>"""
                ]))
            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<h3>Undergraduate courses</h3>
<p>If you've completed the Chinese Senior High School Diploma plus one year at a recognised university in China, we'll consider you for admission onto an undergraduate course such as a Bachelor's degree. You must have studied relevant subjects and achieved strong grades.</p>
<p>If you don't have a Chinese Senior High School Diploma, you can apply with:</p>
<h4>A levels</h4>
<ul>
    <li>Most courses will require 120 UCAS points. Your A level grades should equal or exceed the total points required. You can use the&nbsp;<a rel="noopener noreferrer" rel="noopener noreferrer" href="https://www.ucas.com/ucas/tariff-calculator"></a><a rel="noopener noreferrer" href="https://www.ucas.com/ucas/tariff-calculator" target="_blank">UCAS Tariff Calculator</a>&nbsp;to work out your total points. Please check your specific course page to find the exact number of points.</li>
    <li>Some courses will require you to have studied specific subjects at A level. For example, to study a science course you will usually need to have achieved passing grades in scientific subjects at A level.</li>
    <li>A level points: A* = 56 A = 48 B = 40 C = 32 D = 24.</li>
</ul>
<h4>International Baccalaureate</h4>
<ul>
    <li>Most courses will require between 24 and 31 points in the International Baccalaureate (IB), depending on the degree you apply for.</li>
</ul>
<p>You may also be considered for advanced entry onto a relevant undergraduate degree programme if you have a College Graduation Diploma (Dazhuan) from a recognised university or college on completion of two to three years of study, or a BTEC HND or SQA HND Higher National Diploma in a relevant subject.</p>
<p>You may be able to join an undergraduate course with other qualifications. We do consider qualifications from a range of sources. Contact us to find out more.</p>"""
                ]))
            item["ib"] = "Most courses will require between 24 and 31 points in the International Baccalaureate (IB), depending on the degree you apply for."
            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
Example #21
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "The University of Edinburgh"
        # item['country'] = 'England'
        # item['website'] = 'https://www.ed.ac.uk/'
        item['url'] = response.url
        # Degree type
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        try:
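            # Full course-page parse for Edinburgh: programme and degree name, department,
            # UCAS code, duration, overview, modules, assessment, careers, A-level / IB,
            # IELTS / TOEFL, and tuition fee (fetched from a linked fees page).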
            # Programme
            programme = response.xpath(
                "//h1[@itemprop='headline']//text()").extract()
            clear_space(programme)
            programme_en = ''.join(programme).strip()

            degree_name = re.findall(r"^.*?\s", programme_en)
            if len(degree_name) > 0:
                item['degree_name'] = degree_name[0].strip()
            print("item['degree_name']: ", item['degree_name'])

            item['programme_en'] = programme_en.replace(
                item['degree_name'], '').strip()
            print("item['programme_en']: ", item['programme_en'])

            department = response.xpath(
                "//div[@id='proxy_rightSummary']//p//span[contains(text(),'College:')]/../text()"
            ).extract()
            clear_space(department)
            item['department'] = ''.join(department).strip()
            # print("item['department']: ", item['department'])

            ucascode = response.xpath(
                "//span[contains(text(),'UCAS code:')]/../text()").extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).strip()
            # print("item['ucascode']: ", item['ucascode'])

            duration = response.xpath(
                "//span[contains(text(),'Duration:')]/../text()").extract()
            clear_space(duration)
            # print("duration: ", duration)

            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration']: ", item['duration'])
            # print("item['duration_per']: ", item['duration_per'])

            # //div[@class='col-xs-12']//div[@class='row']//div[@class='col-xs-12']//ul[@class='addressList']//li[@class='contactCampus']
            # location = response.xpath(
            #     "//div[@class='col-xs-12']//div[@class='row']//div[@class='col-xs-12']//ul[@class='addressList']//li[@class='contactCampus']/text()").extract()
            # clear_space(location)
            item['location'] = '33 Buccleuch Place, City, Edinburgh, Post Code. EH8 9JS'
            # print("item['location']: ", item['location'])

            # # //option[@value='0010']
            # start_date = response.xpath(
            #     "//select[@name='code2']//option//text()").extract()
            # clear_space(start_date)
            # if len(start_date) > 1:
            #     item['start_date'] = start_date[0].strip()
            # # print("item['start_date']: ", item['start_date'])
            # item['start_date'] = getStartDate(item['start_date'])
            # print("item['start_date'] = ", item['start_date'])

            overview = response.xpath(
                "//div[@id='proxy_introduction']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en']: ", item['overview_en'])

            # //div[@id='proxy_collapseprogramme']
            modules = response.xpath(
                "//div[@id='proxy_collapseWhatStudy']/..").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(
                list(modules)))
            # print("item['modules_en']: ", item['modules_en'])

            assessment_en = response.xpath(
                "//div[@id='proxy_collapseLearning']/..").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            career = response.xpath(
                "//div[@id='proxy_collapseCareers']/..").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            # //div[@id='proxy_collapseentry_req']
            # entry_requirements = response.xpath(
            #     "//div[@id='proxy_collapseentry_req']/..//text()").extract()
            # item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            alevel = response.xpath(
                "//li[contains(text(),'A Levels:')]//text()|//p[contains(text(),'A levels:')]//text()"
            ).extract()
            clear_space(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[-1]).strip()
            print("item['alevel'] = ", item['alevel'])

            # ib = response.xpath(
            #     "//html//ul[1]/li[3]/abbr[contains(text(),'IB')]/..//text()|//p[contains(text(),'IB:')]//text()").extract()
            ib = response.xpath(
                "//html//ul[3]/li[3]/abbr[contains(text(),'IB')]/..//text()|//p[contains(text(),'IB:')]//text()"
            ).extract()
            clear_space(ib)
            if len(ib) > 0:
                item['ib'] = ''.join(ib).strip()
            print("item['ib'] = ", item['ib'])

            IELTS = response.xpath(
                "//abbr[contains(text(),'IELTS')]/..//text()").extract()
            item['ielts_desc'] = ''.join(IELTS)
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            TOEFL = response.xpath(
                "//abbr[contains(text(),'TOEFL')]/..//text()").extract()
            if len(TOEFL) == 0:
                TOEFL = response.xpath(
                    "//*[contains(text(),'TOEFL')]//text()").extract()
            item['toefl_desc'] = ''.join(TOEFL)
            # print("item['toefl_desc']: ", item['toefl_desc'])

            toeflDict = get_toefl(item['toefl_desc'])
            item['toefl'] = toeflDict.get("TOEFL")
            item['toefl_l'] = toeflDict.get("TOEFL_L")
            item['toefl_s'] = toeflDict.get("TOEFL_S")
            item['toefl_r'] = toeflDict.get("TOEFL_R")
            item['toefl_w'] = toeflDict.get("TOEFL_W")
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #         item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            tuition_feeDict = {}
            tuition_fee_url = response.xpath(
                "//html//div[@id='proxy_collapseFees']//p[1]/a/@href").extract(
                )
            print("tuition_fee_url: ", tuition_fee_url)
            if len(tuition_fee_url) > 0:
                tuition_fee_url_str = tuition_fee_url[0]
                fee = self.parse_tuition_fee(tuition_fee_url_str)
                clear_space(fee)
                fee_re = re.findall(r"£\d+,\d+", ''.join(fee))
                print("fee_re: ", fee_re)
                item['tuition_fee'] = getTuition_fee(''.join(fee_re))
                item['tuition_fee_pre'] = "£"
            if item['tuition_fee'] == 0:
                item['tuition_fee'] = None
                item['tuition_fee_pre'] = ""
            print("item['tuition_fee']: ", item['tuition_fee'])

            # https://www.ed.ac.uk/studying/international/country/asia/east-asia/china
            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<p class="lead">Undergraduate entry requirements for students from China.</p>


  <h2>Senior High School Certificate</h2>

<p>Students who have completed the Chinese Senior High School Certificate are required to undertake further study for entry to most subjects as this qualification does not normally meet our minimum entry requirements.</p>

<p>We accept the following qualifications for direct entry to our undergraduate degree programmes:</p>

<ul>
	<li><a class="uoe-node-link uoe-published" href="/studying/undergraduate/entry-requirements/ruk/a-levels" title="A Levels"><abbr title="General Certificate of Education">GCE</abbr> <abbr title="Advanced Level">A Levels</abbr></a></li>
	<li><a class="uoe-node-link uoe-published" href="/studying/undergraduate/entry-requirements/international/ib" >International Baccalaureate</a></li>
	<li><a class="uoe-node-link uoe-published" href="/studying/undergraduate/entry-requirements/scottish-qualifications/highers" title="SQA Highers and Advanced Highers">Scottish qualifications</a></li>
	<li><a class="uoe-node-link uoe-published" href="/studying/international/country/americas/united-states-of-america" title="United States of America"><abbr title="United States">US</abbr> qualifications</a></li>
</ul>

<p>Applicants with qualifications other than those listed above will usually be required to complete a Foundation Year before entering the University.</p>

<p><a class="uoe-node-link uoe-published" href="/studying/international/applying/foundation" title="International Foundation Programme">Foundation year</a></p>

<h2>Science and Engineering</h2>

<p>For degree programmes in Science and Engineering, applicants who have completed a year of study at a leading Chinese University may be eligible to apply.</p>

<p>The College of Science &amp; Engineering will also give consideration to applicants who have achieved excellent results in the Chinese National University Entrance Examination (Gaokao) on an individual basis.</p>

<h2>Further guidance on academic entry requirements</h2>

<p>Each course may have further specific entry requirements. All applicants must meet these requirements. Staff in the Admissions Offices will be able to provide further guidance.</p>

<p><a class="uoe-node-link uoe-published" href="/studying/undergraduate/contacts" title="Contact us with an enquiry about undergraduate study">Undergraduate admissions contacts</a></p>

<h2>English Language requirements</h2>

<p>If your first language is not English, you will also have to meet English Language requirements to apply. These requirements are listed by programme.</p>

<p><a class="uoe-node-link uoe-published" href="/studying/international/english" title="English language requirements">English Language advice</a></p>

<p><a class="uoe-node-link uoe-published" href="/studying/undergraduate/degrees" title="Degree finder">Specific English language requirement by programme</a></p>

<h2>Contact us</h2>

<p>Edinburgh Global's representative for China is Esther Sum.</p>

<p>Esther will help you with admissions advice and support.</p>

<p><a href="mailto:[email protected]">Contact us by email - [email protected]</a></p>

<h2>Support in your country</h2>

<p><a class="uoe-node-link uoe-published" href="/studying/international/application/our-visits-overseas" title="Our visits overseas">View a list of our overseas visits</a></p>

<p><a class="uoe-node-link uoe-published" href="/studying/international/agents/list/china" title="China">Our agents in your country</a></p>

<h2>Chat to us</h2>

<p>Talk to a member of staff online and view a presentation about study in Edinburgh.</p>

<p><a class="uoe-node-link uoe-published" href="/studying/international/application/chat-to-us-online" title="Online information sessions">Chat to us</a></p>

<h2>Join our mailing list</h2>

<p>We will send you further useful information about the University, admissions and entry.</p>

<p><a href="http://r1.dotmailer-surveys.com/0127judf-2e1gig1f">Join our mailing list </a></p>

<h2>About Edinburgh</h2>

<p><a class="uoe-node-link uoe-published" href="/about" title="About">More information about Edinburgh</a></p>

<p><a class="uoe-node-link uoe-published" href="/global/immigration/applying-for-visa/visa-requirements" >Do I need a visa?</a></p>

<h2>Student numbers</h2>

<p>There are almost 3,000 students from China currently studying at the University of Edinburgh.</p>
"""
                ]))
            # print("item['require_chinese_en']: ", item['require_chinese_en'])

            item['apply_proces_en'] = "https://www.ed.ac.uk/studying/undergraduate/applying"

            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")
            print("异常:", str(e))
            print("报错url:", response.url)
Example #22
0
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.bathspa.ac.uk/"
        item['university'] = "Bath Spa University"
        item['url'] = response.url
        # Degree type
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            item['location'] = 'Bath'
            # Programme and degree type: //div[@class='masthead-inner']/div/div[@class='masthead-content']/h1
            programme = response.xpath(
                "//div[@class='masthead-inner']/div/div[@class='masthead-content']/h1//text()"
            ).extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            degree_type = response.xpath(
                "//div[@class='masthead-inner']/div/div[@class='masthead-content']/p[1]//text()"
            ).extract()
            item['degree_name'] = ''.join(degree_type).replace("(Hons)",
                                                               "").strip()
            print("item['degree_name']: ", item['degree_name'])

            # //dt[contains(text(),'School')]/following-sibling::dd[1]
            department = response.xpath(
                "//dt[contains(text(),'School')]/following-sibling::dd[1]//text()"
            ).extract()
            item['department'] = ''.join(department).strip()
            # print("item['department']: ", item['department'])

            location = response.xpath(
                "//dt[contains(text(),'Campus or location')]/following-sibling::dd[1]//text()"
            ).extract()
            item['location'] = ''.join(location).strip()
            # print("item['location']: ", item['location'])

            modules = response.xpath(
                "//h3[contains(text(),'Course structure')]/..|//h3[contains(text(),'Course modules')]/..|//h2[contains(text(),'Course modules')]/.."
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            career = response.xpath(
                "//h3[contains(text(),'Career')]/..").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            feeContent = response.xpath(
                "//h3[contains(text(),'International students full time')]/../div/table[1]//td[contains(text(), 'Year')]/following-sibling::td//text()"
            ).extract()
            clear_space(feeContent)
            # print(feeContent)
            if len(feeContent) > 0:
                item['tuition_fee'] = int(feeContent[0].replace(
                    "£", "").replace(",", "").strip())
            # print("item['tuition_fee']: ", item['tuition_fee'])

            alevel = response.xpath(
                "//span[contains(text(),'A Level')]/..//text()|//li[contains(text(),'A Level')]//text()"
            ).extract()
            item['alevel'] = ''.join(alevel).strip()
            # print("item['alevel']: ", item['alevel'])
            # if item['alevel'] == "":
            #     print("****alevel")

            ib = response.xpath(
                "//span[contains(text(),'International Baccalaureate')]/..//text()|"
                "//li[contains(text(),'International Baccalaureate')]//text()"
            ).extract()
            item['ib'] = ''.join(ib).strip()
            # print("item['ib']: ", item['ib'])
            # if item['ib'] == "":
            #     print("****ib")

            # //div[@class='content']/div[@class='collapsible-content highlighted']/div[2]/div[2]

            ieltsList = response.xpath(
                "//*[contains(text(),'IELTS')]//text()").extract()
            item['ielts_desc'] = ''.join(ieltsList).strip()
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            interview_desc_en = response.xpath(
                "//h3[contains(text(),'Interview and portfolio guidance')]/..|"
                "//h3[contains(text(),'Portfolio and interview')]/..").extract(
                )
            item['interview_desc_en'] = remove_class(
                clear_lianxu_space(interview_desc_en))
            # print("item['interview_desc_en']: ", item['interview_desc_en'])
            # if item['interview_desc_en'] == "":
            #     print("****interview_desc_en")

            portfolio_desc_en = response.xpath(
                "//h3[contains(text(),'Interview and portfolio guidance')]/..|"
                "//h3[contains(text(),'Portfolio')]/..").extract()
            item['portfolio_desc_en'] = remove_class(
                clear_lianxu_space(portfolio_desc_en))
            # print("item['portfolio_desc_en']: ", item['portfolio_desc_en'])
            # if item['portfolio_desc_en'] == "":
            #     print("****portfolio_desc_en")

            # https://www.bathspa.ac.uk/international/country-advice/china/

            item['require_chinese_en'] = "<p><strong>Undergraduate</strong></p><ul><li>Senior Secondary School Graduation Certificate with a grade of 70% and a Foundation Certification from a recognised institution.</li></ul><p><strong>Undergraduate - Year 2 or 3 entry</strong></p><ul><li>Students with a Dazhuan Certificate will be considered for Year 3 entry on an individual basis.&nbsp;</li></ul>"

            # https://www.bathspa.ac.uk/applicants/how-to-apply/postgraduate/
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<div class="intro-text">
	<p class="intro">We’re delighted you’re applying to study with us. The process is different based on your location and mode of study. Here’s what you need to do.</p>
</div><div class="rich-text" >
  <div data-hash-anchor='<a id="d.en.1281"></a>'></div>
    <div>
        <h2>UCAS applicants</h2>
<p>If you fit the following criteria, you’ll need to apply through the Universities and Colleges Admissions Service (UCAS):</p>
<ul>
<li>You’re applying directly out of sixth form or college;</li>
<li>You want to study full-time;</li>
<li>You don’t already hold an undergraduate qualification and are from the UK, EU or Channel Islands.</li>
</ul>
<p><strong>The official UCAS deadline for 2018/19 applications to any course: 15 January 2018.</strong></p>
<p>You’ll need some information from your course's webpage, including Bath Spa University’s institution code: BASPA B20.</p>
<p>Read more about <a href="/applicants/how-to-apply/undergraduate-and-foundation/how-to-apply-through-ucas/">how to </a><a href="/applicants/how-to-apply/undergraduate-and-foundation/how-to-apply-through-ucas/">apply through UCAS</a> or just get started. You’ll need to register or login to the UCAS site. &nbsp;</p>
<p><a href="https://www.ucas.com/ucas/undergraduate/ucas-undergraduate-apply-and-track">Apply via UCAS</a></p>
<h2>International applicants</h2>
<p>You can apply for one of our undergraduate courses online from the course’s webpage.&nbsp;You’ll be asked to create an online account.</p>
<p>Don’t have time to complete your whole application? Don’t worry, you can save your application and come back to it at anytime.</p>
<p>Alternatively, you can also <a href="https://www.ucas.com/ucas/undergraduate/ucas-undergraduate-apply-and-track">apply via UCAS</a>.</p>
<p>Entry requirements are listed on the course pages. As part of the process you will be required to provide evidence to support your application.&nbsp;Please see our <a href="/international/">international</a> webpages for more information for international students, including entry requirements and visa advice specific to your country.</p>
<p><a href="/courses/">Search for your course</a></p>
<h2>Applying for part-time study</h2>
<p>If you’d like to study part-time, you’ll need to apply online directly with us, rather than through UCAS. &nbsp;</p>
<p><strong>Click the 'apply now' button on the webpage for the course you’d like to study.</strong></p>
<h2>Already hold an undergraduate degree?</h2>
<p>If you already have a degree or higher qualification than that for which you are applying, your fee requirements may be different, due to the way government University funding is distributed. Please check the Equivalent or Lower Qualification (ELQ) policy&nbsp;for more details.<br><br>This also applies to students who progress to the third year of study, following completion of a Foundation Degree. Please note that Foundation Degrees are currently exempt from higher fees.</p>
    </div>
</div>"""
                ]))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            ucascode = response.xpath(
                "//dd[contains(text(),'Course Code:')]//text()").extract()
            clear_space(ucascode)
            print("ucascode: ", ucascode)
            item['ucascode'] = ''.join(ucascode).replace("Course Code:",
                                                         "").strip()
            print("len: ", len(ucascode))
            print("item['ucascode'] = ", item['ucascode'])

            # duration
            durationMode = response.xpath(
                "//dt[contains(text(),'Course length')]/following-sibling::dd[1]//text()"
            ).extract()
            clear_space(durationMode)
            print("durationMode: ", durationMode)
            durationMode = ''.join(durationMode)
            duration_list = getIntDuration(durationMode.strip())
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            print("item['duration']: ", item['duration'])
            print("item['duration_per']: ", item['duration_per'])
            item['other'] = durationMode
            print("item['other']: ", item['other'])

            if "or" in item['ucascode']:
                ucascode_list1 = item['ucascode'].split("or")
                print("ucascode_list1", ucascode_list1)

                # Split the duration text to match each UCAS code
                if ", or" in item['other']:
                    duration_list1 = item['other'].split(", or")
                else:
                    duration_list1 = [item['other'], item['other']]
                print("duration_list1: ", duration_list1)
                for u in range(len(ucascode_list1)):
                    item['ucascode'] = ucascode_list1[u].strip()
                    duration_list = getIntDuration(duration_list1[u].strip())
                    if len(duration_list) == 2:
                        item['duration'] = duration_list[0]
                        item['duration_per'] = duration_list[-1]
                    print("item['duration']: ", item['duration'])
                    print("item['duration_per']: ", item['duration_per'])
                    # Two cases: the first is the standard route, the second is the placement-year route
                    if u == 0:
                        overview = response.xpath(
                            """//h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'Overview')]/.."""
                        ).extract()
                        if len(overview) == 0:
                            overview = response.xpath(
                                """//h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'overview')]/.."""
                            ).extract()
                        item['overview_en'] = remove_class(
                            clear_lianxu_space(overview))
                        # print("item['overview_en']1: ", item['overview_en'])

                        assessment_en = response.xpath(
                            """//h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'How will I be assessed?')]/..|
                            //h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'How will I be taught?')]/..|
                            //h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'Assessment')]/.."""
                        ).extract()
                        item['assessment_en'] = remove_class(
                            clear_lianxu_space(assessment_en))
                        # print("item['assessment_en']1: ", item['assessment_en'])
                    elif u == 1:
                        overview = response.xpath(
                            """//h2[contains(text(),"Professional placement year")]/following-sibling::div//h3[contains(text(),'Overview')]/.."""
                        ).extract()
                        if len(overview) == 0:
                            overview = response.xpath(
                                """//h2[contains(text(),"Professional placement year")]/following-sibling::div//h3[contains(text(),'overview')]/.."""
                            ).extract()
                            if len(overview) == 0:
                                overview = response.xpath(
                                    """//h3[contains(text(),'Overview')]/..|//h3[contains(text(),'overview')]/.."""
                                ).extract()
                        item['overview_en'] = remove_class(
                            clear_lianxu_space(overview))
                        # print("item['overview_en']2: ", item['overview_en'])

                        assessment_en = response.xpath(
                            "//h2[contains(text(),'Professional placement year')]/following-sibling::div//h3[contains(text(),'How will I be assessed?')]/..|"
                            "//h2[contains(text(),'Professional placement year')]/following-sibling::div//h3[contains(text(),'How will I be taught?')]/..|"
                            "//h2[contains(text(),'Professional placement year')]/following-sibling::div//h3[contains(text(),'Assessment')]/.."
                        ).extract()
                        if len(assessment_en) == 0:
                            assessment_en = response.xpath(
                                "//h3[contains(text(),'How will I be assessed?')]/..|"
                                "//h3[contains(text(),'How will I be taught?')]/..|"
                                "//h3[contains(text(),'Assessment')]/.."
                            ).extract()
                        item['assessment_en'] = remove_class(
                            clear_lianxu_space(assessment_en))
                        # print("item['assessment_en']2: ", item['assessment_en'])
                    yield item
            else:
                overview = response.xpath(
                    """//h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'Overview')]/.."""
                ).extract()
                if len(overview) == 0:
                    overview = response.xpath(
                        """//h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'overview')]/.."""
                    ).extract()
                item['overview_en'] = remove_class(
                    clear_lianxu_space(overview))
                # print("item['overview_en']1: ", item['overview_en'])

                assessment_en = response.xpath(
                    """//h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'How will I be assessed?')]/..|
                    //h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'How will I be taught?')]/..|
                    //h2[contains(text(),"What you'll learn")]/following-sibling::div//h3[contains(text(),'Assessment')]/.."""
                ).extract()
                item['assessment_en'] = remove_class(
                    clear_lianxu_space(assessment_en))
                # print("item['assessment_en']1: ", item['assessment_en'])
                yield item

        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)


#            department_dict = {"arts management":"Bath Business School","accounting and finance":"Bath Business School",
# "business and management":"Bath Business School",
# "business and management (accounting)":"Bath Business School",
# "business and management (entrepreneurship)":"Bath Business School",
# "business and management (international business)":"Bath Business School",
# "business and management (marketing)":"Bath Business School",
# "curatorial practice":"Bath School of Art and Design",
# "design (ceramics)":"Bath School of Art and Design",
# "design (fashion and textiles)":"Bath School of Art and Design",
# "fine art":"Bath School of Art and Design",
# "visual communication":"Bath School of Art and Design",
# "children's publishing":"College of Liberal Arts",
# "classical acting":"College of Liberal Arts",
# "composition":"College of Liberal Arts",
# "creative producing":"College of Liberal Arts",
# "creative writing":"College of Liberal Arts",
# "creative writing phd":"College of Liberal Arts",
# "crime and gothic fictions":"College of Liberal Arts",
# "dance":"College of Liberal Arts",
# "directing":"College of Liberal Arts",
# "directing circus":"College of Liberal Arts",
# "environmental humanities":"College of Liberal Arts",
# "environmental management":"College of Liberal Arts",
# "feature filmmaking":"College of Liberal Arts",
# "heritage management":"College of Liberal Arts",
# "intercultural musicology":"College of Liberal Arts",
# "liberal arts":"College of Liberal Arts",
# "literature, landscape and environment":"College of Liberal Arts",
# "music performance":"College of Liberal Arts",
# "performing shakespeare":"College of Liberal Arts",
# "principles of applied neuropsychology":"College of Liberal Arts",
# "scriptwriting":"College of Liberal Arts",
# "songwriting (campus based)":"College of Liberal Arts",
# "songwriting (distance learning)":"College of Liberal Arts",
# "sound (arts)":"College of Liberal Arts",
# "sound (design)":"College of Liberal Arts",
# "sound (production)":"College of Liberal Arts",
# "theatre for young audiences":"College of Liberal Arts",
# "transnational writing":"College of Liberal Arts",
# "travel and nature writing":"College of Liberal Arts",
# "writing for young people":"College of Liberal Arts",
# "counselling and psychotherapy practice":"Institute for Education",
# "education (education studies)":"Institute for Education",
# "education (early childhood studies)":"Institute for Education",
# "education (international education)":"Institute for Education",
# "education (leadership and management)":"Institute for Education",
# "inclusive education":"Institute for Education",
# "professional practice":"Institute for Education",
# "professional practice in higher education":"Institute for Education",
# "teaching english to speakers of other languages":"Institute for Education",
# "specific learning difficulties / dyslexia":"Institute for Education",
# "national award for special educational needs coordination":"Institute for Education",
# "professional doctorate in education":"Institute for Education",
# }
Example #23
0
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Chester"
        item['url'] = response.url
        # Degree type
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        try:
            programme = response.xpath(
                "//h1[@id='main-content']/text()").extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            degree_type = response.xpath(
                "//h1[@id='main-content']/div//text()").extract()
            item['degree_name'] = ''.join(degree_type).replace("(Hons)",
                                                               "").strip()
            print("item['degree_name']: ", item['degree_name'])

            start_date = response.xpath(
                "//select[@id='edit-date']/option//text()|//label[@for='edit-date']/following-sibling::span//text()"
            ).extract()
            clear_space(start_date)
            # print("start_date: ", start_date)
            start_date_str = ""
            if len(start_date) > 0:
                for s in start_date:
                    # start_date_str = getStartDate(s)
                    if getStartDate(s) is not None:
                        start_date_str += getStartDate(s) + ", "
            item['start_date'] = start_date_str.strip().strip(',').strip()
            # print("item['start_date']: ", item['start_date'])

            mode = response.xpath(
                "//select[@id='edit-mode']//text()").extract()
            clear_space(mode)
            # item['teach_time'] = getTeachTime(''.join(mode))
            # print("mode: ", mode)

            location = response.xpath(
                "//label[@for='edit-compulsory']/following-sibling::*//text()"
            ).extract()
            item['location'] = ''.join(location).strip()
            # print("item['location']: ", item['location'])

            ucascode = response.xpath(
                "//dt[contains(text(),'UCAS Code')]/following-sibling::*//text()"
            ).extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).strip()
            print("item['ucascode'] = ", item['ucascode'])

            duration = response.xpath(
                "//dt[contains(text(),'Duration')]/following-sibling::*//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            overview_en = response.xpath(
                "//h3[contains(text(),'Course overview')]/../*[position()<last()]|"
                "//div[@class='m-body__margin-bottom t-course__overview']"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))
            # print("item['overview_en']: ", item['overview_en'])

            entry_requirements = response.xpath(
                "//div[@id='entry-international']//form[@id='courses-international-form']/preceding-sibling::*//text()"
            ).extract()
            # item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            alevel = response.xpath(
                "//td[contains(text(),'GCE A Level')]/following-sibling::*//text()"
            ).extract()
            item['alevel'] = clear_lianxu_space(alevel)
            # print("item['alevel']: ", item['alevel'])

            ib = response.xpath(
                "//td[contains(text(),'International Baccalaureate')]/following-sibling::*//text()"
            ).extract()
            item['ib'] = clear_lianxu_space(ib)
            # print("item['ib']: ", item['ib'])

            ielts_desc = response.xpath(
                "//div[@id='entry-international']//li[contains(text(),'Undergraduate:')]//text()"
            ).extract()
            clear_space(ielts_desc)
            item['ielts_desc'] = ''.join(ielts_desc).strip()
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            assessment_en = response.xpath(
                "//h3[@class='field-label'][contains(text(),'How will I be taught?')]/..|"
                "//h3[@class='field-label'][contains(text(),'How will I be assessed?')]/.."
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            tuition_fee = response.xpath(
                "//div[@class='field-fees-international']/p//text()|"
                "//p[contains(text(),'The tuition fees for international students studyi')]//text()"
            ).extract()
            print("tuition_fee: ", tuition_fee)
            if len(tuition_fee) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee']: ", item['tuition_fee'])
            print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            career_en = response.xpath(
                "//div[@id='careers-career-services']").extract()
            item['career_en'] = remove_class(
                clear_lianxu_space(career_en)).replace("<div></div>",
                                                       "").strip()
            print("item['career_en']: ", item['career_en'])

            modules = re.findall(
                r"function\sinit_drupal_core_settings\(\)\s{jQuery\.extend\(Drupal\.settings,.*}",
                response.text)
            # print("modules: ", modules)
            modules_str = ''.join(modules).replace(
                "function init_drupal_core_settings() {jQuery.extend(Drupal.settings,",
                "").strip()
            modules_dict = json.loads(modules_str)
            # print("modules_dict: ", modules_dict)
            # groupCode     modulesNid
            # print(modules_dict.get("courses"))
            if modules_dict.get('courses').get('groupCode') is not False:
                modules_json = "https://www1.chester.ac.uk/courses/modules/ajax/" + modules_dict.get(
                    'courses').get('modulesNid') + "/" + modules_dict.get(
                        'courses').get('groupCode') + "/389"
                # print("modules_json: ", modules_json)
                mdict = json.loads(requests.get(modules_json).text)
                # print("mdict: ", len(mdict))
                m = mdict[-1].get('data')
                if m is not None:
                    item['modules_en'] = remove_class(clear_lianxu_space([m]))
            # print("item['modules_en']: ", item['modules_en'])

            item['apply_proces_en'] = "https://www1.chester.ac.uk/undergraduate/how-apply/applying-full-time-courses"
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<div class="field-collection-view clearfix view-mode-full">
  <h3 class="field-course-type">
    Undergraduate Study  </h3>

  <ul><li>UK foundation/pathway course with a pass mark of 50% and above.  Engineering courses require an additional mark of at least 55% in a Maths module. </li>
<li>China 3 year National Senior High School Certificate with 80% or above</li>
<li>Gaokao (College Entry Exam) with good grades </li>
<li>Dazhuan considered for entry to 3rd year UG</li>
<li>BFSU Foundation Year at 60% or above</li>
<li>Dongfang International Centre for Education Exchange Top University Foundation Course 60% or above</li>
<li>East and West International Education (EWIE)/ Wiseway Global International Foundation Certificate at 60% or above</li>
<li>Graduation Certificate from a specialised College/School (Zhongzuhan) with 80% or above</li>
</ul></div>"""
                ]))
            # print("item['require_chinese_en']: ", item['require_chinese_en'])

            # department = response.xpath("//dt[contains(text(), 'Department')]/following-sibling::dd[1]//text()").extract()
            # item['department'] = ''.join(department).strip()
            # print("item['department']: ", item['department'])

            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
Example #24
0
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "The University of Sheffield"
        item['url'] = response.meta['url']
        print("===============================")
        print(response.url)
        print(response.meta['url'])
        try:
            # Programme and degree type
            programmeDegree_type = response.xpath(
                "//div[@class='titles']/h2//text()").extract()
            if len(programmeDegree_type) == 0:
                programmeDegree_type = response.xpath(
                    "//main[@class='main content']/h2[1]//text()").extract()
            programmeDegree_type = ''.join(programmeDegree_type).strip()
            print("programmeDegree_type: ", programmeDegree_type)
            degree_typeList = re.findall(r"[A-Za-z/\(\)]*$",
                                         programmeDegree_type)
            # print("degree_typeList: ", degree_typeList)
            programme = programmeDegree_type
            if len(degree_typeList) != 0:
                degree_type = ''.join(list(degree_typeList[0]))
                item['degree_name'] = degree_type
                programme = programmeDegree_type.replace(
                    item['degree_name'], '')
            print("item['degree_name']: ", item['degree_name'])
            item['programme_en'] = ''.join(programme)
            print("item['programme_en']: ", item['programme_en'])

            ucascode = response.xpath("//span[@id='adCode']//text()").extract()
            clear_space(ucascode)
            if len(ucascode) > 0:
                item['ucascode'] = ''.join(ucascode[0]).strip()
            print("item['ucascode'] = ", item['ucascode'])

            alevel = response.xpath(
                "//html//div[@id='courseSummary']//tr/td[contains(text(), 'A Levels')]/following-sibling::td//text()"
            ).extract()
            clear_space(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[0]).strip()
            # print("item['alevel'] = ", item['alevel'])

            ib = response.xpath(
                "//html//div[@id='courseSummary']//tr/td[contains(text(), 'International Baccalaureate')]/following-sibling::td//text()"
            ).extract()
            item['ib'] = ''.join(ib).strip()
            # print("item['ib'] = ", item['ib'])

            tuition_feeDict = {
                "C180": "21450",
                "C200": "21450",
                "C300": "21450",
                "C100": "21450",
                "C109": "21450",
                "C189": "21450",
                "C209": "21450",
                "C309": "21450",
                "C1C9": "21450",
                "C1CX": "21450",
                "C1R9": "21450",
                "C101": "21450",
                "F400": "18900",
                "FV41": "18900",
                "VV46": "18900",
                "VR47": "18900",
                "VR41": "18900",
                "VR42": "18900",
                "F410": "18900",
                "VR44": "18900",
                "QV84": "18900",
                "F401": "18900",
                "KK13": "21450",
                "K100": "21450",
                "ARCU123": "21450",
                "ARCU124": "21450",
                "ARCU13": "21450",
                "ARCU129": "21450",
                "Y001": "16800",
                "H130": "21450",
                "G500": "21450",
                "H690": "21450",
                "H660": "21450",
                "H310": "21450",
                "H360": "21450",
                "H361": "21450",
                "H1NF": "21450",
                "H1NF": "21450",
                "HN62": "21450",
                "OG31": "21450",
                "8L16": "21450",
                "57": "21450",
                "2G36": "21450",
                "8M74": "21450",
                "2A47": "21450",
                "H653": "21450",
                "H659": "21450",
                "B900": "21450",
                "B909": "21450",
                "H810": "21450",
                "H800": "21450",
                "H840": "21450",
                "H8T9": "21450",
                "H8F1": "21450",
                "H8J7": "21450",
                "H801": "21450",
                "F100": "21450",
                "F105": "21450",
                "F107": "21450",
                "F106": "21450",
                "F335": "21450",
                "F109": "21450",
                "F108": "21450",
                "C720": "21450",
                "H210": "21450",
                "HK21": "21450",
                "H2T9": "21450",
                "H200": "21450",
                "H202": "21450",
                "HK2D": "21450",
                "H2N2": "21450",
                "2H26": "21450",
                "8T63": "21450",
                "8L55": "21450",
                "2G91": "21450",
                "H201": "21450",
                "A200": "21450",
                "G600": "21450",
                "G650": "21450",
                "G402": "21450",
                "G400": "21450",
                "GG41": "21450",
                "GG74": "21450",
                "G4G1": "21450",
                "G700": "21450",
                "G490": "21450",
                "G495": "21450",
                "G401": "21450",
                "G651": "21450",
                "GN52": "21450",
                "GN53": "21450",
                "X301": "16800",
                "F401": "18900",
                "Q305": "16800",
                "Q310": "16800",
                "F901": "18900",
                "L701": "18900",
                "V101": "16800",
                "Q307": "16800",
                "V501": "16800",
                "L301": "16800",
                "L401": "16800",
                "K401": "16800",
                "K441": "16800",
                "L790": "16800",
                "QC19": "21450",
                "B990": "21450",
                "C801": "21450",
                "V642": "16800",
                "L432": "16800",
                "T210": "16800",
                "T300": "18900",
                "TN42": "18900",
                "T110": "16800",
                "T415": "16800",
                "TN12": "18900",
                "T1T2": "16800",
                "T4T2": "16800",
                "T1R2": "16800",
                "T2R2": "16800",
                "T1R4": "16800",
                "T2R4": "16800",
                "T1R7": "16800",
                "T2R7": "16800",
                "T1R1": "16800",
                "TV11": "16800",
                "TV21": "16800",
                "L100": "16800",
                "LV15": "16800",
                "LL12": "16800",
                "L101": "16800",
                "LG11": "16800",
                "L1N3": "16800",
                "LIN3": "16800",
                "X300": "16800",
                "X301": "16800",
                "H620": "21450",
                "H621": "21450",
                "H610": "21450",
                "H613": "21450",
                "H614": "21450",
                "H651": "21450",
                "H647": "21450",
                "H645": "21450",
                "H6T9": "21450",
                "H623": "21450",
                "H615": "21450",
                "H616": "21450",
                "H652": "21450",
                "H649": "21450",
                "H622": "21450",
                "H611": "21450",
                "H648": "21450",
                "H629": "21450",
                "H628": "21450",
                "H602": "21450",
                "H603": "21450",
                "H100": "21450",
                "H104": "21450",
                "H675": "21450",
                "H673": "21450",
                "H67I": "21450",
                "H67H": "21450",
                "Q3Q1": "16800",
                "QL33": "16800",
                "QR14": "16800",
                "QR17": "16800",
                "QR32": "16800",
                "QR37": "16800",
                "QV15": "16800",
                "QT12": "16800",
                "Q304": "16800",
                "Q310": "16800",
                "Q305": "16800",
                "Q306": "16800",
                "QR31": "16800",
                "QV31": "16800",
                "QW33": "16800",
                "QV35": "16800",
                "QR34": "16800",
                "QW34": "16800",
                "Q307": "16800",
                "F309": "21450",
                "G109": "21450",
                "QR11": "16800",
                "R120": "16800",
                "RL11": "16800",
                "RL12": "16800",
                "RN12": "16800",
                "RR12": "16800",
                "RR14": "16800",
                "RR17": "16800",
                "RV11": "16800",
                "RV15": "16800",
                "RW13": "16800",
                "R1R9": "16800",
                "R1T2": "16800",
                "R1R7": "16800",
                "R1RR": "16800",
                "R1RO": "16800",
                "L700": "18900",
                "F800": "18900",
                "F902": "18900",
                "F900": "18900",
                "F901": "18900",
                "QR12": "16800",
                "R220": "16800",
                "RL21": "16800",
                "RL22": "16800",
                "RN22": "16800",
                "RR24": "16800",
                "RR27": "16800",
                "RV21": "16800",
                "RV25": "16800",
                "RW23": "18900",
                "R2R9": "16800",
                "R2T2": "16800",
                "R2R7": "16800",
                "R2RR": "16800",
                "R2R3": "16800",
                "R410": "16800",
                "RL42": "16800",
                "RN42": "16800",
                "RL41": "16800",
                "R4T2": "16800",
                "R4R7": "16800",
                "R4RR": "16800",
                "V100": "16800",
                "RV71": "16800",
                "RV41": "16800",
                "VV15": "16800",
                "VL12": "16800",
                "VL13": "16800",
                "V1R9": "16800",
                "V101": "16800",
                "B620": "21450",
                "QC18": "21450",
                "QC19": "21450",
                "P110": "18900",
                "P500": "18900",
                "K3K4": "18900",
                "KC39": "18900",
                "M100": "16800",
                "ML94": "16800",
                "M1R4": "16800",
                "M1R2": "16800",
                "M1R1": "16800",
                "M930": "16800",
                "M120": "16800",
                "N200": "16800",
                "N420": "16800",
                "NG21": "16800",
                "NG41": "16800",
                "NL21": "16800",
                "NL41": "16800",
                "NP21": "16800",
                "NP41": "16800",
                "NT22": "16800",
                "N120": "16800",
                "JH51": "21450",
                "J500": "21450",
                "J5R9": "21450",
                "FH21": "21450",
                "J200": "21450",
                "FHF1": "21450",
                "H403": "21450",
                "H401": "21450",
                "JH5P": "21450",
                "JH56": "21450",
                "J501": "21450",
                "G100": "18900",
                "G103": "18900",
                "GN13": "18900",
                "G102": "18900",
                "G1R4": "18900",
                "G1R1": "18900",
                "G1R2": "18900",
                "G106": "18900",
                "VG51": "18900",
                "A100": "21450",
                "T900": "16800",
                "C400": "21450",
                "C500": "21450",
                "C440": "21450",
                "C700": "21450",
                "C741": "21450",
                "CC45": "21450",
                "CC74": "21450",
                "CC75": "21450",
                "C709": "21450",
                "CC7C": "21450",
                "CC79": "21450",
                "C409": "21450",
                "CC4C": "21450",
                "C749": "21450",
                "C509": "21450",
                "C449": "21450",
                "C790": "21450",
                "C791": "21450",
                "CC47": "21450",
                "CC4R": "21450",
                "C431": "21450",
                "C433": "21450",
                "C521": "21450",
                "C523": "21450",
                "W302": "18900",
                "RW43": "18900",
                "VW53": "18900",
                "WT34": "18900",
                "WT31": "18900",
                "WTH4": "18900",
                "B991": "21450",
                "B740": "21450",
                "B990": "21450",
                "B520": "21450",
                "QV36": "16800",
                "RV26": "16800",
                "QV16": "16800",
                "VW63": "16800",
                "VV56": "16800",
                "VR61": "16800",
                "BIBU08": "16800",
                "V641": "16800",
                "V500": "16800",
                "RV45": "16800",
                "V501": "16800",
                "F300": "21450",
                "F301": "21450",
                "F344": "21450",
                "F350": "21450",
                "FF35": "21450",
                "F371": "21450",
                "F3F5": "21450",
                "FV35": "21450",
                "F321": "21450",
                "F3G4": "21450",
                "F3GK": "21450",
                "F305": "21450",
                "F304": "21450",
                "F3F5": "21450",
                "L210": "16800",
                "LL23": "16800",
                "LV25": "16800",
                "L201": "16800",
                "LL24": "16800",
                "C800": "21450",
                "C802": "21450",
                "C801": "21450",
                "R710": "16800",
                "RL71": "16800",
                "RL72": "16800",
                "RN72": "16800",
                "RR47": "16800",
                "R7R7": "16800",
                "R7RR": "16800",
                "RV75": "16800",
                "RW73": "18900",
                "R7T2": "16800",
                "L300": "16800",
                "LL43": "16800",
                "NL2K": "16800",
                "NL24": "16800",
                "L391": "16800",
                "L301": "16800",
                "L401": "16800",
                "L722": "16800",
                "TRPU105": "16800",
                "LK74": "18900",
                "K401": "16800",
                "K441": "16800",
                "L790": "16800",
            }
            tuition_fee = tuition_feeDict.get(item['ucascode'])
            print("tuition_fee: ", tuition_fee)
            if tuition_fee is not None:
                item['tuition_fee'] = int(''.join(tuition_fee))
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee']: ", item['tuition_fee'])
            print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])
            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
Example #25
0
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "St George's, University of London"
        item['url'] = response.url
        # Degree type
        item['degree_type'] = 1
        item['location'] = "Cranmer Terrace, London SW17 0RE"
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            programmeDegree_name = response.xpath(
                "//div[@class='inner']/h1//text()").extract()
            programmeDegree_nameStr = ''.join(programmeDegree_name).strip()
            # print("programmeDegree_nameStr: ", programmeDegree_nameStr)

            if "Foundation" not in programmeDegree_nameStr:
                degree_name = re.findall(r"\(.*\)$|\w+\s\(.*\)$|\w+$",
                                         programmeDegree_nameStr)
                degree_name_str = ''.join(degree_name).strip()
                item['degree_name'] = degree_name_str.replace("(", "").replace(
                    ")", "").replace("Hons", "").strip()
                print("item['degree_name']: ", item['degree_name'])

                programme = programmeDegree_nameStr.replace(
                    degree_name_str, "").strip()
                item['programme_en'] = programme
                print("item['programme_en']: ", item['programme_en'])

                ucascode = response.xpath(
                    "//*[contains(text(),'UCAS code')]//text()").extract()
                clear_space(ucascode)
                # print("ucascode: ", ucascode)
                if len(ucascode) > 0:
                    ucascode_re = re.findall(r"UCAS\scode\s\w{4}",
                                             ''.join(ucascode))
                    # print("ucascode_re: ", ucascode_re)
                    item['ucascode'] = ''.join(ucascode_re).replace(
                        "UCAS code", "").strip()
                # print("item['ucascode'] = ", item['ucascode'])

                other = response.xpath(
                    "//img[@alt='globe']/../..//text()").extract()
                if len(other) == 0:
                    other = response.xpath(
                        "//td[contains(text(),'Open to UK and EU students. Not currently open to ')]//text()"
                    ).extract()
                item['other'] = clear_lianxu_space(other)
                # print("item['other'] = ", item['other'])

                # start_date = response.xpath("//dt[contains(text(), 'Start date')]/following-sibling::dd[1]//text()").extract()
                # clear_space(start_date)
                # # print("start_date: ", start_date)
                # item['start_date'] = getStartDate(''.join(start_date))
                # # print("item['start_date']: ", item['start_date'])

                duration = response.xpath(
                    "//img[@alt='Calendar']/../following-sibling::td//text()"
                ).extract()
                if len(duration) == 0:
                    duration = response.xpath(
                        "//img[@alt='Calendar']/../../following-sibling::td//text()"
                    ).extract()
                clear_space(duration)
                # print("duration: ", ''.join(duration))

                duration_list = getIntDuration(''.join(duration))
                if len(duration_list) == 2:
                    item['duration'] = duration_list[0]
                    item['duration_per'] = duration_list[-1]
                # print("item['duration'] = ", item['duration'])
                # print("item['duration_per'] = ", item['duration_per'])

                # //p[contains(text(),'Non-UK/EU (International) application deadline')]
                deadline = response.xpath(
                    "//*[contains(text(),'Application deadline')]//text()|//*[contains(text(),'UCAS deadline')]//text()"
                ).extract()
                clear_space(deadline)
                # print("deadline: ", deadline)
                item['deadline'] = getStartDate(''.join(deadline).replace(
                    "Application deadline",
                    "").replace("is", "").replace("UCAS deadline",
                                                  "").replace(":", "").strip())
                if "2018" not in item['deadline'] and item[
                        'deadline'] != "" and "2019" not in item['deadline']:
                    item['deadline'] = ''.join(deadline).replace(
                        "Application deadline",
                        "").replace("is", "").replace("UCAS deadline",
                                                      "").replace(":",
                                                                  "").strip()
                # print("item['deadline']: ", item['deadline'])

                # location = response.xpath("//*[contains(text(),'Study location:')]//text()").extract()
                # item['location'] = ''.join(location).replace("Study location:", "").strip()
                # print("item['location']: ", item['location'])

                tuition_fee = response.xpath(
                    "//h3[contains(text(),'International (Non-EU) Student Fees')]/following-sibling::table//td[contains(text(),'2019/20')]/following-sibling::td[1]//text()|"
                    "//table//p[contains(text(),'2018 entry Non-EU')]//text()|"
                    "//table[2]/tbody/tr[4]/td/p[contains(text(),'2018 Non-EU')]/following-sibling::*/*[1]//text()|"
                    "//table//p[contains(text(),'2018 Non-EU')]/following-sibling::*[1]/*[1]//text()"
                ).extract()
                clear_space(tuition_fee)
                # print("tuition_fee: ", ''.join(tuition_fee))
                tuition_fee_re = re.findall(r"\d+,\d+", ''.join(tuition_fee))
                if len(tuition_fee_re) > 0:
                    item['tuition_fee'] = getTuition_fee(
                        ''.join(tuition_fee_re))
                # print("item['tuition_fee']: ", item['tuition_fee'])

                overview_en = response.xpath(
                    "//p[@class='first']|//table[1]/following-sibling::*[position()<last()-1]"
                ).extract()
                item['overview_en'] = remove_class(
                    clear_lianxu_space(overview_en)).replace(
                        "<p><button>Make an enquiry</button></p>", "").strip()
                # print("item['overview_en']: ", item['overview_en'])

                # default so the IELTS parsing below still works when no Entry link is found
                item['ielts_desc'] = ""
                entry_url = response.xpath(
                    "//a[contains(text(),'Entry')]/@href").extract()
                # print("entry_url: ", entry_url)
                if len(entry_url) != 0:
                    parse_entry_url = "https://www.sgul.ac.uk" + entry_url[0]
                    # print("parse_entry_url: ", parse_entry_url)
                    entry_dict = self.parse_rntry_requirements(parse_entry_url)
                    # print(entry_dict)
                    # item['rntry_requirements'] = entry_dict.get('rntry_requirements')

                    item['ielts_desc'] = entry_dict.get('ielts_desc')
                    item['alevel'] = entry_dict.get('alevel')
                    item['ib'] = entry_dict.get('ib')
                # print("item['ielts_desc']: ", item['ielts_desc'])
                # print("item['alevel']: ", item['alevel'])
                # print("item['ib']: ", item['ib'])

                ielts_list = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
                if len(ielts_list) == 1:
                    item['ielts'] = ielts_list[0]
                    item['ielts_l'] = ielts_list[0]
                    item['ielts_s'] = ielts_list[0]
                    item['ielts_r'] = ielts_list[0]
                    item['ielts_w'] = ielts_list[0]
                elif len(ielts_list) == 2:
                    item['ielts'] = ielts_list[0]
                    item['ielts_l'] = ielts_list[1]
                    item['ielts_s'] = ielts_list[1]
                    item['ielts_r'] = ielts_list[1]
                    item['ielts_w'] = ielts_list[1]
                elif len(ielts_list) == 5:
                    item['ielts'] = ielts_list[0]
                    item['ielts_l'] = ielts_list[1]
                    item['ielts_s'] = ielts_list[4]
                    item['ielts_r'] = ielts_list[2]
                    item['ielts_w'] = ielts_list[3]
                # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

                modules_url = response.xpath(
                    "//a[contains(text(),'Module')]/@href").extract()
                # print("modules_url: ", modules_url)
                if len(modules_url) != 0:
                    parse_modules_url = "https://www.sgul.ac.uk" + modules_url[
                        0]
                    # print("parse_modules_url: ", parse_modules_url)
                    item['modules_en'] = remove_class(
                        clear_lianxu_space(
                            self.parse_modules(parse_modules_url))).strip()
                # print("item['modules_en']: ", item['modules_en'])

                assessment_en_url = response.xpath(
                    "//a[contains(text(),'Studying')]/@href").extract()
                # print("assessment_en_url: ", assessment_en_url)
                if len(assessment_en_url) != 0:
                    parse_assessment_en_url = "https://www.sgul.ac.uk" + assessment_en_url[
                        0]
                    # print("parse_assessment_en_url: ", parse_assessment_en_url)
                    item['assessment_en'] = remove_class(
                        clear_lianxu_space(
                            self.parse_assessment_en(
                                parse_assessment_en_url))).strip()
                # print("item['assessment_en']: ", item['assessment_en'])

                career_en_url = response.xpath(
                    "//a[contains(text(),'Career')]/@href").extract()
                # print("career_en_url: ", career_en_url)
                if len(career_en_url) != 0:
                    parse_career_en_url = "https://www.sgul.ac.uk" + career_en_url[
                        0]
                    # print("parse_career_en_url: ", parse_career_en_url)
                    item['career_en'] = remove_class(
                        clear_lianxu_space(
                            self.parse_career_en(
                                parse_career_en_url))).replace(
                                    "<p><img></p>", "").strip()
                # print("item['career_en']: ", item['career_en'])

                apply_proces_en_url = response.xpath(
                    "//a[contains(text(),'Apply')]/@href|//a[contains(text(),'Application and interview')]/@href"
                ).extract()
                # print("apply_proces_en_url: ", apply_proces_en_url)
                if len(apply_proces_en_url) != 0:
                    parse_apply_proces_en_url = "https://www.sgul.ac.uk" + apply_proces_en_url[
                        0]
                    # print("parse_apply_proces_en_url: ", parse_apply_proces_en_url)
                    item['apply_proces_en'] = remove_class(
                        clear_lianxu_space(
                            self.parse_apply_proces_en(
                                parse_apply_proces_en_url))).replace(
                                    "<p><img></p>", "").strip()
                # print("item['apply_proces_en']: ", item['apply_proces_en'])

                yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
Example #26
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Bristol"
        # items['country'] = "England"
        # items["website"] = "https://www.bristol.ac.uk/"
        item['url'] = response.url
        # Degree type
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        try:
            # Programme name
            course = response.xpath(
                "//span[@property='programname']//text()").extract()
            # print("course = ", course)
            item['programme_en'] = ''.join(course).replace("\n", " ").replace(
                "\r", " ").strip()
            print("item['programme_en']: ", item['programme_en'])

            # degreeaward
            degreeaward = response.xpath(
                "//span[@property='award']//text()").extract()
            # print("degreeaward = ", degreeaward)
            item['degree_name'] = clear_space_str(''.join(degreeaward))
            print("item['degree_name']: ", item['degree_name'])

            ucascode = response.xpath(
                "//th[contains(text(),'UCAS code')]/../td//text()").extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).strip()
            # print("item['ucascode']: ", item['ucascode'])
            if item['ucascode'] == "":
                ucascode = response.xpath(
                    "//th[contains(text(),'Application method')]/following-sibling::td//text()"
                ).extract()
                clear_space(ucascode)
                # print("ucascode1: ", ucascode)
                item['ucascode'] = ''.join(ucascode).replace(
                    "Entry by transfer from", "").replace(
                        "Entry by transfer after two years from",
                        "").replace("Entry by transfer after year one of",
                                    "").replace("at the end of year one",
                                                "").strip()
            # print("item['ucascode']1: ", item['ucascode'])

            # duration
            duration = response.xpath(
                "//th[contains(text(),'Course duration')]/../td//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)

            # duration_list = getIntDuration(''.join(duration))
            if len(duration) == 2:
                item['duration'] = int(duration[0])
                if 'y' in ''.join(duration).lower():
                    item['duration_per'] = 1
            # print("item['duration']: ", item['duration'])
            # print("item['duration_per']: ", item['duration_per'])

            # location
            location = response.xpath(
                "//th[contains(text(),'Location of course')]/../td//text()"
            ).extract()
            item['location'] = clear_space_str(''.join(location))
            # print("item['location']: ", item['location'])

            # startdate
            startdate = response.xpath(
                "//p[@class='year-of-entry']/text()").extract()
            clear_space(startdate)
            # print("startdate = ", startdate)
            if len(startdate) > 0:
                item['start_date'] = ''.join(startdate).replace("entry",
                                                                "").strip()
            # print("item['start_date'] = ", item['start_date'])

            tuitionFee = response.xpath(
                "//li[contains(text(),'International students: £')]//text()"
            ).extract()
            # print("tuitionFee = ", tuitionFee)
            if len(tuitionFee) > 0:
                item['tuition_fee_pre'] = "£"
                item['tuition_fee'] = getTuition_fee(''.join(tuitionFee))
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])
            # print("item['tuition_fee']: ", item['tuition_fee'])

            # deadline
            # deadline = response.xpath("//div[@id='apply']/div[@class='apply-deadline']/p[1]//text()").extract()
            # # print("deadline = ", deadline)
            # item['deadline'] = getStartDate(''.join(deadline))
            # # print("item['deadline']: ", item['deadline'])

            # department
            department = response.xpath(
                "//div[@id='contact']/p[@class='pg-contact-address']/text()"
            ).extract()
            clear_space(department)
            print("department1 = ", department)
            for d in department:
                if "School" in d or "Faculty" in d:
                    item['department'] = d
            # print("item['department']: ", item['department'])
            if item['department'] == "":
                allcontent = response.xpath(
                    "//main[@class='content']//text()").extract()
                clear_space(allcontent)
                department_re = re.findall(r"School\sof.{1,30}",
                                           ''.join(allcontent), re.I)
                # print("department_re: ", department_re)
                if len(department_re) > 0:
                    item['department'] = department_re[0].strip()
            # print("item['department']1: ", item['department'])

            overview = response.xpath(
                "//div[@id='course-description']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en']: ", item['overview_en'])

            assessment_en = response.xpath(
                "//div[@id='teaching']|//div[@id='assessment']").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            print("item['assessment_en']: ", item['assessment_en'])

            alevel = response.xpath(
                "//div[@id='typical-offer']//table//tr/th[contains(text(), 'A-level')]/../td//text()"
            ).extract()
            clear_space(alevel)
            # print(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[0]).strip()
            # print("item['alevel'] = ", item['alevel'])

            # print(len("36 points overall with 18 at Higher Level, including 6, 5 at Higher Level in two of the following subjects: Biology, Chemistry, Physics, Mathematics, Psychology"))
            if len(item['alevel']) > 160:
                item['alevel'] = ''.join(item['alevel'][:161])
            # print("item['alevel']1 = ", item['alevel'])

            ib = response.xpath(
                "//div[@id='typical-offer']//table//tr/th[contains(text(), 'International Baccalaureate ')]/../td//text()"
            ).extract()
            clear_space(ib)
            if len(ib) > 0:
                item['ib'] = ''.join(ib[0]).strip()

            if len(item['ib']) > 160:
                item['ib'] = ''.join(item['ib'][:161])
            # print("item['ib'] = ", item['ib'])

            # Course structure
            modulesUrl = response.xpath(
                "//div[@id='course-structure']//div[@class='collapsible']//a/@href"
            ).extract()
            # print("modulesUrl: ", modulesUrl)
            modulesUrl = ''.join(modulesUrl).strip()
            if len(modulesUrl) != 0:
                item['modules_en'] = self.parse_modules_en(modulesUrl)[0]
                # print("item['modules_en']: ", item['modules_en'])
                u = self.parse_modules_en(modulesUrl)[1]
                # print(u)
                while len(u) != 0:
                    u1 = "https://www.bris.ac.uk" + ''.join(u)
                    # print("u1=", u1)
                    item['modules_en'] += self.parse_modules_en(u1)[0]
                    u = self.parse_modules_en(u1)[1]
            # print("item['modules_en']1: ", item['modules_en'])

            # Academic entry requirements (incl. subject-specific UG requirements) and IELTS
            entryRequirements = response.xpath(
                "//div[@id='typical-offer']//text()").extract()
            # item['rntry_requirements'] = clear_lianxu_space(entryRequirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts = response.xpath(
                "//*[contains(text(),'Profile A')]//text()|//*[contains(text(),'Profile B')]//text()|"
                "//*[contains(text(),'Profile C')]//text()|//*[contains(text(),'Profile D')]//text()|"
                "//*[contains(text(),'Profile E')]//text()|//*[contains(text(),'Profile F')]//text()"
            ).extract()
            item['ielts_desc'] = clear_lianxu_space(ielts)
            # print("item['ielts_desc']: ", item['ielts_desc'])

            if item['ielts_desc'] == "Profile A":
                item['ielts'] = 7.5
                item['ielts_l'] = 7.0
                item['ielts_s'] = 7.0
                item['ielts_r'] = 7.0
                item['ielts_w'] = 7.0
                item['toefl'] = 109
                item['toefl_l'] = 25
                item['toefl_r'] = 25
                item['toefl_s'] = 25
                item['toefl_w'] = 29
            elif item['ielts_desc'] == "Profile B":
                item['ielts'] = 7.0
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.5
                item['ielts_w'] = 7.0
                item['toefl'] = 100
                item['toefl_l'] = 24
                item['toefl_r'] = 24
                item['toefl_s'] = 24
                item['toefl_w'] = 27
            elif item['ielts_desc'] == "Profile C":
                item['ielts'] = 6.5
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.5
                item['ielts_w'] = 6.5
                item['toefl'] = 92
                item['toefl_l'] = 23
                item['toefl_r'] = 23
                item['toefl_s'] = 23
                item['toefl_w'] = 24
            elif item['ielts_desc'] == "Profile D":
                item['ielts'] = 6.5
                item['ielts_l'] = 6.0
                item['ielts_s'] = 6.0
                item['ielts_r'] = 7.0
                item['ielts_w'] = 7.0
                item['toefl'] = 92
                item['toefl_l'] = 21
                item['toefl_r'] = 21
                item['toefl_s'] = 21
                item['toefl_w'] = 27
            elif item['ielts_desc'] == "Profile E":
                item['ielts'] = 6.5
                item['ielts_l'] = 6.0
                item['ielts_s'] = 6.0
                item['ielts_r'] = 6.0
                item['ielts_w'] = 6.0
                item['toefl'] = 90
                item['toefl_l'] = 20
                item['toefl_r'] = 20
                item['toefl_s'] = 20
                item['toefl_w'] = 20
            elif item['ielts_desc'] == "Profile F":
                item['ielts'] = 6.0
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.0
                item['ielts_w'] = 6.0
                item['toefl'] = 86
                item['toefl_l'] = 20
                item['toefl_r'] = 20
                item['toefl_s'] = 20
                item['toefl_w'] = 23
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #       item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            # assessment_en, career_en
            assessCareerUrlSplit = response.url.rsplit('/')
            assessCareerUrl = response.url.replace(
                assessCareerUrlSplit[-2] + "/", "").strip()
            print(assessCareerUrl)
            assessCareerDict = self.parseAssessCareer(assessCareerUrl)

            item['assessment_en'] = assessCareerDict.get(
                'assessment_en').strip()
            print("item['assessment_en']: ", item['assessment_en'])

            item['career_en'] = assessCareerDict.get('career_en').strip()
            print("item['career_en']: ", item['career_en'])

            # Application requirements
            apply_desc_en = response.xpath(
                "//div[@id='typical-offer']").extract()
            item['apply_desc_en'] = remove_class(
                clear_lianxu_space(apply_desc_en))
            # print("item['apply_desc_en']: ", item['apply_desc_en'])

            require_chinese_en = """<h2 id="ugentryreqs">Entry requirements for undergraduate courses</h2>
<p>You can apply for undergraduate programmes either through the&nbsp;<a href="http://www.ucas.com/">Universities and Colleges Admissions Service</a>&nbsp;(UCAS) or&nbsp;<a href="http://www.commonapp.org/">The Common Application.</a>&nbsp;Please use only&nbsp;<strong>one</strong>&nbsp;method of applying. If you are using UCAS to apply for other UK universities, please also make your University of Bristol application through UCAS and do not use the Common Application.The UCAS code name and number for this University is BRISL B78.</p>
<p>Individual course entry requirements&nbsp;are listed in our <a href="http://www.bris.ac.uk/study/undergraduate/">Undergraduate Prospectus</a>&nbsp;for each course.</p>
<ul>
<li>Applicants with the Gaozhong Biye Zhengshu (Senior High School Certificate) and Gaokao&nbsp;(Chinese University entrance exam) combined with a successfully completed appropriate <a href="http://www.bris.ac.uk/english-language/study/ifp/" target="_blank">Foundation programme</a> will be considered for admission to our Bachelor's degree courses.</li>
<li>Applicants who have successfully completed the first year of a Chinese University degree at a prestigious university will be considered for admission to the first year of our Bachelor's degree courses.</li>
<li>Applicants will be required to meet the English language requirements for the programme. The profile level requirements can be found on the&nbsp;<a href="http://www.bristol.ac.uk/study/language-requirements/">English language requirements for study</a>&nbsp;page.</li>
</ul>"""
            item["require_chinese_en"] = remove_class(require_chinese_en)
            # print("item['require_chinese_en']: ", item['require_chinese_en'])

            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<h2>How to apply</h2><p>You can apply through Universities and Colleges Admissions Service (UCAS) or the Common Application (Common App). For Engineering Design, Medicine, Dentistry or Veterinary Science courses, you must apply using UCAS.</p> </div> <!-- end: content - how to apply --> <!-- start: drop down - application options --> <div class="main-col-child">  <div class="dropdown"> <h3 class="dropdown-heading">Applying through UCAS</h3> <div class="dropdown-content"> <p>You can apply for a maximum of five courses using the UCAS form. Apply for medicine, dentistry and veterinary courses through UCAS by 15 October. You can only use four of your five UCAS choices to apply to these courses.</p> <p><a class="btn icon-arrow-right" data-tracking-click-url="http://uk.sitestat.com/bristol/bristol-ext/s?study.undergraduate.apply.international.index_html.international-ucas&amp;ns_type=clickout&amp;ns_url=https://www.ucas.com/ucas/undergraduate/getting-started/ucas-undergraduate-international-and-eu-students" href="https://www.ucas.com/ucas/undergraduate/getting-started/ucas-undergraduate-international-and-eu-students">Apply online through UCAS</a><br />Our UCAS institution code is <strong>BRISL B78</strong>.</p> <p>After you have applied, UCAS will give you a ten-digit personal ID number. You will need this if you contact the University about your application.</p> <h4>Entering your qualifications</h4> <p>Before you submit your UCAS application, make sure you have included:</p> <ul> <li><strong>Full details of qualifications you have already taken</strong>: include grades/marks for the academic qualifications you've achieved from age 16 (GCSE or equivalent), and any English language qualifications.</li> <li><strong>Full details of the qualifications you are taking:</strong> include current studies (name and expected date of examination and major subjects), English language qualifications, and any resits of previous qualifications you expect to take.</li> </ul> <p>If your qualification offers different levels of study, state which subjects you are studying at the higher level, and which at the standard level.</p> <p>Watch the <a href="https://www.ucas.com/connect/videos?v=/apply-education-page">UCAS how-to guide on entering qualifications</a>.</p> <h4>When to apply</h4> <p>Find the <a href="https://www.ucas.com/ucas/undergraduate/apply-track/when-apply"><span>application deadlines on the UCAS website</span></a>.</p> </div>   <h3 class="dropdown-heading" >Applying through the Common App</h3> <div class="dropdown-content"> <p>You can use Common App to apply for any full-time undergraduate course at Bristol, except Engineering Design, Medicine, Dentistry or Veterinary Science courses. The deadline for applying through Common App is 30 June 2018.</p> <p><a class="btn icon-arrow-right" data-tracking-click-url="http://uk.sitestat.com/bristol/bristol-ext/s?study.undergraduate.apply.international.index_html.international-common&amp;ns_type=clickout&amp;ns_url=https://www.commonapp.org/" href="https://www.commonapp.org/">Apply online through the Common App</a></p> <p>After you have applied, you will be given an application number. You will need this if you contact the University about your application.</p> </div>   <h3 class="dropdown-heading" >Applying for direct entry courses</h3> <div class="dropdown-content"> <p>These are our direct entry courses. 
Please apply using these links and not through UCAS:</p> <ul> <li><a href="/dental/courses/dcp/hygiene/apply/">Diploma in Dental Hygiene</a></li> <li><a href="http://www.bristol.ac.uk/english-language/study/ifp/apply/">International Foundation Programme</a></li> <li><a href="/arts/study/foundation/apply/">Foundation in Arts and Humanities</a></li></ul> """
                ]))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])
            yield item
        except Exception as e:
            print("异常:", str(e))
            print("报错链接:", response.url)
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")
Example #27
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Bolton"
        item['url'] = response.url
        # Degree type
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        print("subjectArea===: ", response.meta['subjectArea'])
        try:
            programmeDegreetype = response.xpath(
                "//div[@class='wpb_text_column wpb_content_element  vc_custom_1506499626241']/div[@class='wpb_wrapper']/h2//text()"
            ).extract()
            # print("programmeDegreetype: ", programmeDegreetype)
            programmeDegreetypeStr = ''.join(programmeDegreetype).strip()

            degree_type = response.xpath(
                "//li[@class='iconim award']//b[contains(text(),'Award:')]/..//text()"
            ).extract()
            # print("degree_type: ", degree_type)
            item['degree_name'] = ''.join(degree_type).replace(
                "Award:", "").replace("(Hons)", "").strip()
            # if item['degree_name'] == "":
            #     item['degree_name'] = "**"
            print("item['degree_name']: ", item['degree_name'])

            # if item['degree_name'].lower() == "phd":
            #     item['teach_type'] = 'phd'
            #     item['degree_type'] = 3
            # print("item['teach_type']: ", item['teach_type'])
            # print("item['degree_type']: ", item['degree_type'])

            programme = programmeDegreetypeStr.replace(
                item['degree_name'], '').replace("(Hons)",
                                                 "").replace("()", "").strip()
            item['programme_en'] = programme
            print("item['programme_en']: ", item['programme_en'])

            start_date = response.xpath(
                "//li[@class='iconim date']//b[contains(text(),'Start date:')]/..//text()"
            ).extract()
            clear_space(start_date)
            # print("start_date: ", start_date)
            start_date_str = ''.join(start_date).replace("Start date:",
                                                         "").strip()
            # print("start_date_str: ", start_date_str)
            start_date_re = re.findall(r"\d+/\d+/\d+", start_date_str)
            # print("start_date_re: ", start_date_re)
            if len(start_date_re) > 0:
                for s in start_date_re:
                    start_date_sp = s.split('/')
                    item['start_date'] += start_date_sp[
                        -1] + "-" + start_date_sp[1] + "-" + start_date_sp[
                            0] + ", "
            if item['start_date'] != None:
                item['start_date'] = item['start_date'].strip().rstrip(
                    ',').strip()
            # print("item['start_date']: ", item['start_date'])

            location = response.xpath(
                "//li[@class='iconim location']//b[contains(text(),'Location:')]/..//text()"
            ).extract()
            item['location'] = ''.join(location).replace("Location:",
                                                         "").strip()
            # print("item['location']: ", item['location'])

            duration = response.xpath(
                "//li[@class='iconim duration']//b[contains(text(),'Duration:')]/..//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            overview_en = response.xpath(
                "//div[@id='course-details']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))
            # print("item['overview_en']: ", item['overview_en'])

            # //div[@id='course-detail']
            entry_requirements = response.xpath(
                "//div[@id='entry-requirements']//text()").extract()
            # item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts_desc = response.xpath(
                "//div[@id='entry-requirements']//*[contains(text(),'IELTS')]/text()"
            ).extract()
            clear_space(ielts_desc)
            # print("ielts_desc: ", ielts_desc)
            # ielts_desc_re = re.findall(r'.{1,50}IELTS.{1,50}', ''.join(ielts_desc))
            # print("ielts_desc_re: ", ielts_desc_re)
            # if len(ielts_desc) > 0:
            item['ielts_desc'] = ''.join(ielts_desc).strip()
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            career_en = response.xpath(
                "//div[@id='careers-employment']").extract()
            item['career_en'] = remove_class(
                clear_lianxu_space(career_en)).replace("<div></div>",
                                                       "").strip()
            # print("item['career_en']: ", item['career_en'])

            how_to_apply = response.xpath(
                "//div[@id='how-to-apply']").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(how_to_apply))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            modules = response.xpath(
                "//div[@class='tab_content modules_tab_content tab__teaching-assessment__modules']"
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            assessment_en = response.xpath(
                "//div[@class='tab_content modules_tab_content tab__teaching-assessment__teaching-methods']"
                "|//div[@class='tab_content modules_tab_content tab__teaching-assessment__assessment-methods']"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            tuition_fee = response.xpath(
                "//h3[@class='table_header'][contains(text(),'International fees')]/following-sibling::div[1]/table//tr/th[contains(text(),'2018/')][1]/following-sibling::td[1]//text()"
            ).extract()
            print("tuition_fee: ", tuition_fee)
            if len(tuition_fee) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            department_dict = {
                "Art & Design and Fine Art":
                "Bolton School of the Arts",
                "Textiles & Fashion":
                "Bolton School of the Arts",
                "Media & Photography":
                "Bolton School of the Arts",
                "Theatre & Performance":
                "Bolton School of the Arts",
                "English & Creative Writing":
                "Bolton School of the Arts",
                "Graphic Design":
                "Bolton School of the Arts",
                "Animation & Illustration":
                "Bolton School of the Arts",
                "Accountancy":
                "Institute of Management Greater Manchester",
                "Business, Retail, Logistics & Supply Chain Management":
                "Institute of Management Greater Manchester",
                "Nursing":
                "Faculty of Health & Wellbeing",
                "Health & Social Care":
                "Faculty of Health & Wellbeing",
                "Dental Sciences":
                "Faculty of Health & Wellbeing",
                "Early Years & Childhood Studies":
                "Faculty of Health & Wellbeing",
                "Community Work & Youth":
                "Faculty of Health & Wellbeing",
                "School of Sport & Biological Sciences":
                "Faculty of Health & Wellbeing",
                "Automotive Design":
                "National Centre for Motorsport Engineering",
                "Chassis Dynamics & Aerodynamics":
                "National Centre for Motorsport Engineering",
                "General Engineering":
                "National Centre for Motorsport Engineering",
                "Motorsport & Trackside Technology":
                "National Centre for Motorsport Engineering",
                "Engines & Performance Modelling":
                "National Centre for Motorsport Engineering",
                "Our Partners":
                "National Centre for Motorsport Engineering",
                "Computing":
                "School of Creative Technologies",
                "Games":
                "School of Creative Technologies",
                "Special & Visual Effects":
                "School of Creative Technologies",
                "Education & Teacher Training":
                "School of Education & Psychology",
                "Psychology":
                "School of Education & Psychology",
                "Access courses":
                "School of Education & Psychology",
                "International Foundation programmes & English Pre-Sessional courses":
                "School of Education & Psychology",
                "Construction":
                "School of Engineering",
                "Civil Engineering":
                "School of Engineering",
                "Mechanical Engineering":
                "School of Engineering",
                "Motorsport & Automotive Performance Engineering":
                "School of Engineering",
                "Biomedical & Medical Engineering":
                "School of Engineering",
                "Electrical & Electronic Engineering":
                "School of Engineering",
                "Mathematics":
                "School of Engineering",
                "Law":
                "School of Law",
                "Centre for Contemporary Coronial Law":
                "School of Law",
                "Medical Biology":
                "School of Sport & Biological Sciences",
                "Sports & Sport Rehabilitation":
                "School of Sport & Biological Sciences",
            }
            item['department'] = department_dict.get(
                response.meta['subjectArea'])
            print("item['department']: ", item['department'])

            alevel = response.xpath(
                "//li[@class='iconim points']//b[contains(text(),'UCAS points:')]/../span//text()"
            ).extract()
            item['alevel'] = clear_lianxu_space(alevel)
            print("item['alevel']: ", item['alevel'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<p><strong>Undergraduate entry to year 1 </strong></p>
<p>The above qualifications and completion of a suitable foundation programme.</p>
<p>Alternatively, successful completion of one year at a recognised Chinese university in a relevant subject.</p>
<p><strong>Undergraduate entry to year 2 / 3</strong></p>
<p>2 Year Diploma in a suitable subject area.</p>
<p>University College Graduation Diploma or Graduation Diploma from recognised institutions.</p>
<p>EDEXCEL or SQA HND</p>
<p>Da Zhuan (3 Year Diploma)</p>
<p>(Year 2 &amp; 3 entry is subject to successful programme mapping)</p>"""
                ]))

            ucascode = response.xpath(
                "//li[@class='iconim code']//b[contains(text(),'UCAS code:')]/..//text()"
            ).extract()
            clear_space(ucascode)
            print("ucascode: ", ucascode)
            item['ucascode'] = ''.join(ucascode).replace("UCAS code:",
                                                         "").strip()
            print("item['ucascode'] = ", item['ucascode'])

            mode = response.xpath(
                "//b[contains(text(),'Course type:')]/..//text()").extract()
            clear_space(mode)
            teach_time = ''.join(mode)
            print("teach_time: ", teach_time)

            isup = response.xpath(
                "//a[contains(text(),'Click here for more information on')]//text()"
            ).extract()
            # print("isup: ", isup)
            isup_str = ''.join(isup)
            if len(isup) == 0:
                isup = response.xpath(
                    "//li[@class='iconim code']//b[contains(text(),'UCAS code:')]/..//text()"
                    "|//li[@class='iconim points']//b[contains(text(),'UCAS points:')]/..//text()"
                ).extract()
            print("isup_str: ", isup_str)
            print("isup: ", isup)
            if "full" in teach_time.lower():
                if "https://courses.bolton.ac.uk/course" in item['url']:
                    if "undergraduate" in isup_str or len(
                            item['ucascode']) != 0:
                        print("******存到数据库*****")
                        yield item

        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.lincoln.ac.uk/"
        item['university'] = "University of Lincoln"
        item['url'] = response.url
        # Degree type
        item['degree_type'] = 1
        item['location'] = 'University of Lincoln, Brayford Pool, Lincoln, LN6 7TS'
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            if "Foundation" not in item['major_type1']:
                # //table[@id='newTitle']/tbody[@id='newTitleBody']/tr/td/h1[1]/a
                programmeDegreetype = response.xpath("//div[@id='CourseTitleApms']/h1//text()").extract()
                clear_space(programmeDegreetype)
                # print("programmeDegreetype: ", programmeDegreetype)
                if len(programmeDegreetype) > 0:
                    programmeDegreetypeStr = programmeDegreetype[0].strip()

                degree_type = re.findall(r"^\w+\s\(Hons\)|^\(\w+\)|^\w+", programmeDegreetypeStr)
                # print("degree_type: ", degree_type)
                degree_type_str = ''.join(degree_type).strip()
                item['degree_name'] = ''.join(degree_type).replace("(Hons)", "").replace("(", "").replace(")", "").strip()
                print("item['degree_name']: ", item['degree_name'])

                item['programme_en'] = programmeDegreetypeStr.replace(degree_type_str, '').strip()
                print("item['programme_en']: ", item['programme_en'])

                ucascode = response.xpath("//div[@class='nd_2019-20']//span[@class='blue'][contains(text(),'UCAS Code:')]/..//text()").extract()
                if len(ucascode) == 0:
                    ucascode = response.xpath("//span[@class='blue'][contains(text(),'UCAS Code:')]/..//text()").extract()
                clear_space(ucascode)
                # print("ucascode: ", ucascode)
                item['ucascode'] = ''.join(ucascode).replace("UCAS Code:", "").strip()
                # print("item['ucascode'] = ", item['ucascode'])

                # //span[@id='durationFT']
                duration = response.xpath("//div[@class='nd_2019-20']//span[contains(text(),'Full-time Duration')]/..//text()").extract()
                if len(duration) == 0:
                    duration = response.xpath("//span[contains(text(),'Full-time Duration')]/..//text()").extract()
                clear_space(duration)
                # print("duration: ", duration)
                duration_str = ''.join(duration)

                duration_list = getIntDuration(duration_str)
                if len(duration_list) == 2:
                    item['duration'] = duration_list[0]
                    item['duration_per'] = duration_list[-1]
                # print("item['duration'] = ", item['duration'])
                # print("item['duration_per'] = ", item['duration_per'])

                department = response.xpath("//span[contains(text(),'School:')]/following-sibling::a//text()").extract()
                clear_space(department)
                if len(department) > 0:
                    item['department'] = department[0]
                # print("item['department']: ", item['department'])

                dep_dict = {"lincoln school of architecture and the built environment": "College of Arts",
    "lincoln school of design": "College of Arts",
    "lincoln school of film and media": "College of Arts",
    "school of english and journalism": "College of Arts",
    "school of fine and performing arts": "College of Arts",
    "school of history and heritage": "College of Arts",
    "school of chemistry": "College of Science",
    "school of computer science": "College of Science",
    "school of engineering": "College of Science",
    "school of geography": "College of Science",
    "school of life sciences": "College of Science",
    "school of mathematics and physics": "College of Science",
    "school of pharmacy": "College of Science",
    "national centre for food manufacturing": "College of Science",
    "lincoln institute for agri-tech": "College of Science",
    "school of education": "College of Social Science",
    "school of health and social care": "College of Social Science",
    "professional development centre": "College of Social Science",
    "lincoln law school": "College of Social Science",
    "school of psychology": "College of Social Science",
    "school of social and political sciences": "College of Social Science",
    "school of sport and exercise science": "College of Social Science",}
                if item['department'] != "Lincoln Business School":
                    item['department'] = dep_dict.get(item['department'].lower())
                # print("item['department']1: ", item['department'])

                if item['department'] == None:
                    item['department'] = ''.join(response.xpath("//div[@class='breadcrumb-list']//span//a[@href='/home/collegeofsocialscience/']//text()").extract()).strip()
                # print("item['department']2: ", item['department'])

                # //div[@id='feesTables']/table
                fee = response.xpath("//div[@class='nd_2019-20']//div[@class='panel-body']//table[2]//td[contains(text(),'Full-time')]/following-sibling::*[last()]//text()").extract()
                if len(fee) == 0:
                    fee = response.xpath(
                        "//div[@class='panel-body']//table[2]//td[contains(text(),'Full-time')]/following-sibling::*[last()]//text()").extract()
                clear_space(fee)
                # print("fee: ", fee)
                feeStr = ''.join(fee)
                tuitionfee = getTuition_fee(feeStr)
                item['tuition_fee'] = tuitionfee
                if item['tuition_fee'] == 0:
                    item['tuition_fee'] = None
                # print("item['tuition_fee']: ", item['tuition_fee'])

                # //h2[contains(text(),'The Course')]/..
                overview = response.xpath("//h2[contains(text(),'The Course')]/..").extract()
                # print("overview: ", overview)
                if len(overview) > 0:
                    item['overview_en'] = remove_class(clear_lianxu_space([overview[-1]]))
                # print("item['overview_en']: ", item['overview_en'])

                modules_en = response.xpath("//a[contains(text(),'Modules')]/../../..").extract()
                modules_en = response.xpath(
                    "//div[@id='collapse62019-20']//div[@class='tab-content clearfix']").extract()

                if len(modules_en) > 0:
                    item['modules_en'] = remove_class(clear_lianxu_space([modules_en[-1]]))
                if item['modules_en'] == "":
                    item['modules_en'] = None
                    print("*** modules_en")
                else:
                    print("===", item['modules_en'])
                    del_cont = re.findall(r"<br>Find out more</p><div><span>.*?</em></span>", item['modules_en'])
                    print("del_cont==", del_cont)
                    if len(del_cont) > 0:
                        for delc in del_cont:
                            item['modules_en'] = item['modules_en'].replace(delc, '<div>').strip()
                print("item['modules_en']: ", item['modules_en'])

                assessment_en = response.xpath(
                    "//a[contains(text(),'How You Are Assessed')]/../../..|//a[contains(text(),'How you are assessed')]/../../..").extract()
                if len(assessment_en) > 0:
                    item['assessment_en'] = remove_class(clear_lianxu_space([assessment_en[-1]]))
                # print("item['assessment_en']: ", item['assessment_en'])

                interview_desc_en = response.xpath(
                    "//a[contains(text(),'Interviews & Applicant Days')]/../../..").extract()
                if len(interview_desc_en) > 0:
                    item['interview_desc_en'] = remove_class(clear_lianxu_space([interview_desc_en[-1]]))
                # print("item['interview_desc_en']: ", item['interview_desc_en'])

                alevel = response.xpath(
                    "//*[contains(text(),'GCE Advanced Levels')]/text()|//*[contains(text(),'A Level')]/text()").extract()
                if len(alevel) > 0:
                    item['alevel'] = clear_lianxu_space([alevel[-1]])
                print("item['alevel']: ", item['alevel'])

                ib = response.xpath(
                    "//p[contains(text(),'International Baccalaureate')]").extract()
                if len(ib) > 0:
                    item['ib'] = remove_tags(clear_lianxu_space([ib[-1]]))
                # print("item['ib']: ", item['ib'])

                rntry_requirements = response.xpath(
                    "//a[contains(text(),'Entry Requirements')]/../../..|//a[contains(text(),'Entry requirements')]/../../..").extract()
                if len(rntry_requirements) > 0:
                    rntry_requirements = remove_tags(clear_lianxu_space([rntry_requirements[-1]]))
                # print("rntry_requirements: ", rntry_requirements)

                ielts = re.findall(r"IELTS.{1,80}", rntry_requirements)
                item['ielts_desc'] = ''.join(ielts).strip()
                # print("item['ielts_desc']: ", item['ielts_desc'])

                ielts_dict = get_ielts(item['ielts_desc'])
                item['ielts'] = ielts_dict.get('IELTS')
                item['ielts_l'] = ielts_dict.get('IELTS_L')
                item['ielts_s'] = ielts_dict.get('IELTS_S')
                item['ielts_r'] = ielts_dict.get('IELTS_R')
                item['ielts_w'] = ielts_dict.get('IELTS_W')
                # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

                career = response.xpath("//div[@id='CourseCareersApms']").extract()
                item['career_en'] = remove_class(clear_lianxu_space(career))
                # print("item['career_en']: ", item['career_en'])

                # if item['ielts_desc'] == "":
                #     item['ielts_desc'] = "Prospective students require IELTS 6.0 (with no less than 5.5 in each band score) or an equivalent qualification. Please note that some courses require a higher score."
                #     item['ielts'] = 6.0
                #     item['ielts_l'] = 5.5
                #     item['ielts_s'] = 5.5
                #     item['ielts_r'] = 5.5
                #     item['ielts_w'] = 5.5
                # print("******item['ielts_desc']: ", item['ielts_desc'])
                # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

                # http://www.lincoln.ac.uk/home/studywithus/internationalstudents/englishlanguagerequirementsandsupport/englishlanguagerequirements/
                if item['ielts'] == "6.5":
                    item['toefl'] = 90
                    item['toefl_l'] = 20
                    item['toefl_s'] = 22
                    item['toefl_r'] = 21
                    item['toefl_w'] = 22
                elif item['ielts'] == "7.0":
                    item['toefl'] = 100
                    item['toefl_l'] = 22
                    item['toefl_s'] = 23
                    item['toefl_r'] = 23
                    item['toefl_w'] = 23
                # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
                #         item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

                # http://www.lincoln.ac.uk/home/studywithus/internationalstudents/entryrequirementsandyourcountry/china/
                item["require_chinese_en"] = remove_class(clear_lianxu_space(["""<div class="panel">
<div class="panel-heading">
<h4 class="panel-title">
<a data-toggle="collapse" href="#countryUndergraduateTab">
<em class="more-less glyphicon glyphicon-menu-down"></em>Undergraduate Entry
</a>
</h4>
</div>
<div id="countryUndergraduateTab" class="panel-collapse collapse">
<div class="panel-body">
<p>Prospective students require one of the following qualifications for entry into year one of an undergraduate degree:</p>
<ul>
<li>Successful completion of a Foundation programme with a minimum of 50% plus an average of 70% or above in High School. Please note that some programmes may require a higher foundation score e.g. 60%.</li>
<li>Successful completion of the first year of a Chinese degree / Diploma with an average grade of 70% or above.</li>
</ul>
<p><strong>&nbsp;</strong></p>
<p><strong>HND Students (BTEC and SQA)</strong></p>
<p>Students who have successfully completed a HND BTEC or SQA qualification may be accepted directly into year two or three of a University of Lincoln undergraduate course on a case by case basis.</p>
<p><strong>Chinese Degree / Diploma</strong></p>
<p>Students who have successfully completed the second or third year of a Chinese Degree or Diploma may be considered for direct entry into year two or three of a University of Lincoln undergraduate course on a case by case basis. For more information, please contact the International Admissions team:&nbsp;<a href="mailto:intadmissions&#64;lincoln&#46;ac&#46;uk">intadmissions&#64;lincoln&#46;ac&#46;uk</a>.</p>
<p>&nbsp;</p>	
<!-- START ADVANCED ENTRY (UNDERGRADUATE) -->
<p><strong>Advanced Entry (Undergraduate)</strong></p>
<p>Depending on your academic background and intended course of study, it may be possible to apply for advanced entry into year 2 or 3 of a University of Lincoln undergraduate course.</p>

<!-- START COUNTRY SPECIFIC ADVANCED ENTRY (UNDERGRADUATE) -->


<!-- END COUNTRY SPECIFIC ADVANCED ENTRY (UNDERGRADUATE) -->

<p id="advEntryUgEu">For more information, please contact the Student Administration Team: <a href="mailto:[email protected]">[email protected]</a>.</p>
<p id="advEntryUgInternational">For more information, please contact the International Admissions Team: <a href="mailto:[email protected]">[email protected]</a>.</p>
<!-- END ADVANCED ENTRY (UNDERGRADUATE) -->
</div>
</div>					
</div>
"""]))
                # print("item['require_chinese_en']: ", item['require_chinese_en'])

                item['apply_proces_en'] = "http://www.lincoln.ac.uk/home/studywithus/undergraduatestudy/howtoapply/"
                # print("item['apply_proces_en']: ", item['apply_proces_en'])
                yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] + str(item['degree_type']) + ".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常:", str(e))
            print("报错url:", response.url)
Example #29
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Surrey"
        item['url'] = response.url
        # Degree type
        item['degree_type'] = 1
        item[
            'location'] = '01SE01, Senate House, University of Surrey, Guildford, Surrey GU2 7XH'
        # print("item['location'] = ", item['location'])
        print("===============================")
        print(response.url)
        try:
            overview = response.xpath(
                "//h3[contains(text(),'Course facts')]/../preceding-sibling::*"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en'] = ", item['overview_en'])

            career = response.xpath(
                "//h2[contains(text(),'Careers')]/preceding-sibling::*[1]/following-sibling::*[position()<last()-3]"
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # if item['career_en'] == "":
            #     print("***career_en")
            # print("item['career_en'] = ", item['career_en'])

            modules = response.xpath(
                "//div[@class='module-list']/following-sibling::*[1]/preceding-sibling::*"
            ).extract()
            # modules1 = response.xpath("//div[@id='modules-ft']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # if item['modules_en'] == "":
            #     print("***modules_en")
            # print("item['modules_en'] = ", item['modules_en'])

            assessment_en = response.xpath(
                "//h2[contains(text(),'Teaching')]/preceding-sibling::*[1]/following-sibling::*[position()<7]|"
                "//h2[contains(text(),'Assessment')]/preceding-sibling::*[1]/following-sibling::*[position()<3]"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # if item['assessment_en'] == "":
            #     print("***assessment_en")
            # print("item['assessment_en'] = ", item['assessment_en'])

            # //a[contains(text(), 'Faculty of')]|//a[contains(text(), 'School of')]
            department = response.xpath(
                "//a[contains(text(), 'Faculty of')]//text()|"
                "//a[contains(text(), 'School of')]//text()").extract()
            item['department'] = remove_class(
                clear_lianxu_space(department)).replace(
                    "academic staff in the", "").strip()
            # if item['department'] == "":
            #     print("***department")
            # print("item['department'] = ", item['department'])

            entry_requirements = response.xpath(
                "//div[@id='entry-collapse']").extract()
            item['apply_desc_en'] = remove_class(
                clear_lianxu_space(entry_requirements))
            # print("item['apply_desc_en'] = ", item['apply_desc_en'])

            alevel = response.xpath(
                "//h3[contains(text(),'A-level')]/following-sibling::*[1]//text()"
            ).extract()
            alevel_str = ''.join(alevel).strip()
            if alevel_str == "Overall:" or alevel_str == "Overall":
                alevel = response.xpath(
                    "//h3[contains(text(),'A-level')]/following-sibling::*[position()<4]//text()"
                ).extract()
                alevel_str = ''.join(alevel).replace(
                    "Overall", "").strip().strip(":").strip()
                # print("***alevel")
            item['alevel'] = clear_space_str(alevel_str)
            # print("item['alevel'] = ", item['alevel'])

            ib = response.xpath(
                "//h3[contains(text(),'International Baccalaureate')]/following-sibling::*[1]//text()"
            ).extract()
            ib_str = ''.join(ib).strip()
            if ib_str == "Overall:":
                ib = response.xpath(
                    "//h3[contains(text(),'International Baccalaureate')]/following-sibling::*[2]//text()"
                ).extract()
                ib_str = ''.join(ib).strip()
                # print("***ib")
            item['ib'] = ib_str
            # print("item['ib'] = ", item['ib'])

            ielts_str = response.xpath(
                "//div[@id='entry-collapse']//h2[contains(text(),'English')]/following-sibling::p[position()<4]//text()"
            ).extract()
            ielts_re = re.findall(r"^IELTS.{1,80}", ''.join(ielts_str))
            item['ielts_desc'] = ''.join(ielts_re).strip()
            # print("item['ielts_desc'] = ", item['ielts_desc'])

            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['IELTS'] = %sitem['IELTS_L'] = %sitem['IELTS_S'] = %sitem['IELTS_R'] = %sitem['IELTS_W'] = %s==" % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            application_open_date = response.xpath(
                "//div[@class='p-3 p-xl-4 text-center text-light']//text()"
            ).extract()
            clear_space(application_open_date)
            # print("application_open_date: ", ''.join(application_open_date))
            item['application_open_date'] = getStartDate(
                ''.join(application_open_date))
            # if item['application_open_date'] == "":
            #     print("***application_open_date")
            # print("item['application_open_date'] = ", item['application_open_date'])

            tuition_fee = response.xpath(
                "//div[@id='fees']//tbody//tr[1]/td[last()-1]//text()"
            ).extract()
            # print("tuition_fee: ", tuition_fee)
            # getTuition_fee() appears to return 0 when no figure can be parsed, so
            # normalise that to None and only set the "£" prefix when an actual fee
            # was extracted.
            item['tuition_fee'] = None
            item['tuition_fee_pre'] = None
            if len(tuition_fee) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
            if item['tuition_fee'] == 0:
                item['tuition_fee'] = None
            if item['tuition_fee'] is not None:
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee'] = ", item['tuition_fee'])
            print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])

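            # The application process is not programme-specific, so a fixed block of
            # UCAS application-steps HTML is cleaned with the same helpers and stored.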
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """ <h2>Process</h2>
<ol><li>Choose the programmes you want to study. Still undecided? Search our <a href="/undergraduate">undergraduate degrees</a></li>
<li>Find out <a href="/apply/undergraduate/how-to-apply-through-ucas">how to apply through UCAS</a></li>
<li>Wait for universities to make their decisions, <a href="/apply/undergraduate/after-you-apply">learn what happens after you apply</a></li>
<li>Reply to your <a href="/apply/undergraduate/your-offer">university offers</a></li>
<li><a href="/apply/undergraduate/your-offer">Confirm your university place</a></li>
</ol>"""
                ]))
            # print("item['apply_proces_en'] = ", item['apply_proces_en'])

            # https://www.surrey.ac.uk/china/entry-requirements
            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<h2>Undergraduate</h2>
<p>We do not accept the Chinese National University Entrance Examination. However, you can apply to study for an <a href="http://isc.surrey.ac.uk/programmes/international-foundation-year?ch=uniweb&amp;cc=uniweb&amp;cid=uniweb&amp;utm_source=signposting&amp;utm_medium=signposting&amp;utm_campaign=uniweb&amp;_ga=2.246594701.825790074.1509959240-87246970.1500115796">International Foundation Year</a> at our <a href="http://isc.surrey.ac.uk/">International Study Centre</a>, which will prepare you for a full undergraduate degree course.</p>"""
                ]))

            # programme name and degree type
            programme_en = response.xpath(
                "//h1[@class='text-center my-0']//text()").extract()
            programme_en_str = (''.join(programme_en).split("–"))[0].strip()
            print(programme_en_str)

            if "2019" in ''.join(programme_en):
                item['start_date'] = '2019'
            # print("item['start_date'] = ", item['start_date'])

            # Work out how many records the course table splits into (one row per
            # degree_name / duration / ucascode combination).
            is_degree_name = response.xpath(
                "//tbody[@class='w-100']/tr").extract()
            # print("is_degree_name: ", is_degree_name)
            print(len(is_degree_name))
            for i in range(len(is_degree_name)):
                print("****************" + str(i + 1) + "***************")
                degree_name_re = re.findall(r"\w+\s\(Hons\).*|\w+$",
                                            programme_en_str)
                if len(degree_name_re) > 0:
                    item['degree_name'] = ''.join(degree_name_re).strip()
                    item['programme_en'] = programme_en_str.replace(
                        item['degree_name'], '').strip()
                else:
                    item['programme_en'] = programme_en_str
                print("item['programme_en'] = ", item['programme_en'])

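                # The regex-derived degree suffix above is only used to trim
                # programme_en; the authoritative degree_name comes from the first
                # column of this table row and overwrites it here.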
                degree_name_xpath = response.xpath(
                    "//tbody[@class='w-100']//tr[" + str(i + 1) +
                    "]/td[1]//text()").extract()
                clear_space(degree_name_xpath)
                item['degree_name'] = ''.join(degree_name_xpath).strip()
                print("item['degree_name'] = ", item['degree_name'])

                duration = response.xpath("//tbody[@class='w-100']//tr[" +
                                          str(i + 1) +
                                          "]/td[2]//text()").extract()
                clear_space(duration)
                # print("duration: ", duration)
                if len(duration) != 0:
                    duration_list = getIntDuration(''.join(duration))
                    # print("duration_list: ", duration_list)
                    if len(duration_list) == 2:
                        item['duration'] = duration_list[0]
                        item['duration_per'] = duration_list[-1]
                print("item['duration'] = ", item['duration'])
                print("item['duration_per'] = ", item['duration_per'])

                ucascode = response.xpath("//tbody[@class='w-100']//tr[" +
                                          str(i + 1) +
                                          "]/td[4]//text()").extract()
                clear_space(ucascode)
                item['ucascode'] = ''.join(ucascode).strip()
                print("item['ucascode']: ", item['ucascode'])

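                # A tick icon in the third column appears to mark the Professional
                # Training (placement) variant of the course.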
                tick = response.xpath(
                    "//tbody[@class='w-100']//tr[" + str(i + 1) +
                    "]/td[3]//i[@class='icon icon-tick']").extract()
                clear_space(tick)
                print("tick: ", tick)
                print(len(tick))
                if len(tick) == 1:
                    item['other'] = 'Professional Training'
                print("item['other']: ", item['other'])
                yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a+',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("Exception:", str(e))
            print("Error URL:", response.url)
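For reference, a minimal sketch of what the shared get_ielts() helper is assumed to do, based purely on how its result is consumed above; the real implementation is imported from the project's utility module and may handle more phrasings. get_ielts_sketch below is a hypothetical stand-in, not the project's code.

import re

def get_ielts_sketch(ielts_desc):
    # Assumed contract: return a dict with an overall band ("IELTS") and one band per
    # skill ("IELTS_L/S/R/W"), all None when nothing can be parsed.
    scores = {"IELTS": None, "IELTS_L": None, "IELTS_S": None,
              "IELTS_R": None, "IELTS_W": None}
    if not ielts_desc:
        return scores
    # e.g. "IELTS 6.5 overall with a minimum of 6.0 in each component" -> ['6.5', '6.0']
    bands = re.findall(r"\d\.\d|\d", ielts_desc)
    if bands:
        scores["IELTS"] = bands[0]
    if len(bands) > 1:
        # When a second figure is given it usually applies to every sub-skill.
        for key in ("IELTS_L", "IELTS_S", "IELTS_R", "IELTS_W"):
            scores[key] = bands[1]
    return scores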
    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Plymouth"
        item['url'] = response.meta['url']
        print("===========================")
        print(response.url)
        print(response.meta['url'])
        try:
            entry = response.xpath("//div[@id='entry-requirements-accordion']//text()").extract()
            clear_space(entry)
            print("entry: ", entry)

            if "A level" in entry:
                alevel_index = entry.index("A level")
                item['alevel'] = entry[alevel_index+1]
            elif "A level/AS level" in entry:
                alevel_index = entry.index("A level/AS level")
                item['alevel'] = entry[alevel_index + 1]
            elif "A levels" in entry:
                alevel_index = entry.index("A levels")
                item['alevel'] = entry[alevel_index + 1]
            elif "A levels:" in entry:
                alevel_index = entry.index("A levels:")
                item['alevel'] = entry[alevel_index + 1].strip()
            elif "A Level:" in entry:
                alevel_index = entry.index("A Level:")
                item['alevel'] = entry[alevel_index + 1]
            elif "A level:" in entry:
                alevel_index = entry.index("A level:")
                item['alevel'] = entry[alevel_index + 1]
            else:
                item['alevel'] = None
                alevel_index = 0
            if item['alevel'] == ":" or item['alevel'] == '':
                item['alevel'] = entry[alevel_index + 2]



            if "International Baccalaureate" in entry:
                ib_index = entry.index("International Baccalaureate")
                item['ib'] = entry[ib_index + 1]
            elif "International baccalaureate" in entry:
                ib_index = entry.index("International baccalaureate")
                item['ib'] = entry[ib_index + 1]
            elif "International baccalaureates" in entry:
                ib_index = entry.index("International baccalaureates")
                item['ib'] = entry[ib_index + 1]
            elif "International baccalaureates:" in entry:
                ib_index = entry.index("International baccalaureates:")
                item['ib'] = entry[ib_index + 1]
            elif "International baccalaureate:" in entry:
                ib_index = entry.index("International baccalaureate:")
                item['ib'] = entry[ib_index + 1]
            elif "International Baccalaureate:" in entry:
                ib_index = entry.index("International Baccalaureate:")
                item['ib'] = entry[ib_index + 1]
            elif "IB" in entry:
                ib_index = entry.index("IB")
                item['ib'] = entry[ib_index + 1]
            elif "IB:" in entry:
                ib_index = entry.index("IB:")
                item['ib'] = entry[ib_index + 1]
            else:
                item['ib'] = None
                ib_index = 0
            if item['ib'] == ":" or item['ib'] == '':
                item['ib'] = entry[ib_index + 2]

            # alevel = response.xpath(
            #     "//b[contains(text(),'A Level:')]/..//text()|//b[contains(text(),'A level:')]/..//text()").extract()
            # item['alevel'] = clear_lianxu_space(alevel)
            print("item['alevel']: ", item['alevel'])

            # ib = response.xpath(
            #     "//b[contains(text(),'Baccalaureate:')]/..//text()").extract()
            # if len(ib) == 0:
            #     ib = response.xpath(
            #         "//b[contains(text(),'International Baccalaureate')]/..//text()").extract()
            # item['ib'] = clear_lianxu_space(ib)
            print("item['ib']: ", item['ib'])

            yield item

        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] + str(item['degree_type']) + ".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("Exception:", str(e))
            print("Error URL:", response.url)
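Similarly, a hedged sketch of the fee parser these call sites rely on: getTuition_fee() appears to return an integer fee, with 0 meaning "no figure found", which is why a 0 result is mapped back to None above. get_tuition_fee_sketch is a hypothetical stand-in, not the project's helper.

import re

def get_tuition_fee_sketch(fee_text):
    # Pull the first money figure out of text such as "£9,250 per year" and return it
    # as an int; return 0 when no figure is present, mirroring the assumed contract.
    match = re.search(r"\d[\d,]*", fee_text)
    if not match:
        return 0
    return int(match.group(0).replace(",", ""))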