Exemplos de getStartDate em Python, exemplos de scrapySchool_England_Ben.getStartDate.getStartDate em Python

Exemplo n.º 1

0

Exibir arquivo

    def parse_data(self, response):
        """Parse a St George's, University of London undergraduate course page.

        Populates an item created via ``get_item(ScrapyschoolEnglandBenItem)``
        from ``response`` and yields it. Pages whose heading contains
        "Foundation" are skipped (nothing is yielded). Entry requirements,
        modules, assessment, career and application details are fetched from
        linked sub-pages through the ``self.parse_*`` helpers when the
        corresponding link is present on the page.

        Any exception raised while parsing is appended to a per-university
        file under ``scrapySchool_England_Ben/error/`` and printed; in that
        case the item is not yielded.

        Args:
            response: The course page response. ``response.meta`` is looked up
                by ``response.url`` to obtain ``major_type1``.

        Yields:
            The populated item, at most once.
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "St George's, University of London"
        item['url'] = response.url
        # Degree type
        item['degree_type'] = 1
        item['location'] = "Cranmer Terrace, London SW17 0RE"
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            programmeDegree_name = response.xpath(
                "//div[@class='inner']/h1//text()").extract()
            programmeDegree_nameStr = ''.join(programmeDegree_name).strip()

            if "Foundation" not in programmeDegree_nameStr:
                # The degree name is the trailing word and/or parenthesised
                # suffix of the heading; the rest is the programme name.
                degree_name = re.findall(r"\(.*\)$|\w+\s\(.*\)$|\w+$",
                                         programmeDegree_nameStr)
                degree_name_str = ''.join(degree_name).strip()
                item['degree_name'] = degree_name_str.replace("(", "").replace(
                    ")", "").replace("Hons", "").strip()
                print("item['degree_name']: ", item['degree_name'])

                programme = programmeDegree_nameStr.replace(
                    degree_name_str, "").strip()
                item['programme_en'] = programme
                print("item['programme_en']: ", item['programme_en'])

                ucascode = response.xpath(
                    "//*[contains(text(),'UCAS code')]//text()").extract()
                clear_space(ucascode)
                if len(ucascode) > 0:
                    ucascode_re = re.findall(r"UCAS\scode\s\w{4}",
                                             ''.join(ucascode))
                    item['ucascode'] = ''.join(ucascode_re).replace(
                        "UCAS code", "").strip()

                other = response.xpath(
                    "//img[@alt='globe']/../..//text()").extract()
                if len(other) == 0:
                    other = response.xpath(
                        "//td[contains(text(),'Open to UK and EU students. Not currently open to ')]//text()"
                    ).extract()
                item['other'] = clear_lianxu_space(other)

                # Start-date extraction is currently disabled:
                # start_date = response.xpath("//dt[contains(text(), 'Start date')]/following-sibling::dd[1]//text()").extract()
                # clear_space(start_date)
                # item['start_date'] = getStartDate(''.join(start_date))

                duration = response.xpath(
                    "//img[@alt='Calendar']/../following-sibling::td//text()"
                ).extract()
                if len(duration) == 0:
                    # Fallback for pages where the icon is nested one level
                    # deeper inside its cell.
                    duration = response.xpath(
                        "//img[@alt='Calendar']/../../following-sibling::td//text()"
                    ).extract()
                clear_space(duration)

                duration_list = getIntDuration(''.join(duration))
                if len(duration_list) == 2:
                    item['duration'] = duration_list[0]
                    item['duration_per'] = duration_list[-1]

                # //p[contains(text(),'Non-UK/EU (International) application deadline')]
                deadline = response.xpath(
                    "//*[contains(text(),'Application deadline')]//text()|//*[contains(text(),'UCAS deadline')]//text()"
                ).extract()
                clear_space(deadline)
                # Strip the labels and punctuation once; the cleaned text is
                # both the parser input and the fallback value.
                deadline_text = ''.join(deadline).replace(
                    "Application deadline",
                    "").replace("is", "").replace("UCAS deadline",
                                                  "").replace(":", "").strip()
                parsed_deadline = getStartDate(deadline_text)
                # Keep the raw text when the parsed value is non-empty but
                # carries neither expected year. The truthiness check also
                # guards against getStartDate returning None, which would
                # otherwise raise TypeError on the `in` tests and drop the
                # whole item.
                if (parsed_deadline and "2018" not in parsed_deadline
                        and "2019" not in parsed_deadline):
                    parsed_deadline = deadline_text
                item['deadline'] = parsed_deadline

                # Per-page location extraction is currently disabled; the
                # fixed campus address set above is used instead:
                # location = response.xpath("//*[contains(text(),'Study location:')]//text()").extract()
                # item['location'] = ''.join(location).replace("Study location:", "").strip()

                tuition_fee = response.xpath(
                    "//h3[contains(text(),'International (Non-EU) Student Fees')]/following-sibling::table//td[contains(text(),'2019/20')]/following-sibling::td[1]//text()|"
                    "//table//p[contains(text(),'2018 entry Non-EU')]//text()|"
                    "//table[2]/tbody/tr[4]/td/p[contains(text(),'2018 Non-EU')]/following-sibling::*/*[1]//text()|"
                    "//table//p[contains(text(),'2018 Non-EU')]/following-sibling::*[1]/*[1]//text()"
                ).extract()
                clear_space(tuition_fee)
                tuition_fee_re = re.findall(r"\d+,\d+", ''.join(tuition_fee))
                if len(tuition_fee_re) > 0:
                    item['tuition_fee'] = getTuition_fee(
                        ''.join(tuition_fee_re))

                overview_en = response.xpath(
                    "//p[@class='first']|//table[1]/following-sibling::*[position()<last()-1]"
                ).extract()
                item['overview_en'] = remove_class(
                    clear_lianxu_space(overview_en)).replace(
                        "<p><button>Make an enquiry</button></p>", "").strip()

                entry_url = response.xpath(
                    "//a[contains(text(),'Entry')]/@href").extract()
                if len(entry_url) != 0:
                    parse_entry_url = "https://www.sgul.ac.uk" + entry_url[0]
                    entry_dict = self.parse_rntry_requirements(parse_entry_url)
                    # item['rntry_requirements'] = entry_dict.get('rntry_requirements')

                    item['ielts_desc'] = entry_dict.get('ielts_desc')
                    item['alevel'] = entry_dict.get('alevel')
                    item['ib'] = entry_dict.get('ib')

                # ielts_desc is only assigned when an entry-requirements link
                # was found (and may be None even then), so fall back to an
                # empty string instead of letting re.findall raise.
                ielts_list = re.findall(r"\d[\d\.]{0,2}",
                                        item.get('ielts_desc') or "")
                if len(ielts_list) == 1:
                    # A single score applies to the overall band and to every
                    # component.
                    item['ielts'] = ielts_list[0]
                    item['ielts_l'] = ielts_list[0]
                    item['ielts_s'] = ielts_list[0]
                    item['ielts_r'] = ielts_list[0]
                    item['ielts_w'] = ielts_list[0]
                elif len(ielts_list) == 2:
                    # First score is the overall band, second is used for
                    # every component.
                    item['ielts'] = ielts_list[0]
                    item['ielts_l'] = ielts_list[1]
                    item['ielts_s'] = ielts_list[1]
                    item['ielts_r'] = ielts_list[1]
                    item['ielts_w'] = ielts_list[1]
                elif len(ielts_list) == 5:
                    # Scores are read in the order: overall, listening,
                    # reading, writing, speaking.
                    item['ielts'] = ielts_list[0]
                    item['ielts_l'] = ielts_list[1]
                    item['ielts_s'] = ielts_list[4]
                    item['ielts_r'] = ielts_list[2]
                    item['ielts_w'] = ielts_list[3]

                modules_url = response.xpath(
                    "//a[contains(text(),'Module')]/@href").extract()
                if len(modules_url) != 0:
                    parse_modules_url = "https://www.sgul.ac.uk" + modules_url[
                        0]
                    item['modules_en'] = remove_class(
                        clear_lianxu_space(
                            self.parse_modules(parse_modules_url))).strip()

                assessment_en_url = response.xpath(
                    "//a[contains(text(),'Studying')]/@href").extract()
                if len(assessment_en_url) != 0:
                    parse_assessment_en_url = "https://www.sgul.ac.uk" + assessment_en_url[
                        0]
                    item['assessment_en'] = remove_class(
                        clear_lianxu_space(
                            self.parse_assessment_en(
                                parse_assessment_en_url))).strip()

                career_en_url = response.xpath(
                    "//a[contains(text(),'Career')]/@href").extract()
                if len(career_en_url) != 0:
                    parse_career_en_url = "https://www.sgul.ac.uk" + career_en_url[
                        0]
                    item['career_en'] = remove_class(
                        clear_lianxu_space(
                            self.parse_career_en(
                                parse_career_en_url))).replace(
                                    "<p><img></p>", "").strip()

                apply_proces_en_url = response.xpath(
                    "//a[contains(text(),'Apply')]/@href|//a[contains(text(),'Application and interview')]/@href"
                ).extract()
                if len(apply_proces_en_url) != 0:
                    parse_apply_proces_en_url = "https://www.sgul.ac.uk" + apply_proces_en_url[
                        0]
                    item['apply_proces_en'] = remove_class(
                        clear_lianxu_space(
                            self.parse_apply_proces_en(
                                parse_apply_proces_en_url))).replace(
                                    "<p><img></p>", "").strip()

                yield item
        except Exception as e:
            # Best-effort error log: record the failure and the URL, then
            # carry on without yielding this item.
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 2

0

Exibir arquivo

Arquivo: UniversityOfChester_U.py Projeto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Chester"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        try:
            programme = response.xpath(
                "//h1[@id='main-content']/text()").extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            degree_type = response.xpath(
                "//h1[@id='main-content']/div//text()").extract()
            item['degree_name'] = ''.join(degree_type).replace("(Hons)",
                                                               "").strip()
            print("item['degree_name']: ", item['degree_name'])

            start_date = response.xpath(
                "//select[@id='edit-date']/option//text()|//label[@for='edit-date']/following-sibling::span//text()"
            ).extract()
            clear_space(start_date)
            # print("start_date: ", start_date)
            start_date_str = ""
            if len(start_date) > 0:
                for s in start_date:
                    # start_date_str = getStartDate(s)
                    if getStartDate(s) != None:
                        start_date_str += getStartDate(s) + ", "
            item['start_date'] = start_date_str.strip().strip(',').strip()
            # print("item['start_date']: ", item['start_date'])

            mode = response.xpath(
                "//select[@id='edit-mode']//text()").extract()
            clear_space(mode)
            # item['teach_time'] = getTeachTime(''.join(mode))
            # print("mode: ", mode)

            location = response.xpath(
                "//label[@for='edit-compulsory']/following-sibling::*//text()"
            ).extract()
            item['location'] = ''.join(location).strip()
            # print("item['location']: ", item['location'])

            ucascode = response.xpath(
                "//dt[contains(text(),'UCAS Code')]/following-sibling::*//text()"
            ).extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).strip()
            print("item['ucascode'] = ", item['ucascode'])

            duration = response.xpath(
                "//dt[contains(text(),'Duration')]/following-sibling::*//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            overview_en = response.xpath(
                "//h3[contains(text(),'Course overview')]/../*[position()<last()]|"
                "//div[@class='m-body__margin-bottom t-course__overview']"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))
            # print("item['overview_en']: ", item['overview_en'])

            entry_requirements = response.xpath(
                "//div[@id='entry-international']//form[@id='courses-international-form']/preceding-sibling::*//text()"
            ).extract()
            # item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            alevel = response.xpath(
                "//td[contains(text(),'GCE A Level')]/following-sibling::*//text()"
            ).extract()
            item['alevel'] = clear_lianxu_space(alevel)
            # print("item['alevel']: ", item['alevel'])

            ib = response.xpath(
                "//td[contains(text(),'International Baccalaureate')]/following-sibling::*//text()"
            ).extract()
            item['ib'] = clear_lianxu_space(ib)
            # print("item['ib']: ", item['ib'])

            ielts_desc = response.xpath(
                "//div[@id='entry-international']//li[contains(text(),'Undergraduate:')]//text()"
            ).extract()
            clear_space(ielts_desc)
            item['ielts_desc'] = ''.join(ielts_desc).strip()
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            assessment_en = response.xpath(
                "//h3[@class='field-label'][contains(text(),'How will I be taught?')]/..|"
                "//h3[@class='field-label'][contains(text(),'How will I be assessed?')]/.."
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            tuition_fee = response.xpath(
                "//div[@class='field-fees-international']/p//text()|"
                "//p[contains(text(),'The tuition fees for international students studyi')]//text()"
            ).extract()
            print("tuition_fee: ", tuition_fee)
            if len(tuition_fee) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee']: ", item['tuition_fee'])
            print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            career_en = response.xpath(
                "//div[@id='careers-career-services']").extract()
            item['career_en'] = remove_class(
                clear_lianxu_space(career_en)).replace("<div></div>",
                                                       "").strip()
            print("item['career_en']: ", item['career_en'])

            modules = re.findall(
                r"function\sinit_drupal_core_settings\(\)\s{jQuery\.extend\(Drupal\.settings,.*}",
                response.text)
            # print("modules: ", modules)
            modules_str = ''.join(modules).replace(
                "function init_drupal_core_settings() {jQuery.extend(Drupal.settings,",
                "").strip()
            modules_dict = json.loads(modules_str)
            # print("modules_dict: ", modules_dict)
            # groupCode     modulesNid
            # print(modules_dict.get("courses"))
            if modules_dict.get('courses').get('groupCode') is not False:
                modules_json = "https://www1.chester.ac.uk/courses/modules/ajax/" + modules_dict.get(
                    'courses').get('modulesNid') + "/" + modules_dict.get(
                        'courses').get('groupCode') + "/389"
                # print("modules_json: ", modules_json)
                mdict = json.loads(requests.get(modules_json).text)
                # print("mdict: ", len(mdict))
                m = mdict[-1].get('data')
                if m != None:
                    item['modules_en'] = remove_class(clear_lianxu_space([m]))
            # print("item['modules_en']: ", item['modules_en'])

            item[
                'apply_proces_en'] = "https://www1.chester.ac.uk/undergraduate/how-apply/applying-full-time-courses"
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<div class="field-collection-view clearfix view-mode-full">
  <h3 class="field-course-type">
    Undergraduate Study  </h3>

  <ul><li>UK foundation/pathway course with a pass mark of 50% and above.  Engineering courses require an additional mark of at least 55% in a Maths module. </li>
<li>China 3 year National Senior High School Certificate with 80% or above</li>
<li>Gaokao (College Entry Exam) with good grades </li>
<li>Dazhuan considered for entry to 3rd year UG</li>
<li>BFSU Foundation Year at 60% or above</li>
<li>Dongfang International Centre for Education Exchange Top University Foundation Course 60% or above</li>
<li>East and West International Education (EWIE)/ Wiseway Global International Foundation Certificate at 60% or above</li>
<li>Graduation Certificate from a specialised College/School (Zhongzuhan) with 80% or above</li>
</ul></div>"""
                ]))
            # print("item['require_chinese_en']: ", item['require_chinese_en'])

            # department = response.xpath("//dt[contains(text(), 'Department')]/following-sibling::dd[1]//text()").extract()
            # item['department'] = ''.join(department).strip()
            # print("item['department']: ", item['department'])

            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 3

0

Exibir arquivo

Arquivo: NottinghamTrentUniversity_U.py Projeto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.ntu.ac.uk/"
        item['university'] = "Nottingham Trent University"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        print("===============================")
        # print(response.url)
        print(item['url'])
        try:
            # 专业、学位类型
            programme = response.xpath(
                "//h1[@class='course-heading page-heading']//text()").extract(
                )
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en'] = ", item['programme_en'])

            degree_type = response.xpath(
                "//h2[@class='js_qualification']/strong//text()").extract()
            item['degree_name'] = ''.join(degree_type)
            print("item['degree_name'] = ", item['degree_name'])

            # //div[@id='tabs-key-info']/div[@class='tab tab-1 active-tab']/p[3]/span
            location = response.xpath(
                "//span[@class='location save']//text()").extract()
            item['location'] = ''.join(location).strip()
            # print("item['location'] = ", item['location'])

            start_date = response.xpath(
                "//strong[contains(text(),'Starting:')]/following-sibling::span//text()"
            ).extract()
            # print(start_date)
            item['start_date'] = ''.join(start_date)
            # print("item['start_date'] = ", item['start_date'])
            item['start_date'] = getStartDate(item['start_date'])
            # print("item['start_date']1 = ", item['start_date'])

            # //html//div[@class='content']/div[1]/div  专业描述
            overview = response.xpath(
                "//div[@id='what-you-will-study']/preceding-sibling::*"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en'] = ", item['overview_en'])

            # modules   课程设置
            modules = response.xpath(
                "//div[@id='what-you-will-study']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en'] = ", item['modules_en'])

            # teaching_assessment   评估方式
            teaching_assessment = response.xpath(
                "//div[@id='how-youre-taught']").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(teaching_assessment))
            # print("item['assessment_en'] = ", item['assessment_en'])

            # career   评估方式
            career = response.xpath(
                "//div[@id='careers-and-employability']").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en'] = ", item['career_en'])

            # //div[@id='entry-requirements-1']
            entry_requirements = response.xpath(
                "//div[@id='entry-requirements-0']").extract()
            item['apply_desc_en'] = remove_class(
                clear_lianxu_space(entry_requirements))
            print("item['apply_desc_en'] = ", item['apply_desc_en'])

            # //div[@id='entry-requirements-1']
            how_to_apply = response.xpath(
                "//div[@id='how-to-apply-1']").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(how_to_apply))
            # print("item['apply_proces_en'] = ", item['apply_proces_en'])

            # //div[@id='how-to-apply-1']//h3[contains(text(),'Interview')]/following-sibling::p[position()<3]
            interview_desc_en = response.xpath(
                "//div[@id='how-to-apply-1']//h3[contains(text(),'Interview')]/following-sibling::p[position()<3]"
            ).extract()
            item['interview_desc_en'] = remove_class(
                clear_lianxu_space(interview_desc_en))
            # print("item['interview_desc_en'] = ", item['interview_desc_en'])

            # deadline
            deadline = response.xpath(
                "//div[@id='how-to-apply-1']//p//strong[contains(text(),'Application closing date')]/../following-sibling::p[1]//text()|//div[@id='how-to-apply-1']//h3[contains(text(),'Application deadline')]/following-sibling::p[1]//text()"
            ).extract()
            clear_space(deadline)
            # print("deadline: ", deadline)
            deadline_str = ''.join(deadline)
            item['deadline'] = getStartDate(deadline_str)
            # print("item['deadline'] = ", item['deadline'])

            alevel = response.xpath(
                "//div[@id='entry-requirements-0']//li[contains(text(),'A-levels')]//text()"
            ).extract()
            clear_space(alevel)
            # print(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[0]).strip()
            # print("item['alevel'] = ", item['alevel'])

            if len(item['alevel']) > 160:
                item['alevel'] = ''.join(item['alevel'][:161])
            # print("item['alevel']1 = ", item['alevel'])

            # https://www.ntu.ac.uk/international/scholarships-and-fees/tuition-fees
            tuition_fee = response.xpath(
                "//html//div[@id='fees-and-funding-1']//text()").extract()
            clear_space(tuition_fee)
            # print(tuition_fee)
            tuition_fee = getTuition_fee(''.join(tuition_fee))
            item['tuition_fee'] = tuition_fee
            if item['tuition_fee'] == 0:
                item['tuition_fee'] = None
            else:
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee'] = ", item['tuition_fee'])
            # print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])

            departmentDict = {
                "Economics with Business":
                "Nottingham Business School",
                "Animal Health and Welfare":
                "School of Animal, Rural and Environmental Sciences",
                "Applied Anthrozoology":
                "School of Animal, Rural and Environmental Sciences",
                "Biodiversity Conservation":
                "School of Animal, Rural and Environmental Sciences",
                "Endangered Species Recovery and Conservation":
                "School of Animal, Rural and Environmental Sciences",
                "Equine Health and Welfare":
                "School of Animal, Rural and Environmental Sciences",
                "Equine Performance":
                "School of Animal, Rural and Environmental Sciences",
                "Equine Performance, Health and Welfare":
                "School of Animal, Rural and Environmental Sciences",
                "Global Food Security and Development":
                "School of Animal, Rural and Environmental Sciences",
                "Architecture":
                "School of Architecture, Design and the Built Environment",
                "Architecture (ARB/RIBA Part 2)rch":
                "School of Architecture, Design and the Built Environment",
                "Building Surveying":
                "School of Architecture, Design and the Built Environment",
                "Civil Engineering":
                "School of Architecture, Design and the Built Environment",
                "Construction Management":
                "School of Architecture, Design and the Built Environment",
                "Construction Project Management (Online)":
                "School of Architecture, Design and the Built Environment",
                "Interior Architecture and Design":
                "School of Architecture, Design and the Built Environment",
                "International Real Estate Investment and Finance":
                "School of Architecture, Design and the Built Environment",
                "Planning and Development":
                "School of Architecture, Design and the Built Environment",
                "Project Management (Construction)":
                "School of Architecture, Design and the Built Environment",
                "Quantity Surveying":
                "School of Architecture, Design and the Built Environment",
                "Real Estate":
                "School of Architecture, Design and the Built Environment",
                "Structural Engineering with Management":
                "School of Architecture, Design and the Built Environment",
                "Structural Engineering with Materials":
                "School of Architecture, Design and the Built Environment",
                "Animation":
                "School of Art & Design",
                "Commercial Photography":
                "School of Art & Design",
                "Culture, Style and Fashion":
                "School of Art & Design",
                "Branding and Identity":
                "School of Art & Design",
                "Fashion Communications":
                "School of Art & Design",
                "Fashion Design":
                "School of Art & Design",
                "Fashion Knitwear Design":
                "School of Art & Design",
                "Fashion Marketing":
                "School of Art & Design",
                "Fine Art":
                "School of Art & Design",
                "Graphic Design":
                "School of Art & Design",
                "Illustration":
                "School of Art & Design",
                "International Fashion Management":
                "School of Art & Design",
                "Luxury Fashion Brand Management":
                "School of Art & Design",
                "Photography":
                "School of Art & Design",
                "Textile Design Innovation":
                "School of Art & Design",
                "Culture, Style and Fashion":
                "School of Art & Design",
                "Fashion Communications":
                "School of Art & Design",
                "Fashion Marketing":
                "School of Art & Design",
                "Fashion and Textile Design":
                "School of Art & Design",
                "Fine Art":
                "School of Art & Design",
                "Graphic Design Theory and Practice":
                "School of Art & Design",
                "International Fashion Management":
                "School of Art & Design",
                "Luxury Fashion Brand Management":
                "School of Art & Design",
                "Photography":
                "School of Art & Design",
                "PG Cert Creative Pattern Cutting (15 weeks)":
                "School of Art & Design",
                "Art and Design Professional Doctorate":
                "School of Art & Design",
                "Art and Design":
                "School of Art & Design",
                "Broadcast Journalism":
                "School of Arts and Humanities",
                "Digital and Newspaper Journalism":
                "School of Arts and Humanities",
                "Magazine Journalism":
                "School of Arts and Humanities",
                "Documentary Journalism":
                "School of Arts and Humanities",
                "Media and Globalisation":
                "School of Arts and Humanities",
                "Creative Writing":
                "School of Arts and Humanities",
                "English Literary Research":
                "School of Arts and Humanities",
                "Linguistics":
                "School of Arts and Humanities",
                "Philosophy":
                "School of Arts and Humanities",
                "History":
                "School of Arts and Humanities",
                "PGCert Museum and Heritage Development":
                "School of Arts and Humanities",
                "Holocaust and Genocide":
                "School of Arts and Humanities",
                "International Development":
                "School of Arts and Humanities",
                "English Language Teaching":
                "School of Arts and Humanities",
                "TESOL (Teaching English to Speakers of Other Languages)":
                "School of Arts and Humanities",
                "Management":
                "Nottingham Business School",
                "Management and Finance":
                "Nottingham Business School",
                "Management and Global Supply Chain Management":
                "Nottingham Business School",
                "Management and Innovation and Enterprise":
                "Nottingham Business School",
                "Management and International Business":
                "Nottingham Business School",
                "Management and Marketing":
                "Nottingham Business School",
                "Marketing":
                "Nottingham Business School",
                "Branding and Advertising":
                "Nottingham Business School",
                "Digital Marketing":
                "Nottingham Business School",
                "Management and Marketing":
                "Nottingham Business School",
                "fees, funding and scholarships":
                "Nottingham Business School",
                "Return to all courses":
                "Nottingham Business School",
                "Human resource Management":
                "Nottingham Business School",
                "Economics":
                "Nottingham Business School",
                "Economics and Investment Banking":
                "Nottingham Business School",
                "International Business":
                "Nottingham Business School",
                "International Business (Dual Award) ":
                "Nottingham Business School",
                "Management and International Business":
                "Nottingham Business School",
                "Management and International Publishing":
                "Nottingham Business School",
                "Management and Global Supply Chain Management":
                "Nottingham Business School",
                "Finance":
                "Nottingham Business School",
                "Finance and Accounting":
                "Nottingham Business School",
                "Finance and Investment Banking":
                "Nottingham Business School",
                "Management and Finance":
                "Nottingham Business School",
                "Economics and Investment Banking":
                "Nottingham Business School",
                "Entrepreneurship":
                "Nottingham Business School",
                "Project Management":
                "Nottingham Business School",
                "Management":
                "Nottingham Business School",
                "Management and International Business":
                "Nottingham Business School",
                "Marketing":
                "Nottingham Business School",
                "Branding and Advertising":
                "Nottingham Business School",
                "Finance":
                "Nottingham Business School",
                "International Business":
                "Nottingham Business School",
                "Assessment Only Route to QTS (Primary) - Non-NTU Award":
                "Nottingham Institute of Education",
                "Assessment Only Route to QTS (Secondary) - Non-NTU Award":
                "Nottingham Institute of Education",
                "Early Years Initial Teacher Training":
                "Nottingham Institute of Education",
                "Early Years Initial Teacher Training (Assessment Only) - Non-NTU Award":
                "Nottingham Institute of Education",
                "Education":
                "Nottingham Institute of Education",
                "English Language Teaching":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training (with English and Literacy)":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training (with Mathematics and Numeracy)":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training (with Science, Engineering and Technology)":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training (with Special and Inclusive Practice)":
                "Nottingham Institute of Education",
                "Primary Education":
                "Nottingham Institute of Education",
                "Primary: School-Centred Initial Teacher Training (SCITT)":
                "Nottingham Institute of Education",
                "School Direct Training Programme (Primary salaried)":
                "Nottingham Institute of Education",
                "School Direct Training Programme (Primary)":
                "Nottingham Institute of Education",
                "School Direct Training Programme (Secondary salaried)":
                "Nottingham Institute of Education",
                "School Direct Training Programme (Secondary)":
                "Nottingham Institute of Education",
                "Secondary Biology":
                "Nottingham Institute of Education",
                "Secondary Business Education":
                "Nottingham Institute of Education",
                "Secondary Chemistry":
                "Nottingham Institute of Education",
                "Secondary Computer Science with ICT":
                "Nottingham Institute of Education",
                "Secondary Education (Design and Technology)":
                "Nottingham Institute of Education",
                "Secondary Education (Physics)":
                "Nottingham Institute of Education",
                "Secondary English":
                "Nottingham Institute of Education",
                "Secondary Mathematics":
                "Nottingham Institute of Education",
                "Secondary Music":
                "Nottingham Institute of Education",
                "Special Educational Needs Coordination - National Award":
                "Nottingham Institute of Education",
                "Teaching English to Speakers of Other Languages (TESOL)":
                "Nottingham Institute of Education",
                "Corporate and Insolvency Law":
                "Nottingham Law School",
                "Dual LLM in Corporate and Insolvency Law / European and Insolvency Law":
                "Nottingham Law School",
                "General Law":
                "Nottingham Law School",
                "Health Law and Ethics":
                "Nottingham Law School",
                "Human Rights and Justice":
                "Nottingham Law School",
                "Intellectual Property Law":
                "Nottingham Law School",
                "International Financial Law":
                "Nottingham Law School",
                "International Trade and Commercial Law":
                "Nottingham Law School",
                "Oil, Gas and Mining Law":
                "Nottingham Law School",
                "Sports Law":
                "Nottingham Law School",
                "Corporate and Insolvency Law":
                "Nottingham Law School",
                "International Trade and Commercial Law":
                "Nottingham Law School",
                "Legal Practice":
                "Nottingham Law School",
                "Oil, Gas and Mining Law":
                "Nottingham Law School",
                "Biomedical Science":
                "School of Science and Technology",
                "Biomedical Science (Flexible Learning)":
                "School of Science and Technology",
                "Neuropharmacology":
                "School of Science and Technology",
                "Pharmacology":
                "School of Science and Technology",
                "Molecular Microbiology":
                "School of Science and Technology",
                "Biotechnology":
                "School of Science and Technology",
                "Molecular Cell Biology":
                "School of Science and Technology",
                "Environmental Management":
                "School of Science and Technology",
                "Biotechnology":
                "School of Science and Technology",
                "Cancer Biology":
                "School of Science and Technology",
                "Cell Biology":
                "School of Science and Technology",
                "Molecular Biology":
                "School of Science and Technology",
                "Molecular Microbiology":
                "School of Science and Technology",
                "Neuropharmacology":
                "School of Science and Technology",
                "Pharmacology":
                "School of Science and Technology",
                "Environmental Management":
                "School of Science and Technology",
                "Biomedical Science (Flexible Learning)":
                "School of Science and Technology",
                "Environmental Management":
                "School of Science and Technology",
                "Chemistry / Chemistry (Professional Practice)":
                "School of Science and Technology",
                "Pharmaceutical and Medicinal Science":
                "School of Science and Technology",
                "Pharmaceutical Analysis":
                "School of Science and Technology",
                "Analytical Chemistry":
                "School of Science and Technology",
                "Chemistry":
                "School of Science and Technology",
                "Advanced Materials Engineering":
                "School of Science and Technology",
                "Forensic Science":
                "School of Science and Technology",
                "Computer Science":
                "School of Science and Technology",
                "Cloud and Enterprise Computing":
                "School of Science and Technology",
                "IT Security":
                "School of Science and Technology",
                "Engineering (Electronics)":
                "School of Science and Technology",
                "Engineering (Cybernetics and Communications)":
                "School of Science and Technology",
                "Engineering Management":
                "School of Science and Technology",
                "Computing Systems":
                "School of Science and Technology",
                "Data Analytics for Business":
                "School of Science and Technology",
                "Computer Science":
                "School of Science and Technology",
                "Electronic Systems":
                "School of Science and Technology",
                "Online MBA with Data Analytics":
                "School of Science and Technology",
                "Mathematical Sciences":
                "School of Science and Technology",
                "Data Analytics for Business":
                "School of Science and Technology",
                "Online MBA with Data Analytics":
                "School of Science and Technology",
                "Medical and Materials Imaging":
                "School of Science and Technology",
                "Medical Imaging":
                "School of Science and Technology",
                "Physics":
                "School of Science and Technology",
                "Physics":
                "School of Science and Technology",
                "Sport Science":
                "School of Science and Technology",
                "Exercise Physiology":
                "School of Science and Technology",
                "Performance Nutrition":
                "School of Science and Technology",
                "Performance Analysis":
                "School of Science and Technology",
                "Biomechanics":
                "School of Science and Technology",
                "Sport and Exercise Psychology":
                "School of Science and Technology",
                "Psychology":
                "School of Social Sciences",
                "Applied Child Psychology":
                "School of Social Sciences",
                "sychological Research Methods":
                "School of Social Sciences",
                "Forensic Mental Health":
                "School of Social Sciences",
                "Forensic Psychology (BPS accredited)":
                "School of Social Sciences",
                "Cyberpsychology":
                "School of Social Sciences",
                "Psychology in Clinical Practice":
                "School of Social Sciences",
                "Psychological Wellbeing and Mental Health":
                "School of Social Sciences",
                "Criminology":
                "School of Social Sciences",
                "Sociology":
                "School of Social Sciences",
                "Politics":
                "School of Social Sciences",
                "International Relations":
                "School of Social Sciences",
                "Online International Relations (Distance learning)":
                "School of Social Sciences",
                "Public Health":
                "School of Social Sciences",
                "Career Development":
                "School of Social Sciences",
                "Social Work (January 2019 entry)":
                "School of Social Sciences",
            }
            item['department'] = departmentDict.get(item['programme_en'])
            if item['department'] == None:
                item['department'] = departmentDict.get(item['programme_en'])
                if item['department'] == None:
                    item['department'] = departmentDict.get(
                        item['programme_en'])
                    if item['department'] == None:
                        item['department'] = departmentDict.get(
                            item['programme_en'].replace(" ", " "))
            print("item['department'] = ", item['department'])

            # School of Animal, Rural and Environmental Sciences
            # School of Architecture, Design and the Built Environment
            # School of Art &amp; Design
            # School of Arts and Humanities
            # Nottingham Business School
            # Nottingham Institute of Education
            # Nottingham Law School
            # School of Science and Technology
            # School of Social Sciences
            if item['department'] is None:
                if "/animal-rural-environmental-sciences" in item['url']:
                    item[
                        'department'] = "School of Animal, Rural and Environmental Sciences"
                elif "/architecture-design-built-environment" in item['url']:
                    item[
                        'department'] = "School of Architecture, Design and the Built Environment"
                elif "/art-design" in item['url']:
                    item['department'] = "School of Art & Design"
                elif "/arts-humanities" in item['url']:
                    item['department'] = "School of Arts and Humanities"
                elif "/business" in item['url']:
                    item['department'] = "Nottingham Business School"
                elif "/education" in item['url']:
                    item['department'] = "Nottingham Institute of Education"
                elif "/law" in item['url']:
                    item['department'] = "Nottingham Law School"
                elif "/science-technology" in item['url']:
                    item['department'] = "School of Science and Technology"
                elif "/social-sciences" in item['url']:
                    item['department'] = "School of Social Sciences"
            print("item['department']1 = ", item['department'])

            if item['degree_name'] == "BA (Hons)":
                item['ielts'] = 7.0
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.5
                item['ielts_w'] = 6.5
            elif item['department'] == "School of Art & Design" or item[
                    'department'] == "School of Animal, Rural and Environmental Sciences" or item[
                        'department'] == "School of Science and Technology":
                item['ielts'] = 6.0
                item['ielts_l'] = 5.5
                item['ielts_s'] = 5.5
                item['ielts_r'] = 5.5
                item['ielts_w'] = 5.5
            elif item['department'] == "Nottingham Business School":
                item['ielts'] = 6.5
                item['ielts_l'] = 5.5
                item['ielts_s'] = 5.5
                item['ielts_r'] = 5.5
                item['ielts_w'] = 5.5
            elif item['department'] == "School of Architecture, Design and the Built Environment" or item[
                    'department'] == "School of Arts and Humanities" or item[
                        'department'] == "Nottingham Institute of Education" or item[
                            'department'] == "Nottingham Law School" or item[
                                'department'] == "School of Social Sciences" or item[
                                    'department'] == "School of Art & Design":
                item['ielts'] = 6.5
                item['ielts_l'] = 5.5
                item['ielts_s'] = 5.5
                item['ielts_r'] = 5.5
                item['ielts_w'] = 5.5
            # print("item['IELTS'] = %s item['IELTS_L'] = %s item['IELTS_S'] = %s item['IELTS_R'] = %s item['IELTS_W'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            entry_requirements = response.xpath(
                "//div[@id='entry-requirements-1']//text()").extract()
            entry_requirementsStr = ''.join(entry_requirements)
            ielts = re.findall(r"IELTS.{1,200}", entry_requirementsStr)
            item['ielts_desc'] = ''.join(ielts)
            # print("item['ielts_desc']: ", item['ielts_desc'])

            if item['ielts'] == None:
                ieltsDict = get_ielts(''.join(ielts))
                item['ielts'] = ieltsDict.get("IELTS")
                item['ielts_l'] = ieltsDict.get("IELTS_L")
                item['ielts_s'] = ieltsDict.get("IELTS_S")
                item['ielts_r'] = ieltsDict.get("IELTS_R")
                item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['IELTS'] = %sitem['IELTS_L'] = %sitem['IELTS_S'] = %sitem['IELTS_R'] = %sitem['IELTS_W'] = %s==" % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            item["require_chinese_en"] = remove_class(
                clear_lianxu_space([
                    """<h2>Entry requirements</h2>
                        <table id="table76765" style="width: 100%;"><thead><tr><th id="table76765r1c1"> Your qualification</th><th id="table76765r1c2"> You could study</th></tr></thead><tbody><tr><td headers="table76765r1c1">         High School Year 2<br />Grades of 70% and above       </td><td headers="table76765r1c2">         Foundation courses at <a href="https://www.kaplanpathways.com/colleges/nottingham-trent-international-college/courses/">Nottingham Trent International College (NTIC) </a></td></tr><tr><td headers="table76765r1c1">         High School Year 3<br />Grades of 80% and above       </td><td headers="table76765r1c2">         International Year One courses at NTIC       </td></tr><tr><td headers="table76765r1c1">         Completion of first year of Chinese university degree       </td><td headers="table76765r1c2">         First year bachelors degrees       </td></tr><tr><td headers="table76765r1c1">         Three year diploma or higher national diploma       </td><td headers="table76765r1c2">         Considered for final year entry to selected bachelors degrees or for Pre-Masters courses at <a href="https://www.kaplanpathways.com/colleges/nottingham-trent-international-college/courses/">Nottingham Trent International College</a></td></tr><tr><td headers="table76765r1c1">         Bachelors degree (four years or six years in medicine / dentistry) from recognised institution in China. <br />Grades of 75% or above<br />Grades of 70% or above from 211 universities       </td><td headers="table76765r1c2">         Postgraduate (Masters) courses       </td></tr><tr><td headers="table76765r1c1">         Masters degree from a recognised institution in China.<br />Grades of 70% or above       </td><td headers="table76765r1c2">         Postgraduate research       </td></tr></tbody></table><p>If you have questions about your qualification and it is not listed here, please <a href="mailto:[email protected]">contact us</a> for advice.</p>
"""
                ]))

            ucascode = response.xpath(
                "//strong[contains(text(),'UCAS code(s):')]/following-sibling::*//text()"
            ).extract()
            clear_space(ucascode)
            if len(ucascode) > 0:
                item['ucascode'] = ''.join(ucascode[0]).replace(" / ",
                                                                "/").strip()
            print("item['ucascode']: ", item['ucascode'])

            duration = response.xpath(
                "//strong[contains(text(),'Course duration:')]/following-sibling::span//text()"
            ).extract()
            print("duration: ", duration)
            duration_str = ''.join(duration).replace("/ sandwich", "").strip()
            duration_list = getIntDuration(''.join(duration))
            # print("duration_list: ", duration_list)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            duration_per = item['duration_per']
            print("item['duration'] = ", item['duration'])
            print("item['duration_per'] = ", item['duration_per'])

            if "/" in item['ucascode']:
                ucascode_sp = item['ucascode'].split("/")
                if "/" in duration_str:
                    duration_sp = duration_str.split("/")
                elif " or" in duration_str:
                    duration_sp = duration_str.split(" or")
                elif "," in duration_str:
                    duration_sp = duration_str.split(" or")
                else:
                    duration_sp = [duration_str, duration_str]
                print("ucascode_sp: ", ucascode_sp)
                print("duration_sp: ", duration_sp)
                if len(ucascode_sp) == 2:
                    item['ucascode'] = ucascode_sp[0]
                    duration_list = getIntDuration(duration_sp[0])
                    if len(duration_list) == 2:
                        item['duration'] = duration_list[0]
                        item['duration_per'] = duration_list[-1]
                    if item['duration'] == None:
                        item['duration'] = int(duration_sp[0].strip())
                        item['duration_per'] = duration_per
                    print("item['ucascode']1: ", item['ucascode'])
                    print("item['duration']1 = ", item['duration'])
                    print("item['duration_per']1 = ", item['duration_per'])
                    yield item

                    item['ucascode'] = ucascode_sp[-1]
                    duration_list = getIntDuration(duration_sp[-1].strip())
                    if len(duration_list) == 2:
                        item['duration'] = duration_list[0]
                        item['duration_per'] = duration_list[-1]
                    if item['duration'] == None:
                        item['duration'] = int(duration_sp[-1].replace(
                            "year", "").replace("(s)", "").strip())
                        item['duration_per'] = 1
                    print("item['ucascode']2: ", item['ucascode'])
                    print("item['duration']2 = ", item['duration'])
                    print("item['duration_per']2 = ", item['duration_per'])
                    yield item
            else:
                yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 4

0

Exibir arquivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Surrey"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item[
            'location'] = '01SE01, Senate House, University of Surrey, Guildford, Surrey GU2 7XH'
        # print("item['location'] = ", item['location'])
        print("===============================")
        print(response.url)
        try:
            overview = response.xpath(
                "//h3[contains(text(),'Course facts')]/../preceding-sibling::*"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en'] = ", item['overview_en'])

            career = response.xpath(
                "//h2[contains(text(),'Careers')]/preceding-sibling::*[1]/following-sibling::*[position()<last()-3]"
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # if item['career_en'] == "":
            #     print("***career_en")
            # print("item['career_en'] = ", item['career_en'])

            modules = response.xpath(
                "//div[@class='module-list']/following-sibling::*[1]/preceding-sibling::*"
            ).extract()
            # modules1 = response.xpath("//div[@id='modules-ft']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # if item['modules_en'] == "":
            #     print("***modules_en")
            # print("item['modules_en'] = ", item['modules_en'])

            assessment_en = response.xpath(
                "//h2[contains(text(),'Teaching')]/preceding-sibling::*[1]/following-sibling::*[position()<7]|"
                "//h2[contains(text(),'Assessment')]/preceding-sibling::*[1]/following-sibling::*[position()<3]"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # if item['assessment_en'] == "":
            #     print("***assessment_en")
            # print("item['assessment_en'] = ", item['assessment_en'])

            # //a[contains(text(), 'Faculty of')]|//a[contains(text(), 'School of')]
            department = response.xpath(
                "//a[contains(text(), 'Faculty of')]//text()|"
                "//a[contains(text(), 'School of')]//text()").extract()
            item['department'] = remove_class(
                clear_lianxu_space(department)).replace(
                    "academic staff in the", "").strip()
            # if item['department'] == "":
            #     print("***department")
            # print("item['department'] = ", item['department'])

            entry_requirements = response.xpath(
                "//div[@id='entry-collapse']").extract()
            item['apply_desc_en'] = remove_class(
                clear_lianxu_space(entry_requirements))
            # print("item['apply_desc_en'] = ", item['apply_desc_en'])

            alevel = response.xpath(
                "//h3[contains(text(),'A-level')]/following-sibling::*[1]//text()"
            ).extract()
            alevel_str = ''.join(alevel).strip()
            if alevel_str == "Overall:" or alevel_str == "Overall":
                alevel = response.xpath(
                    "//h3[contains(text(),'A-level')]/following-sibling::*[position()<4]//text()"
                ).extract()
                alevel_str = ''.join(alevel).replace(
                    "Overall", "").strip().strip(":").strip()
                # print("***alevel")
            item['alevel'] = clear_space_str(alevel_str)
            # print("item['alevel'] = ", item['alevel'])

            ib = response.xpath(
                "//h3[contains(text(),'International Baccalaureate')]/following-sibling::*[1]//text()"
            ).extract()
            ib_str = ''.join(ib).strip()
            if ib_str == "Overall:":
                ib = response.xpath(
                    "//h3[contains(text(),'International Baccalaureate')]/following-sibling::*[2]//text()"
                ).extract()
                ib_str = ''.join(ib).strip()
                # print("***ib")
            item['ib'] = ib_str
            # print("item['ib'] = ", item['ib'])

            ielts_str = response.xpath(
                "//div[@id='entry-collapse']//h2[contains(text(),'English')]/following-sibling::p[position()<4]//text()"
            ).extract()
            ielts_re = re.findall(r"^IELTS.{1,80}", ''.join(ielts_str))
            item['ielts_desc'] = ''.join(ielts_re).strip()
            # print("item['ielts_desc'] = ", item['ielts_desc'])

            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['IELTS'] = %sitem['IELTS_L'] = %sitem['IELTS_S'] = %sitem['IELTS_R'] = %sitem['IELTS_W'] = %s==" % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            application_open_date = response.xpath(
                "//div[@class='p-3 p-xl-4 text-center text-light']//text()"
            ).extract()
            clear_space(application_open_date)
            # print("application_open_date: ", ''.join(application_open_date))
            item['application_open_date'] = getStartDate(
                ''.join(application_open_date))
            # if item['application_open_date'] == "":
            #     print("***application_open_date")
            # print("item['application_open_date'] = ", item['application_open_date'])

            tuition_fee = response.xpath(
                "//div[@id='fees']//tbody//tr[1]/td[last()-1]//text()"
            ).extract()
            # print("tuition_fee: ", tuition_fee)
            if len(tuition_fee) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))

            if item['tuition_fee'] == 0:
                item['tuition_fee'] = None
            else:
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee'] = ", item['tuition_fee'])
            print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])

            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """ <h2>Process</h2>
<ol><li>Choose the programmes you want to study. Still undecided? Search our <a href="/undergraduate">undergraduate degrees</a></li>
<li>Find out <a href="/apply/undergraduate/how-to-apply-through-ucas">how to apply through UCAS</a></li>
<li>Wait for universities to make their decisions, <a href="/apply/undergraduate/after-you-apply">learn what happens after you apply</a></li>
<li>Reply to your <a href="/apply/undergraduate/your-offer">university offers</a></li>
<li><a href="/apply/undergraduate/your-offer">Confirm your university place</a></li>
</ol>"""
                ]))
            # print("item['apply_proces_en'] = ", item['apply_proces_en'])

            # https://www.surrey.ac.uk/china/entry-requirements
            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<h2>Undergraduate</h2>
<p>We do not accept the Chinese National University Entrance Examination. However, you can apply to study for an <a href="http://isc.surrey.ac.uk/programmes/international-foundation-year?ch=uniweb&amp;cc=uniweb&amp;cid=uniweb&amp;utm_source=signposting&amp;utm_medium=signposting&amp;utm_campaign=uniweb&amp;_ga=2.246594701.825790074.1509959240-87246970.1500115796">International Foundation Year</a> at our <a href="http://isc.surrey.ac.uk/">International Study Centre</a>, which will prepare you for a full undergraduate degree course.</p>"""
                ]))

            # 专业、学位类型
            programme_en = response.xpath(
                "//h1[@class='text-center my-0']//text()").extract()
            programme_en_str = (''.join(programme_en).split("–"))[0].strip()
            print(programme_en_str)

            if "2019" in ''.join(programme_en):
                item['start_date'] = '2019'
            # print("item['start_date'] = ", item['start_date'])

            # 判断可以拆分几条数据，ucascode、duration、degree_name
            is_degree_name = response.xpath(
                "//tbody[@class='w-100']/tr").extract()
            # print("is_degree_name: ", is_degree_name)
            print(len(is_degree_name))
            for i in range(len(is_degree_name)):
                print("****************" + str(i + 1) + "***************")
                degree_name_re = re.findall(r"\w+\s\(Hons\).*|\w+$",
                                            programme_en_str)
                if len(degree_name_re) > 0:
                    item['degree_name'] = ''.join(degree_name_re).strip()
                    item['programme_en'] = programme_en_str.replace(
                        item['degree_name'], '').strip()
                else:
                    item['programme_en'] = programme_en_str
                print("item['programme_en'] = ", item['programme_en'])

                degree_name_xpath = response.xpath(
                    "//tbody[@class='w-100']//tr[" + str(i + 1) +
                    "]/td[1]//text()").extract()
                clear_space(degree_name_xpath)
                item['degree_name'] = ''.join(degree_name_xpath).strip()
                print("item['degree_name'] = ", item['degree_name'])

                duration = response.xpath("//tbody[@class='w-100']//tr[" +
                                          str(i + 1) +
                                          "]/td[2]//text()").extract()
                clear_space(duration)
                # print("duration: ", duration)
                if len(duration) != 0:
                    duration_list = getIntDuration(''.join(duration))
                    # print("duration_list: ", duration_list)
                    if len(duration_list) == 2:
                        item['duration'] = duration_list[0]
                        item['duration_per'] = duration_list[-1]
                print("item['duration'] = ", item['duration'])
                print("item['duration_per'] = ", item['duration_per'])

                ucascode = response.xpath("//tbody[@class='w-100']//tr[" +
                                          str(i + 1) +
                                          "]/td[4]//text()").extract()
                clear_space(ucascode)
                item['ucascode'] = ''.join(ucascode).strip()
                print("item['ucascode']: ", item['ucascode'])

                tick = response.xpath(
                    "//tbody[@class='w-100']//tr[" + str(i + 1) +
                    "]/td[3]//i[@class='icon icon-tick']").extract()
                clear_space(tick)
                print("tick: ", tick)
                print(len(tick))
                if len(tick) == 1:
                    item['other'] = 'Professional Training'
                print("item['other']: ", item['other'])
                yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a+',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 5

0

Exibir arquivo

Arquivo: KingCollegeLondon_U.py Projeto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.kcl.ac.uk/"
        item['university'] = "King's College London"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item['location'] = "Strand, London. WC2R 2LS, United Kingdom"
        print("===============================")
        print(response.url)
        try:
            # //div[@id='container']/div[@class='hero clearfix']/div[@class='wrapper']/div[@class='inner']/h1
            # 专业、学位类型
            programmeDegree = response.xpath(
                "//div[@id='container']/div[@class='hero clearfix']/div[@class='wrapper']/div[@class='inner']/h1//text()"
            ).extract()
            clear_space(programmeDegree)
            programmeDegreeStr = ''.join(programmeDegree)
            print(programmeDegreeStr)
            degree_type = re.findall(
                r"(\s\w+)$|(\s\w+\s\(.*\))$|(\s\w+/\w+)$|(\s\w+/\w+/\w+)$",
                programmeDegreeStr)
            if len(degree_type) > 0:
                degree_type = list(degree_type[0])
            while '' in degree_type:
                degree_type.remove('')
            print("degree_type = ", degree_type)
            item['degree_name'] = ''.join(degree_type).strip()
            programme = programmeDegreeStr.replace(item['degree_name'],
                                                   '').strip()
            item['programme_en'] = programme
            print("item['degree_name'] = ", item['degree_name'])
            print("item['programme_en'] = ", item['programme_en'])

            ucascode = response.xpath(
                "//strong[contains(text(),'UCAS code')]/following-sibling::*//text()"
            ).extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).strip()
            if "," in item['ucascode']:
                item['ucascode'] = item['ucascode'].split(',')[0].strip()
            print("item['ucascode']: ", item['ucascode'])

            # //div[@id='tabs-key-info']/div[@class='tab tab-1 active-tab']/p[2]/span
            duration = response.xpath(
                "//strong[contains(text(),'Duration')]/following-sibling::*//text()"
            ).extract()
            durationStr = ''.join(duration)
            print(durationStr)
            # duration_re = re.findall(r"([a-zA-Z0-9]+\s)(year|month|week){1}", durationStr, re.I)
            duration_re = re.findall(
                r"([a-zA-Z0-9\.]+\s)(year|month|week|yr|yft){1}|([0-9\.]+)(yr|yft|\-month){1}",
                durationStr, re.I)
            # print(duration_re)
            d_dict = {
                "One": "1",
                "Two": "2",
                "Three": "3",
                "Four": "4",
                "Five": "5",
                "Six": "6",
                "Seven": "7",
                "Eight": "8",
                "Nine": "9",
                "Ten": "10",
                "one": "1",
                "two": "2",
                "three": "3",
                "four": "4",
                "five": "5",
                "six": "6",
                "seven": "7",
                "eight": "8",
                "nine": "9",
                "ten": "10",
            }
            if len(duration_re) > 0:
                d_int = re.findall(r"\d+", ''.join(duration_re[0]))
                if len(d_int) > 0:
                    item['duration'] = int(''.join(d_int))
                else:
                    d = re.findall(
                        r"(One)|(Two)|(Three)|(Four)|(Five)|(Six)|(Seven)|(Eight)|(Nine)|(Ten)|(one)|(two)|(three)|(four)|(five)|(six)|(seven)|(eight)|(nine)|(ten)",
                        ', '.join(duration_re[0]))
                    # print("d = ", d)
                    if len(d) > 0:
                        item['duration'] = int(
                            d_dict.get(''.join(d[0]).strip()))
                if "y" in ''.join(duration_re[0]) or "Y" in ''.join(
                        duration_re[0]):
                    item['duration_per'] = 1
                elif "m" in ''.join(duration_re[0]) or "M" in ''.join(
                        duration_re[0]):
                    item['duration_per'] = 3
                elif "w" in ''.join(duration_re[0]) or "W" in ''.join(
                        duration_re[0]):
                    item['duration_per'] = 4
            print("item['duration'] = ", item['duration'])
            print("item['duration_per'] = ", item['duration_per'])

            # //div[@id='tabs-key-info']/div[@class='tab tab-2']
            includeDepartment = response.xpath(
                "//div[@class='tab tab-2']//p[contains(text(), 'Faculty')]/span//text()"
            ).extract()
            if len(includeDepartment) == 0:
                includeDepartment = response.xpath(
                    "//p[contains(text(), 'Department')]/span//text()"
                ).extract()
            clear_space(includeDepartment)
            item['department'] = ''.join(includeDepartment).strip()
            print("item['department'] = ", item['department'])

            # //div[@id='coursepage-overview']/div[@class='wrapper clearfix']/div[@class='inner left lop-to-truncate']
            overview = response.xpath(
                "//div[@id='coursepage-overview']/div[@class='wrapper clearfix']/div[1]"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            print("item['overview_en'] = ", item['overview_en'])

            # //div[@id='coursepage-course-detail']/div[@class='wrapper clearfix']/div
            # modules = response.xpath("//h3[contains(text(),'Course format and assessment')]/preceding-sibling::*").extract()
            modules = response.xpath(
                "//div[@id='coursepage-course-detail']/div[@class='wrapper clearfix']/div[@class='inner right lop-to-measure']"
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            print("item['modules_en'] = ", item['modules_en'])

            assessment_en = response.xpath(
                "//h3[contains(text(),'Course format and assessment')]/preceding-sibling::*[1]/following-sibling::*|"
                "//h3[contains(text(),'Course Structure & Assessment')]/preceding-sibling::*[1]/following-sibling::*[position()<last()-1]|"
                "//h3[contains(text(),'Teaching style')]/preceding-sibling::*[1]/following-sibling::*[position()<last()-1]|"
                "//*[contains(text(),'Teaching')]/preceding-sibling::*[1]/following-sibling::*[position()<last()-3]|"
                "//*[contains(text(),'TEACHING')]/../preceding-sibling::*[1]/following-sibling::*[position()<last()-3]|"
                "//*[contains(text(),'Teaching')]/../preceding-sibling::*[1]/following-sibling::*[position()<last()-3]|//strong[contains(text(),'Teaching')]/../preceding-sibling::*[1]/following-sibling::*|"
                "//b[contains(text(),'Teaching')]/../preceding-sibling::*[1]/following-sibling::*"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            print("item['assessment_en'] = ", item['assessment_en'])

            alevel = response.xpath(
                "//div[@class='requirements EntryReqs_UKALevel clearfix']//b[contains(text(),'Required grades')]/../following-sibling::p[1]//text()"
            ).extract()
            if len(alevel) == 0:
                alevel = response.xpath(
                    "//strong[contains(text(),'A-Level')]/../following-sibling::td[1]//text()"
                ).extract()
                if len(alevel) == 0:
                    alevel = response.xpath(
                        "//div[@class='requirements EntryReqs_UKALevel clearfix']//div[@class='required-grades']//text()//text()"
                    ).extract()
            clear_space(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel).strip()[:160]
            print("item['alevel'] = ", item['alevel'])

            ib = response.xpath(
                # "//div[@class='requirements EntryReqs_UKIB clearfix']//b[contains(text(),'Required grades')]/../following-sibling::p[1]//text()|"
                "//div[@class='requirements EntryReqs_UKIB clearfix']//div[@class='required-grades']//text()"
            ).extract()
            if len(ib) == 0:
                ib = response.xpath(
                    "//div[@class='requirements EntryReqs_UKIB clearfix']//b[contains(text(),'Required grades')]/../../text()"
                ).extract()
                if len(ib) == 0:
                    ib = response.xpath(
                        "//strong[contains(text(),'International Baccalaureate')]/../following-sibling::td[1]//text()"
                    ).extract()
            clear_space(ib)
            if len(ib) > 0:
                item['ib'] = ''.join(ib).strip()[:160]
            print("item['ib'] = ", item['ib'])

            # //div[@id='coursepage-entry-requirements']/div[@class='wrapper clearfix']/div[@class='inner left lop-to-truncate lopped-off expanded']
            entry_requirements = response.xpath(
                "//div[@id='coursepage-entry-requirements']/div[@class='wrapper clearfix']/div[1]//text()"
            ).extract()
            # item['rntry_requirements'] =clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements'] = ", item['rntry_requirements'])

            item['require_chinese_en'] = '''<h4><b>Undergraduate entry</b></h4>
<p>The Senior High School Certificate and/or <i>Hui&nbsp;</i><i>Kao</i>&nbsp;are not considered suitable for direct entry to undergraduate study at King's. Applicants may wish to consider taking one of our International Foundation programmes (see below).</p>'''

            # //div[@id='coursepage-entry-requirements']/div[@class='wrapper clearfix']/div[@class='inner left lop-to-truncate lopped-off expanded']
            IELTS = response.xpath(
                "//*[contains(text(),'English')]/../../following-sibling::td[1]//text()|"
                "//*[contains(text(),'English')]/../following-sibling::td[1]//text()|"
                "//strong[contains(text(),'TOEFL iBT')]/../../following-sibling::td[1]//text()|"
                "//*[contains(text(),'English')]//../following-sibling::td[1]//text()"
            ).extract()
            clear_space(IELTS)
            # print(IELTS)
            item['ielts_desc'] = ''.join(IELTS).strip()
            item['toefl_desc'] = item['ielts_desc']
            print("item['ielts_desc'] = ", item['ielts_desc'])

            if item['ielts_desc'] == "Band A":
                item["ielts"] = 7.5  # float
                item["ielts_l"] = 7.0  # float
                item["ielts_s"] = 7.0  # float
                item["ielts_r"] = 7.0  # float
                item["ielts_w"] = 7.0
                item["toefl"] = 109  # float
                item["toefl_l"] = 25  # float
                item["toefl_s"] = 25  # float
                item["toefl_r"] = 25  # float
                item["toefl_w"] = 27
            elif item['ielts_desc'] == "Band B" or item[
                    'department'] == "The Dickson Poon School of Law" or item[
                        'department'] == "Dental Institute" or "Medicine" in item[
                            'programme_en']:
                item["ielts"] = 7.0  # float
                item["ielts_l"] = 6.5  # float
                item["ielts_s"] = 6.5  # float
                item["ielts_r"] = 6.5  # float
                item["ielts_w"] = 6.5
                item["toefl"] = 100  # float
                item["toefl_l"] = 23  # float
                item["toefl_s"] = 23  # float
                item["toefl_r"] = 23  # float
                item["toefl_w"] = 25
            elif item['ielts_desc'] == "Band D" or "Biochemistry" in item[
                    'programme_en']:
                item["ielts"] = 6.5  # float
                item["ielts_l"] = 6.0  # float
                item["ielts_s"] = 6.0  # float
                item["ielts_r"] = 6.0  # float
                item["ielts_w"] = 6.0
                item["toefl"] = 92  # float
                item["toefl_l"] = 20  # float
                item["toefl_s"] = 20  # float
                item["toefl_r"] = 20  # float
                item["toefl_w"] = 23
            elif item['ielts_desc'] == "Band E":
                item["ielts"] = 6.0  # float
                item["ielts_l"] = 5.5  # float
                item["ielts_s"] = 5.5  # float
                item["ielts_r"] = 5.5  # float
                item["ielts_w"] = 5.5
                item["toefl"] = 80  # float
                item["toefl_l"] = 20  # float
                item["toefl_s"] = 20  # float
                item["toefl_r"] = 20  # float
                item["toefl_w"] = 20

            if item['ielts_desc'] == "":
                ielts_desc = response.xpath(
                    "//strong[contains(text(),'IELTS Academic')]/../../../following-sibling::td[1]//text()"
                ).extract()
                item['ielts_desc'] = ''.join(ielts_desc).strip()
            if item['toefl_desc'] == "":
                toefl_desc = response.xpath(
                    "//strong[contains(text(),'TOEFL iBT')]/../../following-sibling::td[1]//text()"
                ).extract()
                item['toefl_desc'] = ''.join(toefl_desc).strip()

            if item['ielts'] == None:
                ielts_dict = get_ielts(item['ielts_desc'])
                item["ielts"] = ielts_dict.get('IELTS')  # float
                item["ielts_l"] = ielts_dict.get('IELTS_L')  # float
                item["ielts_s"] = ielts_dict.get('IELTS_S')  # float
                item["ielts_r"] = ielts_dict.get('IELTS_R')  # float
                item["ielts_w"] = ielts_dict.get('IELTS_W')
            if item['toefl'] == None:
                toefl_dict = get_ielts(item['toefl_desc'])
                item["toefl"] = toefl_dict.get('TOEFL')  # float
                item["toefl_l"] = toefl_dict.get('TOEFL_L')  # float
                item["toefl_s"] = toefl_dict.get('TOEFL_S')  # float
                item["toefl_r"] = toefl_dict.get('TOEFL_R')  # float
                item["toefl_w"] = toefl_dict.get('TOEFL_W')
                # //div[@id='coursepage-entry-requirements']/div[@class='wrapper clearfix']/div[@class='inner left lop-to-truncate lopped-off expanded']/div[@class='requirements uk clearfix']/div[@class='copy'][2]/p[1]
            application_fee = response.xpath(
                "//h3[contains(text(), 'Application procedure')]/following-sibling::div[1]//text()"
            ).extract()
            clear_space(application_fee)
            # print(''.join(application_fee))
            application_fee_re = re.findall(r"application\sfee.*£\d+",
                                            ''.join(application_fee))
            print("apply_fee: ", ''.join(application_fee_re))
            af = ''.join(application_fee_re).replace("application fee of",
                                                     "").replace("£",
                                                                 "").strip()
            if len(af) != 0:
                item['apply_fee'] = int(af)
                item['apply_pre'] = "£"
            print("item['apply_fee'] = ", item['apply_fee'])

            # //div[@id='coursepage-entry-requirements']/div[@class='wrapper clearfix']/div[@class='inner left lop-to-truncate lopped-off expanded']/div[@class='requirements uk clearfix']/div[@class='copy'][2]/p[1]
            application_documents = response.xpath(
                "//h3[contains(text(), 'Personal statement and supporting information')]/following-sibling::div[1]"
            ).extract()
            item['apply_documents_en'] = remove_class(
                clear_lianxu_space(application_documents))
            print("item['apply_documents_en'] = ", item['apply_documents_en'])

            # //div[@id='coursepage-entry-requirements']/div[@class='wrapper clearfix']/div[@class='inner left lop-to-truncate lopped-off expanded']/div[@class='requirements uk clearfix']/div[@class='copy'][2]/p[1]
            deadline = response.xpath(
                "//div[@id='coursepage-entry-requirements']/div[@class='wrapper clearfix']/div[1]/div[@class='requirements uk clearfix']/div[@class='copy'][4]//text()"
            ).extract()
            clear_space(deadline)
            print(deadline)
            deadline_str = ''.join(deadline).strip()
            item['deadline'] = getStartDate(deadline_str)
            print("item['deadline'] = ", item['deadline'])

            # //div[@id='coursepage-fees-and-funding']/div[@class='wrapper clearfix']/div[@class='inner left lop-to-truncate lopped-off']/ul[1]/li[2]
            tuition_fee = response.xpath(
                "//p[contains(text(),'The International tuition fee for the 2018-2019 ac')]//text()"
            ).extract()
            print("tuition_fee = ", tuition_fee)
            tuition_fee_re = re.findall(r"£\d+,\d+|£\d+|\d+,\d+",
                                        ''.join(tuition_fee))
            # print(tuition_fee_re)
            if len(tuition_fee_re) >= 1:
                item['tuition_fee_pre'] = "£"
                item['tuition_fee'] = int(tuition_fee_re[0].replace(
                    "£", "").replace(",", "").strip())
            print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])
            print("item['tuition_fee'] = ", item['tuition_fee'])

            # //div[@id='coursepage-career-prospect']/div[@class='wrapper clearfix']/div[@class='inner left lop-to-truncate']
            career = response.xpath(
                "//div[@id='coursepage-career-prospect']").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            print("item['career_en'] = ", item['career_en'])

            # //b[contains(text(),'The interview')]/..|//b[contains(text(),'The interview')]/../following-sibling::*[position()<3]
            interview_desc_en = response.xpath(
                "//b[contains(text(),'The interview')]/..|"
                "//b[contains(text(),'The interview')]/../following-sibling::*[position()<3]|"
                "//b[contains(text(),'Interviewing')]/../following-sibling::*[1]"
            ).extract()
            item['interview_desc_en'] = remove_class(
                clear_lianxu_space(interview_desc_en))
            print("item['interview_desc_en'] = ", item['interview_desc_en'])

            # //b[contains(text(),'Application deadline:')]/..
            deadline = response.xpath(
                "//b[contains(text(),'Application deadline:')]/../text()"
            ).extract()
            item['deadline'] = remove_class(clear_lianxu_space(deadline))
            print("item['deadline'] = ", item['deadline'])

            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """ <h1>Applying to King&#39;s College London</h1>
<img alt="King's College London lecture at the Strand campus" height="330" width="780" src="/ImportedImages/0Prospectus/undergraduate/apply/generic-page-images/lecturer-2014-strand-6.29.jpg" />
<p><br />We're delighted that you're considering applying to King's. Once you've checked the information and&nbsp;<a class="sys_16" href="https://www.kcl.ac.uk/study/undergraduate/apply/entry-requirements/index.aspx">entry requirements</a>&nbsp;for your chosen course, you will need to follow the correct application procedure, depending on the type of study you're interested in:</p>
<div class="contentpage-accordion clearfix">
<h3 class="accordion-toggle">Undergraduate degree courses (UCAS)</h3>
<div class="accordion-content">
<p class="p1">For all full-time undergraduate higher education courses at universities and colleges in the UK you must make an online application via the Universities and Colleges Admissions Service - more commonly known as&nbsp; <a class="sys_16" href="http://www.ucas.com/">UCAS</a>.<br /><br /></p>
<h4 class="p1"><b>UCAS has three key functions:</b></h4>
<ol>
<li class="p4"><a class="sys_16" href="https://digital.ucas.com/search">Course search</a>:&nbsp;allows you to search for courses throughout the UK. Remember to always check King's&nbsp;online prospectus&nbsp;for the most detailed information on King's courses.</li>
<li class="p4"><a class="sys_16" href="https://www.ucas.com/ucas/undergraduate/register">Apply</a>:&nbsp;the UCAS online application system. You should use this to make your application(s) to King's. 'Apply' will allow you to apply to several different universities and/or courses at once.</li>
<li class="p4"><a class="sys_16" href="https://www.ucas.com/ucas/undergraduate/login">Track</a>:&nbsp;a central tracking system for following the progress of your different applications. King's will also provide you with an account for our own supplementary tracking and messaging system (called King's Apply).</li>
</ol>
<p class="p1">Please read the following guidelines before making your application.</p>
<h4 class="p1"><b><br />Who should use UCAS?</b></h4>
<ul>
<li class="p2">All applicants for full-time undergraduate courses at King's should apply through UCAS (with the exception of applicants from&nbsp; <a class="sys_16" href="http://www.kcl.ac.uk/usa">North America</a>&nbsp;who may use Common App if preferred).</li>
<li class="p2">All applicants for Nursing with registration (graduate entry) PG Dip</li>
<li class="p2">All applicants for Midwifery with registration (graduate entry) PG Dip</li>
</ul>
<p>You can apply through your school or college, or as an individual.</p>
<h4 class="p1"><b><br />When should I apply?</b></h4>
<p class="p2">You can apply to UCAS from 1 September for entry the following autumn, but remember you can start&nbsp; <a title="Undergraduate study" class="sys_0 sys_t0" href="/study/undergraduate/index.aspx">doing your research</a>, attending&nbsp;open days, and preparing your personal statement earlier than this.</p>
<p class="p2">The normal closing date for receipt of applications is&nbsp;15&nbsp;January.&nbsp;However if you are including Oxford or Cambridge, or to Medicine or Dentistry, then the closing date is&nbsp;15 October&nbsp;in the year prior to entry.&nbsp;</p>
<p class="p2">The UCAS website states a more flexible deadline for international students, however, any application received by King's after the above dates is considered late.</p>
<h4 class="p1"><b><br />How do I use UCAS?</b></h4>
<p class="p2">UCAS allows you to apply to a maximum of five courses per year, but only four of those may be Medicine/Dentistry courses.</p>
<p class="p2">You will need to create an account in UCAS 'Apply' and complete an application form. Your application will then be forwarded by UCAS to all of the universities you have applied to for us to consider.</p>
<p class="p2">If you have participated in a King's widening participation scheme such as K+, please ensure you note this in your application as advised by the&nbsp;<a title="Widening Participation" class="sys_0 sys_t0" href="/study/widening-participation/index.aspx">Widening Participation team</a>.</p>
<p class="p2">UCAS has detailed instructions on the&nbsp;<a class="sys_16" href="http://www.ucas.com/how-it-all-works/undergraduate">UCAS website</a>.</p>
<p class="p2">You can also read our&nbsp;<a title="Before you apply" class="sys_0 sys_t7240628" href="/study/undergraduate/apply/faqs/index.aspx">frequently asked questions about applying</a>.<a title="UCAS website" class="sys_16" href="https://www.ucas.com/"><span class="kcl_BigRedButton">Apply now</span></a></p>

</div>



<h3 class="accordion-toggle">Undergraduate degree courses (Common App)</h3>
<div class="accordion-content">

<p>King's College Lon</p>
<h4>UCAS or Common App?</h4>
<p>We will only consider applicants through The Common Application who have not also applied through UCAS.&nbsp;</p>
<h4>Who should use Common App?</h4>
<ul>
<li>Common App is an option to be used by students who will be classified as paying international fees. Therefore Home/EU fee students must use UCAS. Those who are unsure should use UCAS.</li>
</ul>
<ol>
<li>Common App is available for all programmes excluding Physiotherapy, Nursing, Medicine, Dentistry, and Nutrition and Dietetics. Students must use UCAS to apply to these programmes.</li>
<li>Students must apply to no more than a combined total of five courses (UCAS and Common App) within the UK.</li>
</ol>
<p>All Common App applicants will be expected to complete the supplement element of the Common Application. We strongly encourage students to submit their application by January 15, but the College may consider later applications up to May 1. Once a Common Application is submitted to King's, students will be registered on the College's MyApplication system through which they will be able to track the progress of their application.</p>
<h4>King&rsquo;s College London Common Application timeline</h4>
<ol>
<li>Applicant applies to King&rsquo;s using the&nbsp;<a class="sys_16" href="http://www.commonapp.org/">Common Application form</a>.</li>
<li>The application is transferred onto the KCL Admissions Portal by King&rsquo;s admissions staff. This process may take approximately 30 days, depending upon when you submitted your application.</li>
<li>Once the application has been successfully inputted into the system, the applicant receives login details for the &lsquo;<a class="sys_16" href="https://myapplication.kcl.ac.uk/">MyApplication</a>&rsquo; admissions portal. This is used to track the progress of an application and communicate with admissions staff</li>
<li>Applicants will be contacted through&nbsp;<a class="sys_16" href="https://myapplication.kcl.ac.uk/">myApplication</a>&nbsp;in the event that any further supporting documents are required before the application can be fully processed.</li>
<li>Decisions on completed applications submitted by January 15 will be made before March 31st. All notifications of decisions will be sent through&nbsp;<a class="sys_16" href="https://myapplication.kcl.ac.uk/">myApplication</a>.</li>
</ol>
<p>&nbsp;</p>

</div>



<h3 class="accordion-toggle">Post Qualification Nursing BSc Programmes &amp; Free Standing Courses</h3>
<div class="accordion-content">

<p>Applications for our post qualification nursing BSc programmes should be made direct to King&rsquo;s, through the King&rsquo;s Apply portal. <a title="Apply now" class="sys_16" onclick="void(window.open('https://apply.kcl.ac.uk/','','toolbar=yes,menubar=yes,location=yes,scrollbars=yes,status=yes,resizable=yes'));return false;" onkeypress="void(window.open('https://apply.kcl.ac.uk/','','toolbar=yes,menubar=yes,location=yes,scrollbars=yes,status=yes,resizable=yes'));return false;" href="https://apply.kcl.ac.uk/"><span class="kcl_BigRedButton">Apply now</span></a></p>

</div>



<h3 class="accordion-toggle">Intercalated BScs (iBScs)</h3>
<div class="accordion-content">

<p>Intercalated BScs (iBScs) programmes are for medical, dental and veterinary students. If you are from another university and you are interested in studying an iBSc at King&rsquo;s, then please see our detailed information on <a title="Intercalated BScs: How to apply" class="sys_0 sys_t7240628" href="/study/subject-areas/intercalated/how-to-apply.aspx">how to apply</a>. If you are a current King&rsquo;s student, please refer to application information on the <a class="sys_16" href="https://internal.kcl.ac.uk/lsm/students/ug/intercalated-bsc/how-to-apply.aspx">King's internal website</a>.<a title="King&#39;s Apply" class="sys_16" href="https://apply.kcl.ac.uk/"><span class="kcl_BigRedButton">Apply now</span></a></p>

</div>



<h3 class="accordion-toggle">Transfers to undergraduate degree courses</h3>
<div class="accordion-content">

<p>Some of our academic departments may consider transfer applications from suitably qualified students currently attending other universities. Visit the&nbsp;<a class="sys_0 sys_t2452" href="https://www.kcl.ac.uk/study/undergraduate/apply/transferring-to-kings.aspx">transferring to King&rsquo;s web page</a>&nbsp;for more information. Transfer applications must be submitted through UCAS</p>
<p><a title="UCAS" class="sys_16" href="https://www.ucas.com/"><span class="kcl_BigRedButton">Apply now</span></a></p>

</div>



<h3 class="accordion-toggle">King's International Foundation Programme</h3>
<div class="accordion-content">

<p>Applications for our one year full-time International Foundation academic preparation course should be made direct to King's, through the <a class="sys_16" href="https://apply.kcl.ac.uk/">King&rsquo;s Apply portal</a>. We have detailed guidance on the supporting documentation needed for your application on the relevant International Foundation Programme course web page.<a title="King&#39;s Apply" class="sys_16" href="https://apply.kcl.ac.uk/"><span class="kcl_BigRedButton">Apply now</span></a></p>

</div>



<h3 class="accordion-toggle">Study Abroad</h3>
<div class="accordion-content">

<p>King's welcomes students currently enrolled in universities outside the UK to <a title="King&#39;s Apply" class="sys_16" href="https://apply.kcl.ac.uk/">participate</a> on a study abroad programme.&nbsp;</p>
<p>You can study abroad at King's either as an exchange or a study abroad fee-paying student for the full academic year (starting in September) or for one semester only (September-December or January-June).&nbsp;</p>
<p>Visit our&nbsp;<a class="sys_16" title="Study abroad" href="/study/abroad/index.aspx">Study Abroad web pages</a>&nbsp;to find out more.<a title="King&#39;s Apply" class="sys_16" href="https://apply.kcl.ac.uk/"><span class="kcl_BigRedButton">Apply now</span></a></p>

</div>


</div>

<h3><br />After you&rsquo;ve applied</h3>
<p>Your application can be tracked using our&nbsp;<a class="sys_16" title="King&#39;s application portal" onclick="void(window.open('https://myapplication.kcl.ac.uk/',''));return false;" onkeypress="void(window.open('https://myapplication.kcl.ac.uk/',''));return false;" href="https://myapplication.kcl.ac.uk/">online portal, King's Apply</a>, where you can:</p>
<ul>
<li>
<p>see offer details</p>
</li>
<li>
<p>check if you&rsquo;ve been invited to interview</p>
</li>
<li>
<p>apply for accommodation</p>
</li>
<li>
<p>learn more about the &lsquo;points-based&rsquo; visa system</p>
</li>
</ul>
<p>After you have applied to King's, we will send you a username and password so you can access these pages. To contact us about your application during the application year, please use our <a title="King&#39;s application portal" class="sys_16" onclick="void(window.open('https://myapplication.kcl.ac.uk/',''));return false;" onkeypress="void(window.open('https://myapplication.kcl.ac.uk/',''));return false;" href="https://myapplication.kcl.ac.uk/">online portal, King's Apply</a>&nbsp;.</p>
<p><a class="sys_0 sys_t2452" title="Tracking your application" href="https://www.kcl.ac.uk/study/undergraduate/apply/faqs/tracking-your-application.aspx">Read our FAQs on tracking your application</a></p>


<div class="contentpage-accordion clearfix">
<br />


<h3 class="accordion-toggle">Cancellation rights</h3>
<div class="accordion-content">

<p class="MRNoHead2">Please note, these terms and conditions apply to all levels of study.&nbsp; For applications to undergraduate study, we also advise applicants to contact UCAS directly for details of your cancellation rights.</p>
<p class="MRNoHead2">1.1&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; You have the right to cancel your acceptance of a place at King&rsquo;s for any reason (including if you change your mind) during a fourteen (14) day cancellation period (the &ldquo;Cancellation Period&rdquo;), which will start on the day you accept an offer from King&rsquo;s.</p>
<p class="MRNoHead2">1.2 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; To cancel your acceptance, you must clearly inform us in writing of your decision to cancel before the Cancellation Period has expired. We ask that you do this by sending a message through &ldquo;King&rsquo;s Apply&rdquo;. Alternatively, you may contact the King&rsquo;s Admissions Office by letter or email. You may also use the <a title="Cancellation Form - Kings College London" class="sys_17" href="/study/assets/word/admissions/v.2-cancellation-form.docx">Cancellation Form </a>to notify us of your decision to cancel.</p>
<p class="MRNoHead2">1.3 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; To meet the cancellation deadline, it is sufficient for you to send your communication concerning your exercise of the right to cancel before the Cancellation Period has expired. We do not have to have received it before the expiry of the Cancellation Period.</p>
<p class="MRNoHead2">1.4&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; If you cancel your acceptance within the 14 day Cancellation Period, we will reimburse any tuition fee payment including any deposit received from you as soon as we can, and no later than 14 days after the day on which you informed us of your decision to cancel your acceptance.</p>

</div>


</div>

"""
                ]))
            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 6

0

Exibir arquivo

    def parse_data(self, response):
        """Scrape one Loughborough University undergraduate course page.

        Creates an item through ``get_item(ScrapyschoolEnglandBenItem)``,
        fills it with values read from ``response`` (degree name, programme,
        department, start date, tuition fee, location, the HTML of several
        page sections, entry requirements, IELTS scores, duration and UCAS
        code) and yields it.

        Args:
            response: The downloaded course page. ``response.meta`` is looked
                up by programme name to obtain the department.

        Yields:
            The populated item. It is yielded once when the "Study options"
            label supplies the duration text. Otherwise it is yielded twice:
            first with the duration of the ``default`` ``<option>`` and the
            first UCAS code, then with the duration of the ``variant``
            ``<option>`` and the last UCAS code.

        Any exception raised while scraping is caught, appended to a
        per-university text file under ``scrapySchool_England_Ben/error/``
        and printed; it is not re-raised, so a failing page yields nothing
        further.
        """
        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "Loughborough University"
        item['url'] = response.url
        # Degree type
        item['degree_type'] = 1
        print("===========================")
        print(response.url)

        # Programmes that require the higher IELTS tier (7.0 overall, 6.5 in
        # every component).
        higher_ielts_programmes = frozenset((
            "Information Management and Business",
            "Accounting and Financial Management",
            "Management Sciences",
            "Retailing, Marketing and Management",
            "International Business",
            "Finance and Management",
            "Economics",
            "Business Economics and Finance",
            "International Economics",
            "Economics with Geography",
            "Economics with Politics",
            "Economics with Accounting",
            "Economics and Management",
        ))

        def section_html(xpath):
            # Extract the nodes matched by `xpath` and clean them with the
            # same helper pipeline for every HTML section field.
            return remove_class(clear_lianxu_space(response.xpath(xpath).extract()))

        def fill_duration_and_ucascode(duration_texts, ucas_index, ucas_label):
            # Parse the duration text into item['duration'] /
            # item['duration_per'], then store the UCAS code found at
            # `ucas_index` among the codes listed on the page.
            duration_list = getIntDuration(''.join(duration_texts))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            print("item['duration'] = ", item['duration'])
            print("item['duration_per'] = ", item['duration_per'])

            ucascode = response.xpath(
                "//span[@class='list__text'][contains(text(),'UCAS code')]/../following-sibling::dd//text()").extract()
            clear_space(ucascode)
            print("ucascode: ", ucascode)
            if ucascode:
                item['ucascode'] = ucascode[ucas_index].strip()
            print(ucas_label, item['ucascode'])

        try:
            # Degree name
            degree_name = response.xpath(
                "//span[@class='course-info__qualification course-info__qualification--default active']//text()").extract()
            item['degree_name'] = ''.join(degree_name).replace(', PG certificate', '').strip()
            print("item['degree_name']: ", item['degree_name'])

            # Programme name
            programme_en = response.xpath(
                "//h1[@class='course-info__heading']/text()").extract()
            clear_space(programme_en)
            item['programme_en'] = ''.join(programme_en).strip()
            print("item['programme_en']: ", item['programme_en'])

            # Department: taken from the request meta, keyed by programme
            # name, with one hard-coded override.
            item['department'] = response.meta.get(item['programme_en'])
            if item['programme_en'] == "Finance and Management":
                item['department'] = "Business and Economics"
            print("item['department']: ", item['department'])

            # NOTE(review): item['teach_time'] is not populated here; an
            # earlier 'Full-time:' lookup for it was disabled.

            start_date = response.xpath(
                "//span[@class='list__text'][contains(text(),'Start date')]/../following-sibling::dd[1]//text()").extract()
            clear_space(start_date)
            if start_date:
                item['start_date'] = getStartDate(start_date[0])

            tuition_fee = response.xpath(
                "//span[@class='list__text'][contains(text(),'International fee')]/../following-sibling::dd//text()").extract()
            clear_space(tuition_fee)
            if tuition_fee:
                item['tuition_fee_pre'] = '£'
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))

            location = response.xpath(
                "//dt[@class='list__item list__item--term'][contains(text(),'Location:')]/following-sibling::dd//text()").extract()
            clear_space(location)
            item['location'] = ''.join(location).strip()

            item['overview_en'] = section_html("//div[@id='overview']")
            # Modules
            item['modules_en'] = section_html("//div[@id='study']")
            # Teaching and assessment
            item['assessment_en'] = section_html(
                "//div[@class='course-section course-section--assessed']")

            # A-level requirement: first text node of the "Typical offer"
            # entry. (A previous 'A-Level' <dt> query was overwritten before
            # use and has been removed.)
            alevel = response.xpath(
                "//span[@class='list__text'][contains(text(),'Typical offer')]/../following-sibling::dd//text()").extract()
            clear_space(alevel)
            if alevel:
                item['alevel'] = alevel[0].strip()

            ib = response.xpath(
                "//dt[@class='list__item list__item--term'][contains(text(),'IB')]/following-sibling::dd//text()").extract()
            item['ib'] = ''.join(ib).strip()

            item['require_chinese_en'] = section_html("//div[@id='china']")

            # IELTS: a default tier plus two programme-specific tiers.
            ielts_overall, ielts_component = 6.5, 6.0
            if item['programme_en'] == "Communication and Media Studies":
                ielts_overall = 7.0
            elif item['programme_en'] in higher_ielts_programmes:
                ielts_overall, ielts_component = 7.0, 6.5
            item['ielts'] = ielts_overall
            for component_key in ('ielts_l', 'ielts_s', 'ielts_r', 'ielts_w'):
                item[component_key] = ielts_component

            item['career_en'] = section_html("//div[@id='career']")

            item['apply_proces_en'] = "http://www.lboro.ac.uk/study/undergraduate/apply/"

            duration = response.xpath("//label[contains(text(),'Study options')]/following-sibling::span//text()").extract()
            clear_space(duration)
            print("duration: ", duration)
            if duration:
                fill_duration_and_ucascode(duration, 0, "item['ucascode'] = ")
                yield item
            else:
                # No single "Study options" text: emit one item per
                # <option>, pairing 'default' with the first UCAS code and
                # 'variant' with the last one.
                # NOTE(review): the same item object is yielded twice and
                # mutated in between -- confirm the item pipeline has
                # finished with the first yield before the generator resumes.
                for option_value, suffix, ucas_index in (("default", "1", 0),
                                                         ("variant", "2", -1)):
                    duration = response.xpath(
                        "//option[@value='%s']//text()" % option_value).extract()
                    clear_space(duration)
                    print("duration%s: " % suffix, duration)
                    fill_duration_and_ucascode(
                        duration, ucas_index, "item['ucascode']%s = " % suffix)
                    yield item
        except Exception as e:
            # Best-effort scraping: record the failure and carry on with the
            # crawl instead of propagating the error.
            with open("scrapySchool_England_Ben/error/"+item['university']+str(item['degree_type'])+".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 7

0

Exibir arquivo

Arquivo: AstonUniversity_U.py Projeto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.aston.ac.uk/"
        item['university'] = "Aston University"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item['location'] = "Aston University,Birmingham, B4 7ET"
        print("======================================")
        print(response.url)
        try:
            programmeDegreetype = response.xpath(
                "//h1[@id='skiplinks']//text()").extract()
            programmeDegreetypeStr = ''.join(programmeDegreetype)
            # print(programmeDegreetypeStr)
            degree_type = re.findall(
                r"^\w+/\w+|^\w+\s/\s\w+|^\w+\s\(Hons\)|^[BML]\w{1,7}\s",
                programmeDegreetypeStr)
            # print("degree_type = ", degree_type)
            item['degree_name'] = ''.join(degree_type).replace("(Hons)",
                                                               "").strip()
            if item['degree_name'] == "Business":
                item['degree_name'] = ""
            programme = programmeDegreetypeStr.replace(item['degree_name'],
                                                       "").strip()
            item['programme_en'] = ''.join(programme).replace(
                "(Hons)", "").strip().strip("in").strip()
            print("item['degree_name']: ", item['degree_name'])
            print("item['programme_en']: ", item['programme_en'])

            alevel = response.xpath(
                "//span[contains(text(),'A Levels')]/../../../../../../following-sibling::*[1]//text()|"
                "//span[contains(text(),'A-Levels')]/../../../../../../following-sibling::*[1]//text()"
            ).extract()
            clear_space(alevel)
            # print("alevel: ", alevel)
            item['alevel'] = ''.join(alevel).strip()

            if item['alevel'] == "":
                alevel1 = response.xpath(
                    "//strong[contains(text(),'A Levels')]/..//text()"
                ).extract()
                if "A Levels" in ''.join(alevel1).strip():
                    alevelindex = ''.join(alevel1).strip().index('A Levels')
                    item['alevel'] = ''.join(
                        ''.join(alevel1).strip()[alevelindex:]).strip()
            # print("item['alevel']: ", item['alevel'])
            if len(item['alevel']) > 300:
                item['alevel'] = item['alevel'][:301]

            ib = response.xpath(
                "//span[contains(text(),'IB')]/../../../../../../following-sibling::*[2]//text()|"
                "//strong[contains(text(),'IB')]/..//text()").extract()
            clear_space(ib)
            item['ib'] = ''.join(ib).strip()
            # print("item['ib']: ", item['ib'])
            if len(item['ib']) > 300:
                item['ib'] = item['ib'][:301]

            overview = response.xpath(
                "//a[contains(text(),'Course overview')]/../../../../../..|"
                "//*[contains(text(), 'Course outline')]/../../../../../../div/following-sibling::div[1]|"
                "//*[contains(text(), 'Course Outline')]/../../../../../div/../following-sibling::div[1]//*[contains(text(), 'Modules')]/../preceding-sibling::*|"
                "//*[contains(text(),'Course Outline')]/../../../../../following-sibling::div[1]//*[contains(text(),'Sample module options')]/../preceding-sibling::*|"
                "//*[contains(text(), 'Subject Guide & Modules')]/../../../../../../div/following-sibling::div[1]//*[contains(text(),'Sample module options')]/../preceding-sibling::*|"
                "//*[contains(text(), 'Course Outline')]/../../../../../div/../following-sibling::div[1]//*[contains(text(), 'Sample module options')]/../../preceding-sibling::*|"
                "//*[contains(text(), 'Subject Guide & Modules')]/../../../../../../div/following-sibling::div[1]//*[contains(text(),'Sample module options')]/..|"
                "//*[contains(text(), 'Subject Guide & Modules')]/../../../../../../div/following-sibling::div[1]//*[contains(text(),'Core modules:')]/../preceding-sibling::*|"
                "//strong[contains(text(),'Courses')]/../../following-sibling::div[1]|"
                "//*[contains(text(), 'Programme outline and modules')]/../../../../../../div/following-sibling::div[1]//*[contains(text(),'Modules')]/..|"
                "//*[contains(text(),'Course Outline')]/../../../../../following-sibling::div[1]//*[contains(text(),'Sample Module Options')]/../preceding-sibling::*|"
                "//*[contains(text(),'Course Outline & Modules')]/../../../../../following-sibling::div[1]//*[contains(text(),'Modules')]/preceding-sibling::*"
            ).extract()
            if len(overview) == 0:
                overview = response.xpath(
                    "//a[contains(text(),'Course Outline')]/../../../../../.."
                ).extract()
            item['overview_en'] = ''.join(
                remove_class(clear_lianxu_space(overview)).replace(
                    "<br>", "").strip().split("\n")).strip()
            # if item['overview_en'] == "":
            #     print("overview 为空")
            # print("item['overview_en'] = ", item['overview_en'])

            modules_en = response.xpath(
                "//*[contains(text(),'modules:')]/../..|//strong[contains(text(),'Programme content')]/../preceding-sibling::*[1]/following-sibling::*"
            ).extract()
            if len(modules_en) == 0:
                modules_en = response.xpath(
                    "//*[contains(text(),'Modules')]/../../..").extract()
                if len(modules_en) == 0:
                    modules_en = response.xpath(
                        "//*[contains(text(),'Modules')]/../..").extract()
                    if len(modules_en) == 0:
                        modules_en = response.xpath(
                            "//*[contains(text(),'Modules')]/..").extract()
                        if len(modules_en) == 0:
                            modules_en = response.xpath(
                                "//*[contains(text(),'What you will study')]/../../../../../following-sibling::*"
                            ).extract()
                            if len(modules_en) == 0:
                                modules_en = response.xpath(
                                    "//*[contains(text(), 'Subject guide and modules')]/../../../../../../div/following-sibling::div[1]"
                                ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules_en))
            if item['modules_en'] == "":
                item['modules_en'] = None
            print("item['modules_en'] = ", item['modules_en'])

            career_en = response.xpath(
                "//*[contains(text(),'Your future career')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Your future career opportunities')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Career opportunities')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Professional development programme')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Career Prospects')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Professional Development Programme')]/../../../../div/following-sibling::*|"
                # "//*[contains(text(),'Professional Development Programme')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Career prospects')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Career Opportunities')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Graduate destinations')]/../../../../../following-sibling::*|"
                "//a[contains(text(),'Career')]/../../../../../following-sibling::*|"
                "//a[contains(text(),'Personal Development')]/../../../../../following-sibling::*|"
                "//*[contains(text(),'Professional accreditation')]/../../../../../following-sibling::*"
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))
            # print("item['career_en'] = ", item['career_en'])
            # if item['career_en'] == "":
            #     print("*** career")

            # following-sibling::*
            assessment_en = response.xpath(
                "//a[@id='learning'][contains(text(),'Learning, teaching & assessment')]/../..|"
                "//a[@class='panel-event'][contains(text(),'Learning, teaching & assessment')]/../../../../../..|"
                "//*[contains(text(),'Learning, Teaching & Assessment')]/../../../../../..|"
                "//*[contains(text(),'Learning, Teaching and Assessment')]/../../../../../..|"
                "//*[contains(text(),'Learning, teaching and assessment')]/../../../../../..|"
                "//*[contains(text(),'Learning, teaching and assessments')]/../../../../../..|"
                "//*[contains(text(),'Learning, teaching & assesment')]/../../../../../.."
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en'] = ", item['assessment_en'])

            rntry_requirements = response.xpath(
                "//*[contains(text(),'Entry requirements & fees')]/../../../../../following-sibling::*//text()|"
                "//*[contains(text(),'Entry Requirements & Fees')]/../../../../../following-sibling::*//text()|"
                "//*[contains(text(),'Key information and entry requirements')]/../../../../../..//text()|"
                "//*[contains(text(),'Key information for applicants & entry requirements')]/../../../../../following-sibling::*//text()|"
                "//*[contains(text(),'Entry requirements')]/../../../../../following-sibling::*//text()|"
                "//*[contains(text(),'Entry Requirements')]/../../../../../following-sibling::*//text()"
            ).extract()
            start_date = rntry_requirements
            # print("start_date: ", start_date)
            item['apply_desc_en'] = "<div>" + clear_lianxu_space(
                rntry_requirements) + "</div>"
            # print("item['apply_desc_en'] = ", item['apply_desc_en'])
            clear_space(start_date)
            duration_str = '; '.join(start_date)

            tuition_fee = response.xpath(
                "//*[contains(text(),'Fees')]/../../../../../following-sibling::*//text()|"
                "//*[contains(text(),'fees')]/../../../../../following-sibling::*//text()|"
                "//strong[contains(text(),'Tuition fees')]/..//text()"
            ).extract()
            if len(tuition_fee) == 0:
                tuition_fee = response.xpath(
                    "//strong[contains(text(),'Fees:')]/../following-sibling::*[1]//text()"
                ).extract()
            clear_space(tuition_fee)
            tuition_fee_str = ''.join(tuition_fee)
            # print("tuition_fee_str: ", tuition_fee_str)
            tuition_fee_re = re.findall(
                r"International.*?£\d+,\d+|non-EU.*?£\d+,\d+|MSc.*?£\d+,\d+|entry:£\d+,\d+|2018/2019:£\d+,\d+|£\d+,\d+\sfor\sOutside\sEU|£\d+,\d+",
                tuition_fee_str, re.I)
            tuition_fee_re += re.findall(r"£\d+,\d+", duration_str)
            # print("tuition_fee_re: ", tuition_fee_re)
            if len(tuition_fee_re) != 0:
                t = re.findall(r"\d+,\d+", ''.join(tuition_fee_re))
                # item['tuition_fee'] = int(''.join(t).replace(",", "").strip())
                # print("item['tuition_fee']1 = ", item['tuition_fee'])
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee_re))
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee'] = ", item['tuition_fee'])
            # print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])

            # duration = response.xpath(
            #     "//*[contains(text(),'Duration')]/following-sibling::*//text()|"
            #     "//*[contains(text(),'Duration')]/..//text()").extract()
            # ((One)|(Two)|(Three)|(Four)|(Five)|(Six)|(Seven)|(Eight)|(Nine)|(Ten).{1,8}year)
            duration_re = re.findall(
                r'Duration.{1,85}|\d.{1,8}year|One.{1,8}year|Two.{1,8}year|Three.{1,8}year|Four.{1,8}year|Five.{1,8}year|Six.{1,8}year|Seven.{1,8}year|Eight.{1,8}year|Nine.{1,8}year|Ten.{1,8}year',
                duration_str, re.I)
            duration_re += re.findall(
                r'Duration.{1,80}|\d.{1,8}year|One.{1,8}year|Two.{1,8}year|Three.{1,8}year|Four.{1,8}year|Five.{1,8}year|Six.{1,8}year|Seven.{1,8}year|Eight.{1,8}year|Nine.{1,8}year|Ten.{1,8}year',
                item['overview_en'], re.I)
            # if len(duration) == 0:
            #     duration = response.xpath("//*[contains(text(),'Duration of course')]/../following-sibling::*[1]//text()").extract()
            clear_space(duration_re)
            duration_str = ' '.join(duration_re)
            # print("duration_str: ", duration_str)
            duration_list = getIntDuration(duration_str)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration']: ", item['duration'])
            # print("item['duration_per']: ", item['duration_per'])

            # https://www2.aston.ac.uk/about/termdates/2019-2020
            start_date_str = '; '.join(start_date)
            # print("start_date_str: ", start_date_str)
            start_date_re = re.findall(r'Start.{1,25}', start_date_str)
            # print("start_date_re", start_date_re)
            item['start_date'] = getStartDate(''.join(start_date_re))
            # print("item['start_date']: ", item['start_date'])

            # ielts_desc = ' '.join(start_date)
            # ielts_desc = re.findall(r'.{1,80}IELTS.{1,80}', ielts_desc)
            # print("ielts_desc: ", ielts_desc)
            allcontent = response.xpath(
                "//div[@class='tabbed-zone-outer oAccordionPanels tabbed-zone-sigma']//text() | //div[@class='tabbed-zone-outer oAccordionPanels tabbed-zone-rho']//text() | //div[@class='tabbed-zone-outer oAccordionPanels tabbed-zone-delta']//text() | //div[@class='tabbed-zone-outer oAccordionPanels tabbed-zone-sigma'][2]//text() | //div[@class='tabbed-zone-outer oAccordionPanels tabbed-zone-upsilon']//text()"
            ).extract()
            clear_space(allcontent)
            department_1 = response.xpath(
                "//a[@href='/study/postgraduate/taught-programmes/abs/']//text()"
            ).extract()
            # print(department_1)
            if len(department_1) > 0:
                item['department'] = ''.join(department_1[0]).strip()
            department_re = re.findall(
                r"Life\s&\sHealth\sSciences\s-\sOSPAP|Aston\sBusiness\sSchool|Engineering\s&\sApplied\sScience|Languages\s&\sSocial\sSciences|Life\s&\sHealth\sSciences",
                ''.join(allcontent))
            # print("department_re: ", department_re)
            if item['department'] == "":
                if len(department_re) > 0:
                    item['department'] = ''.join(department_re[0]).strip()
            # print("item['department']: ", item['department'])

            # Aston Business School
            de_1 = [
                "full time mba",
                "executive mba - part time",
                "online mba",
                "the executive dba",
                "phd programme",
                "msc business analytics",
                "msc business & management",
                "msc business & management (online)",
                "msc information systems & business analysis",
                "msc supply chain management",
                "msc international business",
                "msc international accounting & finance",
                "msc international accounting & finance (online)",
                "msc strategy and international business",
                "msc entrepreneurship",
                "msc accounting & finance",
                "msc business economics & finance",
                "msc finance",
                "msc international accounting & finance",
                "msc international accounting & finance (online)",
                "msc investment analysis",
                "msc strategic marketing management ",
                "msc human resource management & business",
                "msc organisational behaviour",
                "msc work psychology & business",
                "international pre-masters",
            ]
            #Engineering & Applied Science
            de_2 = [
                "msc professional engineering",
                "msc computer science",
                "msc software engineering ",
                "msc software project management",
                "msc professional engineering",
                "msc electrical power engineering and systems ",
                "msc telecommunications systems",
                "msc wireless communications and networking",
                "msc smart telecom and sensing networks (smartnet)",
                "msc photonic integrated circuits, sensors and networks (pixnet)",
                "msc professional engineering",
                "msc engineering management",
                "msc supply chain management",
                "msc engineering leadership & management",
                "msc supply chain leadership and management",
                "msc professional engineering",
                "msc mechanical engineering ",
                "msc product design ",
                "msc professional engineering",
            ]
            #Languages & Social Sciences
            de_3 = [
                "ma in forensic linguistics",
                "ma in the european union & international relations",
                "joint ma in multilevel governance & international relations",
                "double ma in europe & the world",
                "double ma in governance and international politics",
                "ma in international relations and global governance",
                "ma in sociology and social research",
                "ma in policy and social research",
                "ma in teaching english to speakers of other languages (tesol)",
                "ma in tesol and translation studies",
                "ma in tesol and translation studies",
                "ma in translation in a european context",
            ]
            # Life & Health Sciences
            de_4 = [
                "advanced hearing therapy practice - msc",
                "clinical science (neurosensory sciences) - msc",
                "doctor of hearing therapy - professional doctorate",
                "biomedical science - msc",
                "biomedical sciences top modules - all standalone modules",
                "stem cells and regenerative medicine - msc",
                "clinical neurophysiology practice - msc",
                "clinical science (neurosensory sciences) - msc",
                "neurophysiology - pgcert",
                "clinical science (neurosensory sciences) - msc",
                "doctor of optometry / doctor of ophthalmic science - professional doctorate",
                "graduate diploma in optometry - graduate diploma",
                "independent prescribing for optometrists - professional accreditation",
                "optometry / ophthalmic science - msc",
                "overseas pharmacists course (ospap) - full time pgdip / msc",
                "pharmacist independent prescribing - pgcert",
                "pharmacy (includes: msc pharmaceutical sciences, msc drug delivery, and msc pharmacokinetics) – msc",
                "psychiatric pharmacy by distance learning and practice - pgdip",
                "psychiatric pharmacy practice - msc",
                "psychiatric therapeutics by distance learning - pgcert",
                "cognitive neuroscience - msc",
                "health psychology (online) - msc",
                "health psychology (on campus) - msc",
            ]
            if item['department'] == "":
                for de1 in de_1:
                    if item['programme_en'] == de1:
                        item['department'] = "Aston Business School"
                        break
            if item['department'] == "":
                for de2 in de_2:
                    if item['programme_en'] == de2:
                        item['department'] = "Engineering & Applied Science"
                        break
            if item['department'] == "":
                for de3 in de_3:
                    if item['programme_en'] == de3:
                        item['department'] = "Languages & Social Sciences"
                        break
            if item['department'] == "":
                for de4 in de_4:
                    if item['programme_en'] == de4:
                        item['department'] = " Life & Health Sciences"
                        break

            if 'business' in item['programme_en'].lower():
                item['department'] = "Aston Business School"
            if 'electrical' in item['programme_en'].lower(
            ) or 'engineering' in item['programme_en'].lower():
                item['department'] = "Engineering & Applied Science"
            # print("item['department']1: ", item['department'])

            if item['department'] == "Aston Medical School":
                item['ielts'] = 7.5
                item['ielts_l'] = 7
                item['ielts_s'] = 7
                item['ielts_r'] = 7
                item['ielts_w'] = 7
                item['toefl'] = 109
                item['toefl_l'] = 26
                item['toefl_r'] = 26
                item['toefl_s'] = 23
                item['toefl_w'] = 28
            elif item['department'] == "Engineering & Applied Science" or item[
                    'department'] == "Languages & Social Sciences":
                item['ielts'] = 6
                item['ielts_l'] = 5.5
                item['ielts_s'] = 5.5
                item['ielts_r'] = 5.5
                item['ielts_w'] = 5.5
                item['toefl'] = 78
                item['toefl_l'] = 11
                item['toefl_r'] = 12
                item['toefl_s'] = 17
                item['toefl_w'] = 20
            else:
                item['ielts'] = 6.5
                item['ielts_l'] = 6
                item['ielts_s'] = 6
                item['ielts_r'] = 6
                item['ielts_w'] = 6
                item['toefl'] = 93
                item['toefl_l'] = 19
                item['toefl_r'] = 18
                item['toefl_s'] = 19
                item['toefl_w'] = 23
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #                 item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #        item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<div class="tab-inner">
<div id="panelGroupHeader_73655987" class="tab-header-outer">
<div class="tab-header-inner">
<ul>
<li class="header"><h2><a href="#" class="panel-event">Undergraduate</a></h2></li>
<li class="expander"><a href="/international-students/your-country/east-asia/china/#" class="panel-event">Expand / Collapse</a>
</li>
</ul>
</div>
</div>
<div id="panelGroupBody_73655987" class="tab-body-outer">
<div class="tab-body-inner"><div class="ContentEditor"><p> <span style="line-height: 1.4em">Students who have achieved an average of 80% in the academic subjects in their Senior High School Leaving Certificate after 3 years of study may be considered for a Foundation programme.  </span><span style="line-height: 1.4em"> </span></p> <p>Students with 2 or 3 year University or College Diploma can be considered for undergraduate study - Year 1 entry. Applicants should be scoring a min of 80% average in relevant  academic subjects. <br /> <br />University students who have studied 1-2 years (full-time) at a recognised university may be eligible for first year entry, dependent on subjects, institution and grades. <span style="line-height: 1.4em"> </span></p> </div>
</div>
</div>
</div>
"""
                ]))
            ucascode = response.xpath(
                "//strong[contains(text(),'UCAS Code')]/following-sibling::*[1]//text()|"
                "//strong[contains(text(),'UCAS Code')]/../text()|"
                "//div[@class='ContentEditor']//strong[3]/../text()|"
                "//strong[contains(text(),'UCAS code')]/..//text()").extract()
            clear_space(ucascode)
            print("ucascode: ", ucascode)
            if len(ucascode) > 0:
                for u in ucascode:
                    if len(u.replace(":", "").replace(".", "").strip()) == 4:
                        item['ucascode'] = u.replace(":",
                                                     "").replace(".",
                                                                 "").strip()
                        break
            print("item['ucascode']: ", item['ucascode'])

            # print(ucascode[:7])
            # if len(item['ucascode']) != 4 and len(ucascode) > 0:
            #     for u in ucascode[:6]:
            #         ucascode_uu_re = re.findall(r"[A-Z]\w{3}", u)
            #         print("ucascode_uu_re: ", ucascode_uu_re)

            if item['ucascode'] == "":
                ucascode_re = re.findall(r"UCAS\sCode.{1,8}", duration_str,
                                         re.I)
                if len(ucascode_re) == 0:
                    ucascode_re = re.findall(
                        r"UCAS\sCode.{1,8}",
                        remove_tags(clear_lianxu_space([response.text])), re.I)
                print("ucascode_re: ", ucascode_re)
                item['ucascode'] = ''.join(ucascode_re).replace(
                    "UCAS Code",
                    "").replace("UCAS code", "").replace(".", "").replace(
                        "(", "").replace(";", "").replace(":", "").strip()
            print("item['ucascode']1: ", item['ucascode'])
            if item['ucascode'] == "":
                print("**** ucascode")

            # if "/" in item['degree_name']:
            #     print("//////////")
            if item['programme_en'] == "Optometry":
                item['degree_name'] = "Bsc"
                item['ucascode'] = 'B510'
                yield item
                item['degree_name'] = "MOptom"
                item['ucascode'] = 'B512'
                yield item
            elif item['programme_en'] == "Biomedical Engineering":
                item['degree_name'] = "BEng"
                item['ucascode'] = 'H542'
                yield item
                item['degree_name'] = "MEng"
                item['ucascode'] = 'H541'
                yield item
            elif item['programme_en'] == "Psychology":
                item['duration'] = 3
                item['ucascode'] = 'C800'
                yield item
                item['duration'] = 4
                item['ucascode'] = 'C801'
                yield item
            elif item[
                    'programme_en'] == "International Relations and Modern Languages (French/German/Spanish)":
                item['ucascode'] = 'LR2C'
                yield item
                item['ucascode'] = 'LR2G'
                yield item
                item['ucascode'] = 'LR2K'
                yield item
            elif item[
                    'programme_en'] == "International Business and Modern Languages":
                item['ucascode'] = 'NR11'
                yield item
                item['ucascode'] = 'NR12'
                yield item
                item['ucascode'] = 'NR14'
                yield item
                item['ucascode'] = 'NR24'
                yield item
                item['ucascode'] = 'NR33'
                yield item
                item['ucascode'] = 'NR44'
                yield item
                item['ucascode'] = 'NR15'
                yield item
            else:
                yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 8

0

Exibir arquivo

    def parse_data(self, response):
        """Scrape one University of Central Lancashire course page.

        Fills an item obtained from ``get_item(ScrapyschoolEnglandBenItem)``
        with the fields found on the page and yields it when a programme
        name was extracted. Pages whose programme name contains
        "Foundation" yield nothing.

        Any exception raised while scraping is appended, together with the
        page URL, to a per-university text file under
        ``scrapySchool_England_Ben/error/`` and the page is skipped.

        Args:
            response: the response of the course page. Its ``meta`` mapping
                is looked up with the page URL to obtain ``major_type1``.

        Yields:
            The populated item, at most once per page.
        """

        def node_after(nodes, labels):
            """Return the node following the first label present in nodes.

            Labels are tried in order. Returns None when no label is present
            or when the matching label is the last node, so a missing value
            leaves one field empty instead of raising IndexError and
            discarding the whole record.
            """
            for label in labels:
                if label in nodes:
                    following = nodes.index(label) + 1
                    if following < len(nodes):
                        return nodes[following]
                    return None
            return None

        item = get_item(ScrapyschoolEnglandBenItem)
        item['university'] = "University of Central Lancashire"
        item['url'] = response.url
        # Degree type
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            # Programme name and degree type; the second XPath of each pair
            # covers the alternative "marketing-version" page layout.
            # NOTE(review): clear_space is presumably an in-place clean-up of
            # the list (its return value is never used) - confirm.
            programme = response.xpath(
                "//div[@id='TopGraphic']/div[@class='twelvecol last']/h2/text()"
            ).extract()
            if not programme:
                programme = response.xpath(
                    "//div[@class='marketing-version']/div[@class='course-title']/h1/text()"
                ).extract()
            clear_space(programme)
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            degree_type = response.xpath(
                "//div[@id='TopGraphic']/div[@class='twelvecol last']/h2/span/text()"
            ).extract()
            if not degree_type:
                degree_type = response.xpath(
                    "//div[@class='marketing-version']/div[@class='course-title']/h1/span/text()"
                ).extract()
            clear_space(degree_type)
            item['degree_name'] = ''.join(degree_type).replace("(Hons)",
                                                               "").strip()
            print("item['degree_name']: ", item['degree_name'])

            # Foundation programmes are not scraped any further.
            if "Foundation" in item['programme_en']:
                return

            department = response.xpath(
                "//h4[contains(text(),'Contact Us')]/following-sibling::*[1]//text()"
            ).extract()
            clear_space(department)
            for line in department:
                if "This course is based in the" in line:
                    item['department'] = line.replace(
                        "This course is based in the", "").strip()
                    break

            duration = response.xpath(
                "//h4[contains(text(), 'Duration:')]/..//text()").extract()
            clear_space(duration)
            duration_str = ''.join(duration).strip()
            item['other'] = duration_str

            # Only a two-element result is used: first the length, last the
            # unit.
            duration_list = getIntDuration(duration_str)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]

            location = response.xpath(
                "//h4[contains(text(), 'Campus')]/following-sibling::p[1]//text()"
            ).extract()
            item['location'] = ''.join(location)

            start_date = response.xpath(
                "//h4[contains(text(), 'Start Date:')]/following-sibling::p[1]//text()"
            ).extract()
            item['start_date'] = getStartDate(''.join(start_date))

            overview = response.xpath(
                "//div[@class='overview']|//div[@id='outline']/div[position()<3]"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))

            entry_requirements = response.xpath(
                "//div[@class='sevencol']//div[contains(@class,'entry-requirements')]//text()"
            ).extract()
            clear_space(entry_requirements)
            print(entry_requirements)
            requirements_text = ''.join(entry_requirements).strip()

            # A-level requirement: everything that precedes the BTEC/QCFBED
            # heading, otherwise the node right after an "A Levels" heading.
            item['alevel'] = None
            for label in ("BTEC Extended Diploma:",
                          "BTEC Extended Diploma:\xa0",
                          "BTEC Extended Diploma", "QCFBED:"):
                if label in entry_requirements:
                    preceding = entry_requirements[:entry_requirements.
                                                   index(label)]
                    item['alevel'] = clear_lianxu_space(preceding)
                    break
            else:
                alevel_node = node_after(entry_requirements, ("A Levels", ))
                if alevel_node is not None:
                    item['alevel'] = clear_lianxu_space([alevel_node])
            if item['alevel'] is not None:
                item['alevel'] = item['alevel'].strip().strip(":").strip()
            print("item['alevel']: ", item['alevel'])

            # IB requirement: the node right after the first heading variant
            # found, tried in this order.
            item['ib'] = None
            ib_node = node_after(entry_requirements, (
                "International Baccalaureate:",
                "International Baccalaureate Diploma:\xa0",
                "International Baccalaureate :",
                "International Baccalaureate:\xa0",
                "International Baccalaureate",
                "International Baccalaureate Diploma:",
            ))
            if ib_node is not None:
                item['ib'] = clear_lianxu_space([ib_node])
            if item['ib'] is not None:
                item['ib'] = item['ib'].strip().strip(":").strip()
            print("item['ib']: ", item['ib'])

            # IELTS description: the "IELTS:" heading plus the node after it,
            # otherwise any stretch of the joined text that mentions IELTS.
            if "IELTS:" in entry_requirements:
                ielts_pos = entry_requirements.index("IELTS:")
                ielts_nodes = entry_requirements[ielts_pos:ielts_pos + 2]
            else:
                ielts_nodes = re.findall(r'.{1,50}IELTS.{1,80}',
                                         requirements_text)
            item['ielts_desc'] = ''.join(ielts_nodes).strip()

            ielts_list = re.findall(
                r"[5-9]\.\d\s|[5-9]\.\d,|[5-9]\.\d\.|[5-9]\.\d$|[5-9]\s|[5-9]\.",
                item['ielts_desc'])
            # Drop the trailing separator the pattern captures with each score.
            bands = [
                band.strip().strip('.').replace(',', '').strip()
                for band in ielts_list
            ]
            # Which captured score feeds each field, keyed by how many scores
            # were found; any other count leaves the IELTS fields untouched.
            ielts_fields = ('ielts', 'ielts_l', 'ielts_s', 'ielts_r',
                            'ielts_w')
            band_positions = {
                1: (0, 0, 0, 0, 0),
                2: (0, 1, 1, 1, 1),
                3: (0, 0, 0, 1, 2),
            }.get(len(bands))
            if band_positions is not None:
                for field, position in zip(ielts_fields, band_positions):
                    item[field] = bands[position]

            modules = response.xpath("//div[@id='caag']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))

            assessment_en = response.xpath(
                "//*[contains(text(),'Learning Environment and Assessment')]|//*[contains(text(),'Learning Environment and Assessment')]/following-sibling::p"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))

            career_en = response.xpath(
                "//h4[contains(text(),'Industry Links')]/..|//h4[contains(text(),'Opportunities')]/..|"
                "//strong[contains(text(),'Careers')]/../..").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))

            # Fees are hard-coded; source:
            # https://www.uclan.ac.uk/study_here/fees_and_finance/international_tuition_fees.php#international
            # NOTE(review): assumes get_item pre-populates 'department', since
            # it is read here even when no "based in the" line was found.
            item['tuition_fee'] = '12450'
            if item['department'] in (
                    "School of Forensic and Applied Sciences",
                    "School of Physical Sciences and Computing",
                    "School of Pharmacy and Biomedical Sciences",
                    "School of Engineering",
            ):
                item['tuition_fee'] = '13450'
            item['tuition_fee_pre'] = "£"

            item['require_chinese_en'] = (
                "<h2>Undergraduate – Year 0 entry 200 tariff points/80 tariff points</h2>"
                "<p>Senior Secondary School Graduation Certificate 60% average</p>"
                "<p>Completion of second year Senior Secondary School 70% average</p>"
                "<h2>Undergraduate - Year 1 entry 280 tariff points/112 tariff points</h2>"
                "<p>3 Year leaving certificate from SeniorHigh School with 75%</p>"
                "<p>Chinese National University GaoKao University Entrance Test with 450+ (maximum is 750, 150 for each of five subjects)</p>"
                "<p>Successful completion of one year Higher Education is acceptable in lieu of Grade 3 High School at 85%</p>"
                "<p>Completion of Shenzhen University International Foundation Programme with 60% - Group B</p>"
            )
            item['apply_proces_en'] = 'https://www.uclan.ac.uk/study_here/how_to_apply/international.php'

            ucascode = response.xpath(
                "//h4[contains(text(),'UCAS Code:')]/following-sibling::*//text()"
            ).extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).strip()

            # Pages without a programme name are not emitted.
            if item['programme_en'] != "":
                yield item
        except Exception as e:
            # Record the failure next to the page URL and skip this page.
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 9

0

Exibir arquivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.strath.ac.uk/"
        item["university"] = "University of Strathclyde"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item['location'] = "16 Richmond Street, Glasgow, G1 1XQ"
        print("===========================")
        print(response.url)
        try:
            # 学位类型
            degree_type = response.xpath(
                "//main[@id='content']/section[@class='PGtPage']/header[@class='page-summary has-img']/div[@class='wrap']/h1/span/text()"
            ).extract()
            item['degree_name'] = ''.join(degree_type).strip()
            print("item['degree_name'] = ", item['degree_name'])

            # 专业名
            programme = response.xpath(
                "//main[@id='content']/section[@class='PGtPage']/header[@class='page-summary has-img']/div[@class='wrap']/h1/text()"
            ).extract()
            # print("programme = ", programme)
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en'] = ", item['programme_en'])

            if "Engineering" in item['programme_en']:
                item['department'] = "Faculty of Engineering"
            elif "Science" in item['programme_en']:
                item['department'] = "Faculty of Science"
            elif "Business" in item['programme_en'] or "Finance" in item[
                    'programme_en'] or "Marketing" in item['programme_en']:
                item['department'] = "Strathclyde Business School"
            print("item['department'] = ", item['department'])

            ucascode = response.xpath(
                "//strong[contains(text(),'UCAS code:')]/../text()").extract()
            clear_space(ucascode)
            if len(ucascode) > 0:
                item['ucascode'] = ''.join(ucascode[0]).strip()
            print("item['ucascode']: ", item['ucascode'])

            # 课程长度、开学时间、截止日期
            durationTeachtime = response.xpath(
                "//b[contains(text(),'Study mode and duration')]/../text()"
            ).extract()
            clear_space(durationTeachtime)
            print("durationTeachtime: ", durationTeachtime)
            durationTeachtimeStr = ''.join(durationTeachtime)

            duration_list = getIntDuration(durationTeachtimeStr)
            # print(duration_list)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            start_date = response.xpath(
                "//b[contains(text(),'Start date')]/../text()").extract()
            start_date_str = ''.join(start_date).replace(":", "")
            # print("start_date_str = ", start_date_str)
            item['start_date'] = getStartDate(start_date_str)
            if item['start_date'] != "" and item[
                    'start_date'] > "06" and "2018" not in item[
                        'start_date'] and "2019" not in item['start_date']:
                item['start_date'] = "2018-" + item['start_date']
            elif item['start_date'] != "" and item[
                    'start_date'] <= "06" and "2018" not in item[
                        'start_date'] and "2019" not in item['start_date']:
                item['start_date'] = "2019-" + item['start_date']
            # print("item['start_date'] = ", item['start_date'])

            # 截止日期
            deadline = response.xpath(
                "//b[contains(text(),'Application deadline')]/../text()"
            ).extract()
            deadline = ''.join(start_date).replace(":", "").strip()
            # print("deadline = ", deadline)
            item['deadline'] = getStartDate(deadline)
            # print("item['deadline'] = ", item['deadline'])

            # 专业描述
            overview = response.xpath(
                "//article[@id='why-this-course']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en'] = ", item['overview_en'])

            # 课程设置、评估方式
            modules = response.xpath(
                "//h3[contains(text(),'Assessment')]/preceding-sibling::*"
            ).extract()
            if len(modules) == 0:
                modules = response.xpath(
                    "//h3[contains(text(),'Learning & teaching')]/preceding-sibling::*"
                ).extract()
                if len(modules) == 0:
                    modules = response.xpath(
                        "//article[@id='course-content']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en'] = ", item['modules_en'])

            assessment_en = response.xpath(
                "//h3[contains(text(),'Assessment')]/preceding-sibling::*[1]/following-sibling::*"
            ).extract()
            if len(assessment_en) == 0:
                assessment_en = response.xpath(
                    "//h3[contains(text(),'Learning & teaching')]/preceding-sibling::*[1]/following-sibling::*"
                ).extract()
            item["assessment_en"] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en'] = ", item['assessment_en'])

            # 学术要求、英语要求
            rntry_requirements = response.xpath(
                "//article[@id='entry-requirements']//text()").extract()
            # item["rntry_requirements"] = clear_lianxu_space(rntry_requirements)
            # print("item['rntry_requirements'] = ", item['rntry_requirements'])

            alevel = response.xpath(
                "//h4[contains(text(),'A Levels')]/following-sibling::p[1]//text()|"
                "//strong[contains(text(),'A Levels')]/..//following-sibling::p[1]//text()"
            ).extract()
            if len(alevel) == 0:
                alevel = response.xpath(
                    "//*[contains(text(),'A Levels')]/../following-sibling::*[1]//text()"
                ).extract()
            clear_space(alevel)
            # print(alevel)
            if len(alevel) > 0:
                item['alevel'] = ''.join(alevel[0]).strip()
            # print("item['alevel'] = ", item['alevel'])
            if len(item['alevel']) > 160:
                item['alevel'] = ''.join(item['alevel'][:161])
            # print("item['alevel']1 = ", item['alevel'])

            item['alevel'] = None
            from lxml import etree
            tmp_html = response.text
            key1 = "<h4>A Levels"
            if key1 not in tmp_html:
                key1 = "<h4><strong>A Levels"
                if key1 not in tmp_html:
                    key1 = "<p><strong>A Levels"
                    if key1 not in tmp_html:
                        key1 = '<h4><span style="font-size: 1em;">A Levels'
                        if key1 not in tmp_html:
                            key1 = '<h4 class="p1">A Levels'
                            if key1 not in tmp_html:
                                key1 = '<p>Year 1 entry<u1:p>'
            key2 = "<h4>International Baccalaureate"
            if key2 not in tmp_html:
                key2 = "<h4><strong>International Baccalaureate"
                if key2 not in tmp_html:
                    key2 = '<p><strong>International Baccalaureate'
                    if key2 not in tmp_html:
                        key2 = '<h4 class="p1">European Baccalaureate'
                        if key2 not in tmp_html:
                            key2 = '<h4 class="p1">International Baccalaureate'
                            if key2 not in tmp_html:
                                key2 = '<h4>IB'
                                if key2 not in tmp_html:
                                    key2 = '<h4><span style="font-size: 1em;">International Baccalaureate'
                                    if key2 not in tmp_html:
                                        key2 = '<h4>HND/HNC</h4>'
            if key1 in tmp_html and key2 in tmp_html:
                # 使用正则匹配需要的标签内容的首尾
                key1_re = re.findall(key1, tmp_html)
                print("key1_re: ", key1_re)
                key2_re = re.findall(key2, tmp_html)
                print("key2_re: ", key2_re)
                # 增加能准确获取的div
                end_html = tmp_html.replace(
                    ''.join(key1_re),
                    '<div id="container">' + ''.join(key1_re)).replace(
                        ''.join(key2_re), '</div>' + ''.join(key2_re))

                end_html_response = etree.HTML(end_html)
                # 可以使用xpath匹配需要的内容了
                end_content = end_html_response.xpath(
                    "//div[@id='container']//text()")
                # 转化成带标签的数据内容
                end_content_str = "".join(end_content).strip()
                # if len(end_content) > 0:
                #     end_content_str = etree.tostring(end_content[0], encoding='unicode', method='html')
                item['alevel'] = end_content_str
            print("item['alevel'] = ", item['alevel'])
            # print(len("36 points overall with 18 at Higher Level, including 6, 5 at Higher Level in two of the following subjects: Biology, Chemistry, Physics, Mathematics, Psychology"))

            ib = response.xpath(
                "//h4[contains(text(),'International Baccalaureate')]/following-sibling::p[1]//text()|"
                "//strong[contains(text(),'International Baccalaureate')]/../following-sibling::p[1]//text()"
            ).extract()
            if len(ib) == 0:
                ib = response.xpath(
                    "//*[contains(text(),'International Baccalaureate')]/../following-sibling::*[1]//text()"
                ).extract()
            clear_space(ib)
            if len(ib) > 0:
                item['ib'] = ''.join(ib[0]).strip()

            if len(item['ib']) > 160:
                item['ib'] = ''.join(item['ib'][:161])
            # print("item['ib'] = ", item['ib'])

            # ielts = response.xpath("//h3[contains(text(),'English language requirements')]/following-sibling::*[position()<4]//text()").extract()
            # print("ielts: ", ielts)
            ielts_re = re.findall(r"IELTS.{1,80}", ''.join(rntry_requirements))
            # print("ielts_re = ", ielts_re)
            item["ielts_desc"] = ''.join(ielts_re)
            # print("item['ielts_desc'] = ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            # print(ieltlsrw)
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            if item['ielts'] != None:
                item['ielts'] = item['ielts'].strip('.').strip()
            if item['ielts_l'] != None:
                item['ielts_l'] = item['ielts_l'].strip('.').strip()
            if item['ielts_s'] != None:
                item['ielts_s'] = item['ielts_s'].strip('.').strip()
            if item['ielts_r'] != None:
                item['ielts_r'] = item['ielts_r'].strip('.').strip()
            if item['ielts_w'] != None:
                item['ielts_w'] = item['ielts_w'].strip('.').strip()
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
            #       %(item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            # 学费    //article[@id='fees-and-funding']/ul[3]/li
            tuition_fee = response.xpath(
                "//html//article[@id='fees-and-funding']/*[contains(text(),'International')]/following-sibling::*[1]//text()"
            ).extract()
            # print("tuition_fee: ", tuition_fee)
            tuition_fee_re = re.findall(r"£[\d,]+", ''.join(tuition_fee))
            # print(tuition_fee_re)
            if len(tuition_fee_re) > 0:
                item['tuition_fee'] = ''.join(tuition_fee_re[0]).replace(
                    "£", "").replace(",", "")
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee'] = ", item['tuition_fee'])

            # 就业    //article[@id='careers']
            career = response.xpath("//article[@id='careers']").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en'] = ", item['career_en'])

            # //article[@id='apply']
            apply_proces_en = response.xpath(
                "//article[@id='apply']").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(apply_proces_en))
            # print("item['apply_proces_en'] = ", item['apply_proces_en'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<h2>Entry requirements</h2>
              <h3>Undergraduate</h3>
<p><strong>Undergraduate first year entry (four-year degree programme)</strong>&nbsp;</p>
<p>Huikao 80% on average, and at least 80% in the required subjects.</p>
<p>Provinces with no Huikao: 80% on average over six subjects, including required subjects.</p>
<p><strong>Undergraduate second year entry (three-year degree programme)</strong>&nbsp;</p>
<p>Huikao 80% on average, and at least 80% in the required subjects.</p>
<p>Provinces with no Huikao: 80% on average over six subjects, including required subjects.</p>
<div>Gaokao with at least 525 out of 750, or 70% (If the total mark is not out of 750).</div>"""
                ]))
            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 10

0

Exibir arquivo

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.uwl.ac.uk/"
        item['university'] = "University of West London"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        try:
            programmeDegreetype = response.xpath(
                "//h1[@id='page-title']//text()").extract()
            # print("programmeDegreetype: ", programmeDegreetype)
            programmeDegreetypeStr = ''.join(programmeDegreetype)

            degree_type = re.findall(r"^\w+\s\(Hons\)|^\(\w+\)|^\w+", programmeDegreetypeStr)
            # print("degree_type: ", degree_type)
            degree_type_str = ''.join(degree_type).strip()
            item['degree_name'] = ''.join(degree_type).replace("(Hons)", "").strip()
            print("item['degree_name']: ", item['degree_name'])

            item['programme_en'] = programmeDegreetypeStr.replace(degree_type_str, '').strip()
            print("item['programme_en']: ", item['programme_en'])

            mode = response.xpath("//dt[contains(text(), 'Study mode')]/following-sibling::dd[1]//text()").extract()
            clear_space(mode)
            # print("mode: ", mode)

            location = response.xpath("//dt[contains(text(), 'Location')]/following-sibling::dd[1]//text()").extract()
            item['location'] = ''.join(location).replace("See location information", "").strip()
            # print("item['location']: ", item['location'])

            start_date = response.xpath("//dt[contains(text(), 'Start date')]/following-sibling::dd[1]//text()").extract()
            clear_space(start_date)
            # print("start_date: ", start_date)
            item['start_date'] = getStartDate(''.join(start_date))
            # print("item['start_date']: ", item['start_date'])

            duration = response.xpath("//dt[contains(text(), 'Duration')]/following-sibling::dd[1]//text()").extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            ucascode = response.xpath(
                "//dt[contains(text(), 'UCAS code')]/following-sibling::dd[1]//text()").extract()
            clear_space(ucascode)
            item['ucascode'] = ''.join(ucascode).strip()
            # print("item['ucascode'] = ", item['ucascode'])

            department = response.xpath("//dt[contains(text(), 'Department')]/following-sibling::dd[1]//text()").extract()
            item['department'] = ''.join(department).strip()
            # print("item['department']: ", item['department'])

            tuition_fee = response.xpath("//h4[contains(text(),'Overseas students')]/following-sibling::dl[1]//dt[contains(text(), 'Main fee')]/following-sibling::dd[1]//text()").extract()
            # print("tuition_fee: ", tuition_fee)
            if len(tuition_fee) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            # //div[@id='course-detail']
            modules = response.xpath("//div[@id='course-detail']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            # //div[@id='course-detail']
            entry_requirements = response.xpath("//div[@id='entry-requirements']//text()").extract()
            # item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            alevel = response.xpath("//*[contains(text(),'A Level')]//text()|//*[contains(text(),'A level')]//text()").extract()
            item['alevel'] = clear_lianxu_space(alevel)
            # print("item['alevel']: ", item['alevel'])

            ielts_desc = response.xpath("//div[@id='entry-requirements']//*[contains(text(),'IELTS')]/text()").extract()
            clear_space(ielts_desc)
            # print("ielts_desc: ", ielts_desc)
            ielts_desc_re = re.findall(r'.{1,50}IELTS.{1,50}', ''.join(ielts_desc))
            # print("ielts_desc_re: ", ielts_desc_re)
            if len(ielts_desc_re) > 0:
                item['ielts_desc'] = ielts_desc_re[-1]
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            how_to_apply = response.xpath("//div[@id='how-to-apply']").extract()
            item['apply_proces_en'] = remove_class(clear_lianxu_space(how_to_apply))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            assessment_en = response.xpath("//h3[contains(text(),'Teaching methods')]/preceding-sibling::*[1]/following-sibling::*[position()<5]|"
                                           "//*[contains(text(),'Assessment')]/preceding-sibling::*[1]/following-sibling::*[position()<5]|"
                                           "//html//div/strong[contains(text(),'How will I be taught?')]/..|"
                                           "//strong[contains(text(),'course be assessed?')]/..|//strong[contains(text(),'course be assessed?')]/../following-sibling::div").extract()
            item['assessment_en'] = remove_class(clear_lianxu_space(assessment_en))
            print("item['assessment_en']: ", item['assessment_en'])

            career_en = response.xpath(
                "//div[@id='career-progression-and-study']|"
                "//div[@id='jobs-and-placements']|"
                "//html//*[contains(text(),'Career and study progression')]/../following-sibling::*[position()<5]").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en)).replace("<div></div>","").strip()
            print("item['career_en']: ", item['career_en'])

            overview_en = response.xpath("//div[@id='course-summary']/*[position()<last()]").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))
            # print("item['overview_en']: ", item['overview_en'])

            item['require_chinese_en'] = remove_class(clear_lianxu_space(["""<h3>Undergraduate entry&nbsp;</h3>
<p>Applicants with the following&nbsp;qualiﬁcations&nbsp;will be considered for entry on an undergraduate degree:</p>
<p>&bull;&nbsp;&nbsp; &nbsp; Chinese university / college entrance examination (Gaokao) with an overall average 65% or higher</p>
<p>&bull;&nbsp;&nbsp; &nbsp; Diploma from specialised college (zhongzhuan) with an overall average of 65% or higher</p>
<p>&bull;&nbsp;&nbsp; &nbsp; Graduation Certificate awarded at &lsquo;Zhuanke&rdquo; / &lsquo;Dazhuan&rsquo; level from universities / colleges with an overall 65% or higher</p>
<p>&bull;&nbsp;&nbsp; &nbsp; Graduation Certificate - Sub degree (Gaozhi) with an overall 65% or above</p>
<p>&bull;&nbsp;&nbsp;&nbsp;&nbsp; British/International A Levels (from a minimum 112&nbsp;UCAS&nbsp;tariff points depending on course)</p>
<p>&bull; &nbsp; &nbsp;&nbsp;IB&nbsp;Diploma (from a minimum 25 points depending on course)</p>
<p>&bull;&nbsp;&nbsp; &nbsp; Second and final year entry on a bachelor&#39;s degree may be available for those with a Higher National Diploma (HND) from the UK with a merit profile</p>
<p>&bull;&nbsp;&nbsp;&nbsp;&nbsp; A recognised foundation course</p>"""]))
            yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] + str(item['degree_type']) + ".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 11

0

Exibir arquivo

Arquivo: UniversityOfSalford_U.py Projeto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolEnglandBenItem)
        # item['country'] = "England"
        # item["website"] = "https://www.salford.ac.uk/"
        item['university'] = "University of Salford"
        item['url'] = response.url
        # 学位类型
        item['degree_type'] = 1
        item['location'] = 'The Crescent, Salford, M5 4WT, UK'
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            # 专业、学位类型
            programme = response.xpath("//div[@id='content']/div[@class='col-md-12']/div[@class='course-title']/div[@class='row']/div[@class='col-sm-8 col-md-8']/h1//text()").extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            if "Foundation" not in item['programme_en'] and item['degree_name'] != "Graduate Certificate":
                degree_type = response.xpath("//div[@id='content']/div[@class='col-md-12']/div[@class='course-title']/div[@class='row']/div[@class='col-sm-8 col-md-8']/h2//text()").extract()
                item['degree_name'] = ''.join(degree_type).replace("(Hons)", "").strip()
                print("item['degree_name']: ", item['degree_name'])

                # //div[@id='content']/div[@class='col-md-12']/div[@class='course-title']/div[@class='row']/div[@class='col-sm-8 col-md-8']/p
                department = response.xpath("//strong[contains(text(), 'School -')]/../text()").extract()
                item['department'] = ''.join(department).strip()
                # print("item['department']: ", item['department'])

                start_date = response.xpath("//strong[contains(text(), 'Start Date(s):')]/../text()").extract()
                clear_space(start_date)
                # print("start_date: ", start_date)
                if ";" in ''.join(start_date):
                    start_date = ''.join(start_date).split(";")
                    for s in start_date:
                        item['start_date'] += getStartDate(s) + ","
                else:
                    item['start_date'] = getStartDate(''.join(start_date))
                item['start_date'] = item['start_date'].strip().strip(",").strip()
                # print("item['start_date']: ", item['start_date'])



                # //strong[contains(text(), 'Fees')]/../following-sibling::p[contains(text(), 'International -')]
                tuition_fee = response.xpath("//strong[contains(text(), 'Fees')]/../following-sibling::p[contains(text(), 'International')]//text()").extract()
                clear_space(tuition_fee)
                # print("tuition_fee: ", tuition_fee)
                tuition_fee_re = re.findall(r"£\d+,\d+", ''.join(tuition_fee))
                # print("tuition_fee_re: ", tuition_fee_re)
                if len(tuition_fee_re) > 0:
                    item['tuition_fee'] = getTuition_fee(''.join(tuition_fee_re))
                # print("item['tuition_fee']: ", item['tuition_fee'])

                # //div[@id='content']/div[@class='col-md-12']/div[@class='row']/div[1]
                overview = response.xpath(
                    "//div[@id='content']/div[@class='col-md-12']/div[@class='row']/div[1] | //div[@id='content']/div[@class='row']/div[1]").extract()
                item['overview_en'] = remove_class(clear_lianxu_space(overview))
                # print("item['overview_en']: ", item['overview_en'])

                # //section[@id='about']/div[@id='content']
                modules_en = response.xpath("//div[@id='courseaccordion']").extract()
                if len(modules_en) == 0:
                    # print("********")
                    modules_en = response.xpath("//h2[contains(text(),'Course Details')]/following-sibling::*").extract()
                    if len(modules_en) == 0:
                        modules_en = response.xpath("//h2[contains(text(),'Course Structure')]|//h2[contains(text(),'Course Structure')]/following-sibling::*").extract()
                item['modules_en'] = remove_class(clear_lianxu_space(modules_en)) # .replace("&nbsp;", "")
                item['modules_en'] = item['modules_en'].encode('utf-8').decode("unicode-escape").replace("Â ", "")
                # print("item['modules_en']: ", item['modules_en'])

                alevel = response.xpath("//*[contains(text(),'A level')]/following-sibling::td//text()").extract()
                item['alevel'] = clear_lianxu_space(alevel)
                # print("item['alevel']: ", item['alevel'])

                ib = response.xpath("//*[contains(text(),'International Baccalaureate')]/following-sibling::td//text()").extract()
                item['ib'] = clear_lianxu_space(ib)
                # print("item['ib']: ", item['ib'])

                # //section[@id='requirements']/div
                entry_requirements = response.xpath("//section[@id='requirements']/div//text()").extract()
                # item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
                # print("item['rntry_requirements']: ", item['rntry_requirements'])

                # 申请材料
                apply_documents_en = response.xpath("//h3[contains(text(),'Applicant profile')]/preceding-sibling::*[1]/following-sibling::*").extract()
                item['apply_documents_en'] = remove_class(clear_lianxu_space(apply_documents_en))
                # print("item['apply_documents_en']: ", item['apply_documents_en'])

                # //h3[contains(text(),'English Language Requirements')]/following-sibling::*[1]
                ielts_desc = response.xpath("//h3[contains(text(),'English Language Requirements')]/following-sibling::*[position()<3]//text()").extract()
                clear_space(ielts_desc)
                item['ielts_desc'] = ''.join(ielts_desc).replace("Suitable For", "").strip()
                # print("item['ielts_desc']: ",item['ielts_desc'])

                ielts_dict = get_ielts(item['ielts_desc'].replace("level 4, 5, 6 ", "").strip())
                item['ielts'] = ielts_dict.get('IELTS')
                item['ielts_l'] = ielts_dict.get('IELTS_L')
                item['ielts_s'] = ielts_dict.get('IELTS_S')
                item['ielts_r'] = ielts_dict.get('IELTS_R')
                item['ielts_w'] = ielts_dict.get('IELTS_W')
                # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))


                # //section[@id='teaching']/div[@class='container main']/div[@class='col-md-12']/div[@id='teaching_0a19']
                assessment_en = response.xpath("//section[@id='teaching']/div").extract()
                item['assessment_en'] = remove_class(clear_lianxu_space(assessment_en))
                # print("item['assessment_en']: ", item['assessment_en'])

                # //section[@id='employability']/div[@class='container main']/div[@class='col-md-12']/div[@id='employ_0a19']
                career = response.xpath("//section[@id='employability']/div").extract()
                item['career_en'] = remove_class(clear_lianxu_space(career))
                # print("item['career_en']: ", item['career_en'])

                item['apply_proces_en'] = remove_class(clear_lianxu_space(["""<div class="col-xs-12 col-sm-6 col-md-6 col-lg-6"><h1>How to apply?</h1>
</div>
<div class="clearfix"></div>
<div id="content_container_1135928">
<p class="lead">We try to make applying to Salford as flexible and straightforward as possible.</p><p>If you have any queries about the application process, please contact the Course Enquiries Team on 0161 295 4545 (choose option 1) or <a href="mailto:[email protected]">[email protected]</a></p><p>The majority of our undergraduate applications are via UCAS, however there are some exceptions. The normal closing date for UCAS applications for September entry is 15 January (although if you are at school or college, they may ask you to fill it in earlier to give them time to prepare your reference). Applications received after this date may still be considered if the course is not full. If you apply late you are advised to check the UCAS website for course availability first. To find out more about how to apply and the different options available, we have given some guidance below.</p><p><strong>If you're interested in applying through clearing, please see further details below.</strong></p>
</div>
<div class="accordion-inner" id="accordion_1421686" role="tablist" aria-multiselectable="true">
<div class="panel panel-default">
<div class="panel-heading" role="tab" id="1421686_1">
      <h4 class="panel-title">
        <a data-toggle="collapse" data-parent="#accordion_1421686" href="#collapse_1421686_1" aria-expanded="false" aria-controls="collapse_sub_1" class="collapsed">
          Applying through Clearing?
        </a>
      </h4>
    </div>
    <div id="collapse_1421686_1" class="panel-collapse collapse" role="tabpanel" aria-labelledby="1421686_1">
      <div class="panel-body">
<div id="content_container_1615870">
<p>Clearing is applicable to those who have not yet submitted an application or those who are not yet holding any offers. During this time, universities can take applications to fill any course vacancies they may have. UCAS will list course availability so you can take a look on <a href="http://www.ucas.com">www.ucas.com</a> for more information. Despite popular belief, clearing opens at 9.00am on 5 July 2018 so potential applicants needn&rsquo;t wait until Results Day. You can find more information about this process <a href="https://www.ucas.com/undergraduate/results-confirmation-and-clearing/what-clearing">here.</a></p><p>If you'd like to apply to Salford through clearing, or have any questions at all about the process, please give our hotline a call on 0300 555 5030 and one of our friendly team can help you. The lines are open 9.00am to 5.00pm Monday to Thursday and 10.00am to 4.00pm on Friday. Please ensure you have an idea of what course or subject area you&rsquo;re interested in and that you have details of all your current qualifications to hand when you call. This means we'll be able to advise you of the best options to match your interests and qualifications.</p><p>If you are successful in gaining a place through clearing, you will need to either attach to the University via UCAS Track or complete a direct application form, depending on your current circumstances. We can help advise you of this when you call.</p>
</div>
      </div>
    </div>
</div><div class="panel panel-default">
<div class="panel-heading" role="tab" id="1421686_2">
      <h4 class="panel-title">
        <a data-toggle="collapse" data-parent="#accordion_1421686" href="#collapse_1421686_2" aria-expanded="false" aria-controls="collapse_sub_2" class="collapsed">
          Entry requirements
        </a>
      </h4>
    </div>
    <div id="collapse_1421686_2" class="panel-collapse collapse" role="tabpanel" aria-labelledby="1421686_2">
      <div class="panel-body">
<div id="advice_and_guidance">
<p><script>(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){     (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),     m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)     })(window,document,'script','//www.google-analytics.com/analytics.js','ga');     ga('create', 'UA-2643712-2', 'salford.ac.uk');     ga('require', 'displayfeatures');     ga('require', 'linkid', 'linkid.js');     ga('send', 'pageview');</script></p><div id="page"><div><div><div id="content_div_44347"><p>From A level to Access to HE diplomas to International Baccalaureate &ndash; our entry requirements are wide and varied and we recognise a wide range of qualifications from around the world.</p><p>If you are a UK applicant leaving school or college we accept the following qualifications:</p><ul><li>A/AS level</li><li>Scottish Higher/Advanced Higher</li><li>14 to 19 Advanced Diplomas</li><li>the International Baccalaureate</li><li>Welsh Baccalaureate</li><li>BTEC Awards</li><li>AQA Baccalaureate</li><li>Cambridge Pre-U</li><li>Access to Higher Education Diploma</li></ul><p>We&rsquo;re committed to widening access to higher education for all sections of the community and so are as flexible as possible in our admissions policy.</p></div></div></div></div>
</div>
      </div>
    </div>
</div><div class="panel panel-default">
<div class="panel-heading" role="tab" id="1421686_3">
      <h4 class="panel-title">
        <a data-toggle="collapse" data-parent="#accordion_1421686" href="#collapse_1421686_3" aria-expanded="false" aria-controls="collapse_sub_3" class="collapsed">
          The UCAS application process
        </a>
      </h4>
    </div>
    <div id="collapse_1421686_3" class="panel-collapse collapse" role="tabpanel" aria-labelledby="1421686_3">
      <div class="panel-body">
<div id="advice_and_guidance">
<div id="page"><div><div><div id="content_div_44326"></div><h4>APPLYING VIA UCAS</h4></div></div></div><p>You can apply online via the&nbsp;<a href="http://www.ucas.com/students/apply" target="_blank"><u>UCAS website</u></a>. Your school or college should help you to access this service, and will give you a buzzword which you will need to register. You can also use the service if you&rsquo;re applying independently in the UK and overseas.</p><p>Take some time to check your online application before you submit it. It&rsquo;s important that you take care when completing the 'choices' section and use the correct institution codes and programme codes.</p><p>You can apply for up to five programmes, so make sure you think through your options carefully before applying. Also check that the 'education' section displays your correct qualification details. You should then forward your application on to your referee, who will check your application, provide the reference and forward the application to UCAS. If you are applying independently, you will be responsible for obtaining and adding the reference yourself.</p><p>It costs £23 to apply for two-five choices. If, however, you wish to only apply to the University of Salford and you are only applying for one programme, you may use a single entry at a fee of £12. 
You may pay online using a credit or debit card, or alternatively if you are applying from your school or college they may choose to be invoiced.</p><p>You can follow the progress of your application 24 hours a day, seven days a week using UCAS&nbsp;<cite>Track</cite> by logging on to&nbsp;<a href="http://www.ucas.com" target="_blank"><u>www.ucas.com</u></a>.</p><p>If you experience any difficulties with Apply you should contact the UCAS Customer Services team:<br />T +44 (0)871 468 0468</p><p>If you would like further help from the University, please contact:</p><p>Central Admissions<br />T: 0161 295 4545<br /><a href="mailto:[email protected]"><u>[email protected]</u></a></p><p>All applications should be made via the UCAS website, except for:</p><ul><li>applications for part-time courses and BSc (Hons) Nursing Studies</li><li>applications for the International Foundation Year</li><li>applications for post qualifying Health and Social care courses and single modules</li><li>applications for International and the Graduate Certificate in Management (GCIM)</li></ul><p>If the above applies to you, please see the &lsquo;Alternative application process&rsquo; section on this page for more details of how to apply.</p>
</div>
      </div>
    </div>
</div><div class="panel panel-default">
<div class="panel-heading" role="tab" id="1421686_4">
      <h4 class="panel-title">
        <a data-toggle="collapse" data-parent="#accordion_1421686" href="#collapse_1421686_4" aria-expanded="false" aria-controls="collapse_sub_4" class="collapsed">
          UCAS applications: key information
        </a>
      </h4>
    </div>
    <div id="collapse_1421686_4" class="panel-collapse collapse" role="tabpanel" aria-labelledby="1421686_4">
      <div class="panel-body">
<div id="content_container_1136542">
<p>When applying, make sure you use the correct codes:</p><p>Salford institution code name:&nbsp;<strong>SALF</strong></p><p>Salford institution code:&nbsp;<strong>S03</strong></p><p>Campus code: there are no campus codes at Salford - leave this blank</p><p>We will start receiving applications for entry in autumn 2018 from early September 2017</p><p>The normal closing date for applications is&nbsp;<strong>15 January 2018</strong> (although if you are at school or college, they may ask you to fill it in earlier to give them time to prepare your reference). Applications received after this date may still be considered if the course is not full. If you apply late, please check the UCAS website for course availability first.</p><p><strong>UCAS Extra:</strong> 25 February &ndash; 4 July 2018</p><p><strong>Clearing starts:</strong> 5 July 2018</p><p><strong>A level results:</strong> 16 August 2018</p>
</div>
      </div>
    </div>
</div><div class="panel panel-default">
<div class="panel-heading" role="tab" id="1421686_5">
      <h4 class="panel-title">
        <a data-toggle="collapse" data-parent="#accordion_1421686" href="#collapse_1421686_5" aria-expanded="false" aria-controls="collapse_sub_5" class="collapsed">
          Alternative application process
        </a>
      </h4>
    </div>
    <div id="collapse_1421686_5" class="panel-collapse collapse" role="tabpanel" aria-labelledby="1421686_5">
      <div class="panel-body">
<div id="advice_and_guidance">
<h3>INTERNATIONAL FOUNDATION YEAR APPLICATIONS</h3><p>If you would like to apply for our International Foundation Year you will need to download and complete the application form.</p><p>You should then email it to&nbsp;<a href="mailto:[email protected]">[email protected]</a> or post the completed form to the address on the front cover of the form.</p><p>NOTE: Together with the form, you will also need to send us a photocopy of the identity page of your passport and proof of your current level of English.</p><ul type="disc"><li><a href="http://www.salford.ac.uk/__data/assets/pdf_file/0005/1190894/0518-International-foundation-year-salford-2017.pdf" target="_blank">International Foundation Year application form</a></li></ul><h3>POST QUALIFYING HEALTH AND SOCIAL CARE COURSES AND SINGLE MODULE APPLICATIONS</h3><p>If your Trust is part of the North West SHA-SLA agreement, contact your&nbsp;<strong>CPD Lead to agree funding</strong>. You must then apply through the NHS CPD Apply system: <a href="http://www.cpd-applynw.nhs.uk"><strong>www.cpd-applynw.nhs.uk</strong></a></p><p><strong> </strong></p><p>If you are self-funding or your workplace is funding your course or single module, please complete the appropriate application form below and send directly to the University. Full details on who to send the form to are included on the form. 
If workplace funded, you will also need to supply a letter&nbsp;&nbsp;from your Trust stating that they will be funding your study.</p><p><strong> </strong></p><h3>APPLICATION FORMS</h3><p><strong>LEVEL 5 AND LEVEL 6 (UNDERGRADUATE/SINGLE MODULE) APPLICATION FORM</strong></p><p><a href="http://www.salford.ac.uk/__data/assets/pdf_file/0008/974168/RU1095-UGApplicationFormNov2016v3.pdf"><strong>Download Undergraduate Application Form (please note you will need to provide two references)</strong></a></p><p><strong> </strong></p><p><strong>LEVEL 7 (MASTERS/SINGLE MODULE) APPLICATION FORM</strong></p><p><a href="http://www.salford.ac.uk/__data/assets/pdf_file/0007/79522/RU12483-PGApplicationNov2016.pdf"><strong>Download Postgraduate Application Form (please note you will need to provide two references)</strong></a></p><h3>INTERNATIONAL APPLICATIONS</h3><p>We welcome applications from international students. Applicants from outside the United Kingdom must apply through UCAS.</p><p>Students from the following countries should submit their applications to UCAS through the appropriate overseas office in London as listed in the UCAS Handbook: Cyprus, Guyana, India, Luxembourg, Thailand. Applicants from other countries should send their applications direct to UCAS. (Salford participates&nbsp;in the British Council Education Promotion Service and advisers are available in your local British Council Office).</p><p>Applications for the&nbsp;<strong>Graduate Certificate in Management</strong> should be made by completing the&nbsp;<a href="http://www.salford.ac.uk/__data/assets/pdf_file/0008/974168/RU1095-UGApplicationFormNov2016v3.pdf" target="_blank">undergraduate application form and returned to the address on the form</a>. 
For more information,&nbsp;see our section specifically for&nbsp;<a href="http://www.salford.ac.uk/international/how-to-apply">international students</a>.</p><h3>APPLYING FOR A TOP-UP COURSE</h3><p>If you are currently studying a Foundation Degree at one of our Partner Colleges and wish to top up your award to an Honours Degree, you should apply directly to the University by completing our&nbsp;<a href="http://www.salford.ac.uk/__data/assets/pdf_file/0008/974168/RU1095-UGApplicationFormNov2016v3.pdf">Undergraduate Studies direct application form</a>.</p><p>Information on the courses available to you to apply for are available from staff at your College.</p>
</div>
      </div>
    </div>
</div><div class="panel panel-default">
<div class="panel-heading" role="tab" id="1421686_6">
      <h4 class="panel-title">
        <a data-toggle="collapse" data-parent="#accordion_1421686" href="#collapse_1421686_6" aria-expanded="false" aria-controls="collapse_sub_6" class="collapsed">
          Writing your personal statement
        </a>
      </h4>
    </div>
    <div id="collapse_1421686_6" class="panel-collapse collapse" role="tabpanel" aria-labelledby="1421686_6">
      <div class="panel-body">
<div id="content_container_1136542">
<p>Writing your personal statement is a key part of the UCAS application process, but we know that it can sometimes feel like a daunting task. <br /><img width="1" height="15" alt="http://www.salford.ac.uk/__data/assets/image/0005/660326/spacer.gif" src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAAPCAMAAAASwVXLAAAAAXNSR0ICQMB9xQAAAANQTFRFAAAAp3o92gAAAAF0Uk5TAEDm2GYAAAAJcEhZcwAADsQAAA7EAZUrDhsAAAAZdEVYdFNvZnR3YXJlAE1pY3Jvc29mdCBPZmZpY2V/7TVxAAAAC0lEQVQY02NgwAcAAB4AAcmYI4MAAAAASUVORK5CYII=" /><br /> Here are our top five tips for a winning personal statement&hellip;</p><h4>1. PREPARATION, PREPARATION, PREPARATION</h4><p>Start to prepare your personal statement early on. Without thinking about a structure, it will be harder for you to collect and organise your thoughts later on. Begin with a mind map, noting down anything you think might be relevant, such as your skills, qualities, likes, dislikes and experience. Formulate this into a plan, and use your plan to help structure your first draft.</p><h4>2. capture the attention of the reader</h4><p>Capture the attention of the reader with a sharp introduction; use positive language to show them you&rsquo;re enthusiastic about your chosen subject. The main section of your statement should demonstrate what you have learned from the experiences you are writing about, and how this is relevant to the course you want to study. Tell the reader about your skills and qualities, and how these contribute to your understanding of the subject.</p><h4>3. TELL THEM ABOUT YOURSELF</h4><p>Your work experience and interests are also important, as this can highlight why you&rsquo;re the perfect student for the course. Universities want to know that you&rsquo;ll become an active member of the student community, not just succeed academically. Are you part of a club, do you play any sports, or have an unusual hobby?&nbsp;&nbsp;Make sure you mention these here. 
Remember; entry requirements are transparent, you and your experiences are unique.</p><h4>4. SELL, SELL, SELL</h4><p>Conclude your personal statement by summarising your key strengths, and reiterate that you are ready (both socially and academically) for university life. Be confident, keep it positive, and really sell yourself.</p><h4>5. LOSE THE WAFFLE</h4><p>Remember that you have a limited number of words, so be clear and concise &ndash; don&rsquo;t waffle! Don&rsquo;t expect your first draft to be perfect; ask a teacher or careers advisor to check over it for you so that your spelling and grammar are spot on.<br /><img width="1" height="15" alt="http://www.salford.ac.uk/__data/assets/image/0005/660326/spacer.gif" src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAAPCAMAAAASwVXLAAAAAXNSR0ICQMB9xQAAAANQTFRFAAAAp3o92gAAAAF0Uk5TAEDm2GYAAAAJcEhZcwAADsQAAA7EAZUrDhsAAAAZdEVYdFNvZnR3YXJlAE1pY3Jvc29mdCBPZmZpY2V/7TVxAAAAC0lEQVQY02NgwAcAAB4AAcmYI4MAAAAASUVORK5CYII=" /></p>
</div>
      </div>
    </div>
</div><div class="panel panel-default">
<div class="panel-heading" role="tab" id="1421686_7">
      <h4 class="panel-title">
        <a data-toggle="collapse" data-parent="#accordion_1421686" href="#collapse_1421686_7" aria-expanded="false" aria-controls="collapse_sub_7" class="collapsed">
          Salford Alternative Entry Scheme
        </a>
      </h4>
    </div>
    <div id="collapse_1421686_7" class="panel-collapse collapse" role="tabpanel" aria-labelledby="1421686_7">
      <div class="panel-body">
<div id="content_container_1136542">
<p>At the University of Salford we believe that everyone should be able to achieve their full potential.</p><p>If you&rsquo;ve been out of education for a while, want to further your career or simply want to study in a field that you&rsquo;re passionate about, the Salford Alternative Entry Scheme (SAES) could be for you.</p><p><strong>How can I apply?</strong></p><ul><li>Submit your course application as you would normally through UCAS.</li><li>We will then recommend you to SAES if you&rsquo;re eligible (you&rsquo;ll be notified of this recommendation via UCAS or by our admissions team getting in touch with you).&nbsp;&nbsp;</li></ul><p>Successful applicants will be contacted through SAES about an assessment.&nbsp;</p><p>There are two different entry routes depending on your course. If the course you&rsquo;re studying is related to Business and Law, Health Sciences, Environment &amp; Life Sciences or Arts &amp; Media, you will be assessed through Accreditation for Prior Learning.&nbsp;If the course you&rsquo;re studying is related to Nursing, Midwifery, Social Work &amp; Social Sciences, Computing, Science &amp; Engineering or the Built Environment, you will be assessed through a MSAP-UK test.</p><p>For further information about the Salford Alternative Entry Scheme, please see our dedicated <a href="http://www.salford.ac.uk/study/undergraduate/salford-alternative-entry-scheme/entry-routes" target="_blank">SAES webpage</a>.</p>
</div>
      </div>
    </div>
</div><div class="panel panel-default">
<div class="panel-heading" role="tab" id="1421686_8">
      <h4 class="panel-title">
        <a data-toggle="collapse" data-parent="#accordion_1421686" href="#collapse_1421686_8" aria-expanded="false" aria-controls="collapse_sub_8" class="collapsed">
          Part-time and accelerated degree courses
        </a>
      </h4>
    </div>
    <div id="collapse_1421686_8" class="panel-collapse collapse" role="tabpanel" aria-labelledby="1421686_8">
      <div class="panel-body">
<div id="content_container_1136542">
<p>Part-time applications are made directly to the University. If you want to apply to study a part-time undergraduate qualification please contact our&nbsp;<strong>Course Enquiries Service</strong> on&nbsp;<strong>0161 295 4545</strong> for more information.</p><p>You&rsquo;ll need to complete an <a href="http://www.salford.ac.uk/__data/assets/pdf_file/0008/974168/RU1095-UGApplicationFormNov2016v3.pdf" target="_blank"><u>Undergraduate Studies direct application form</u></a>.</p><p>Please note you will need to provide two references.</p><p>If you&rsquo;re applying for one of our accelerated courses in the School of the Built Environment, these applications are made through<a href="http://www.salford.ac.uk/study/undergraduate/how-to-apply/#section1"> UCAS</a>. This applies to both the full-time and the day-release versions of the course, with durations of two or three years respectively. Only the standard part-time courses of five years&rsquo; duration require direct applications.</p>
</div>
      </div>
    </div>
</div><div class="panel panel-default">
<div class="panel-heading" role="tab" id="1421686_9">
      <h4 class="panel-title">
        <a data-toggle="collapse" data-parent="#accordion_1421686" href="#collapse_1421686_9" aria-expanded="false" aria-controls="collapse_sub_9" class="collapsed">
          Applying for 2019 Entry?
        </a>
      </h4>
    </div>
    <div id="collapse_1421686_9" class="panel-collapse collapse" role="tabpanel" aria-labelledby="1421686_9">
      <div class="panel-body">
<div id="content_container_1572153">
<p>If you&rsquo;re looking to apply for a course starting in 2019, you can start working on your application through UCAS from 22 May, for submission from 5 September onwards.</p><p>In the meantime, there are plenty of ways to get to know us better:</p><p>Visit us on our next Open Day on 23 June &ndash; book your place <a href="http://www.salford.ac.uk/study/visit/undergraduate-open-days" target="_blank">here</a></p><p>Follow us on social media for live updates:</p><p>Twitter.com/@SalfordUni</p><p>Snapchat: Salforduni</p><p>Instagram:@SalfordUni</p>
</div>
      </div>
    </div>
</div><div class="panel panel-default">
<div class="panel-heading" role="tab" id="1421686_10">
      <h4 class="panel-title">
        <a data-toggle="collapse" data-parent="#accordion_1421686" href="#collapse_1421686_10" aria-expanded="false" aria-controls="collapse_sub_10" class="collapsed">
          Higher and Degree Apprenticeship Programmes
        </a>
      </h4>
    </div>
    <div id="collapse_1421686_10" class="panel-collapse collapse" role="tabpanel" aria-labelledby="1421686_10">
      <div class="panel-body">
<div id="content_container_1572153">
<p>If you're looking to apply for a Higher or Degree Apprenticeship programme offered by the University of Salford then you will need to visit our apprenticeship webpage below and contact us via the online form. Applications for apprenticeship programmes do not go through the traditional UCAS system - you will apply directly to the University via a separate application form.&nbsp;</p><p><a href="https://www.salford.ac.uk/higher-and-degree-apprenticeships">www.salford.ac.uk/degree-apprenticeships</a></p><p>In order to apply for a Higher or Degree Apprenticeship you must be employed full-time in a relevant role and your employer must be willing to support you through the programme.</p><p>All apprenticeship roles are listed on the GOV.UK site below:&nbsp;</p><p><a href="https://www.gov.uk/apply-apprenticeship" target="_blank">www.gov.uk/apply-apprenticeship</a></p>
</div>
      </div>
    </div>
</div>
</div>"""]))
                # print("item['apply_proces_en']: ", item['apply_proces_en'])

                item['require_chinese_en'] = remove_class(clear_lianxu_space(["""<h2>Undergraduate</h2><p>Students who have completed a recognised foundation year or the first year of a relevant degree programme at a Chinese university may be considered</p>"""]))
                # print("item['require_chinese_en']: ", item['require_chinese_en'])

                duration = response.xpath(
                    "//strong[contains(text(), 'Duration')]/../following-sibling::*[position()<3]//text()").extract()
                clear_space(duration)
                print("duration: ", duration)
                # duration_str = ''.join(duration)

                if len(duration) == 1:
                    duration_list = getIntDuration(duration[0])
                    if len(duration_list) == 2:
                        item['duration'] = duration_list[0]
                        item['duration_per'] = duration_list[-1]
                else:
                    item['duration'] = ', '.join(duration).replace("Fees:", "").strip().strip(',').strip()
                print("item['duration'] = ", item['duration'])
                print("item['duration_per'] = ", item['duration_per'])

                ucascode = response.xpath(
                    "//strong[contains(text(),'UCAS Code:')]/..//text()").extract()
                clear_space(ucascode)
                print("ucascode: ", ucascode)
                if len(ucascode) > 0:
                    # if len(ucascode[-1]) == 4:
                    #     item['ucascode'] = ucascode[-1]
                    # else:
                    item['ucascode'] = ucascode[-1]
                print("item['ucascode'] = ", item['ucascode'])
                yield item
        except Exception as e:
            with open("scrapySchool_England_Ben/error/" + item['university'] + str(item['degree_type']) + ".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)