Python getTeachTime примеры использования

Язык программирования: Python

Пространство имен/Пакет: scrapySchool_England.getDuration

Метод/Функция: getTeachTime

Примеров на hotexamples.com: 27

Python getTeachTime - 27 примеров найдено. Это лучшие примеры Python кода для scrapySchool_England.getDuration.getTeachTime, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Пример #1

Показать файл

Файл: NewcastleUniversity_P.py Проект: histudent/python_spider

    def parse_data(self, response):
        """Parse a Newcastle University course page and yield the scraped item(s).

        Fields are filled from XPath queries on ``response`` and from the flat
        list of text nodes under ``//main[@id='content']//article``, which is
        sliced between section headings ("Profile", "Modules", "Careers",
        "Fees & Funding", "Entry Requirements", "How to Apply").

        Yields one item for a pure PhD or a non-PhD course, and two items
        (taught first, then PhD) when the degree name contains "phd" next to
        another degree.

        Any exception raised while parsing is caught: it is appended to a text
        file under ``scrapySchool_England/error/`` and printed, and nothing
        further is yielded for the page.

        :param response: the Scrapy response of the course page.
        """
        # assumes get_item1 returns an item whose text fields default to ""
        # (the code below compares them with "") -- TODO confirm
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "Newcastle University"
        item['url'] = response.url
        # Teaching mode
        item['teach_type'] = 'taught'
        # Degree type code (3 is used below for PhD)
        item['degree_type'] = 2
        print("===========================")
        print(response.url)
        try:
            # Programme title and degree name share the same <h1>.
            programmeDegree_type = response.xpath(
                "//div[@class='introTextArea']/h1//text()").extract()
            programmeDegree_type = ''.join(programmeDegree_type)
            print("programmeDegree_type: ", programmeDegree_type)
            degree_typeList = re.findall(
                r"(Doctor\sof\s\([\w\s]+\)$)|(Master\sof\s\([\w\s]+\)$)|(Graduate\sDiploma)|(\(PGCE\))|(\w+,\s\w+,\s\w+$)|(\w+,\s\w+$)|(\w+$)",
                programmeDegree_type)
            programme = programmeDegree_type
            if len(degree_typeList) != 0:
                # Only one alternative of the pattern matches, so joining the
                # group tuple of the first match yields that single group.
                degree_type = ''.join(degree_typeList[0])
                item['degree_name'] = degree_type.replace("(", "").replace(
                    ")", "").strip()
                programme = programmeDegree_type.replace(
                    item['degree_name'], "")
            print("item['degree_name']: ", item['degree_name'])
            item['programme_en'] = programme.strip().strip(",").strip()
            print("item['programme_en']: ", item['programme_en'])

            # Department: first try the link in the expandable section, then
            # any element whose text mentions "School of" / "Faculty of".
            department = response.xpath(
                "//html//div[@class='contentSeparator textEditorArea expandable']//p[1]/a//text()"
            ).extract()
            if len(department) == 0:
                department = response.xpath(
                    "//*[contains(text(), 'School of')]/text()|//*[contains(text(), 'Faculty of')]/text()"
                ).extract()
            department_str = ';'.join(department).strip()
            dep = re.findall(r"School\sof[a-zA-Z\s,]+|Faculty\sof[a-zA-Z\s,]+",
                             department_str)
            for d in dep:
                if "Faculty" in d:
                    item['department'] = d.replace("Graduate School",
                                                   "").strip()
                    # Names longer than 55 characters are skipped in favour
                    # of a later candidate.
                    if len(item['department']) > 55:
                        continue
                    else:
                        break
                else:
                    item['department'] = dep[0]
                    if len(item['department']) > 55:
                        item['department'] = dep[-1]

            # All text nodes of the page's main article.
            allcontent = response.xpath(
                "//main[@id='content']//article//text()").extract()

            # overview: text between "Profile" and the first available end
            # heading, tried in this order.
            if "Profile" in allcontent:
                overviewIndex = allcontent.index("Profile")
                for end_marker in ("Delivery", "Facilities", "Pathway"):
                    if end_marker in allcontent:
                        overviewIndexEnd = allcontent.index(end_marker)
                        overview = allcontent[overviewIndex +
                                              1:overviewIndexEnd]
                        # presumably cleans the list in place (return value
                        # is unused) -- verify against the helper
                        clear_space(overview)
                        item['overview_en'] = clear_lianxu_space(overview)
                        break
            if len(item['overview_en']) != 0:
                item['overview_en'] = "<div>" + item['overview_en'] + "</div>"
            if item['overview_en'] == "":
                overview_r = response.xpath(
                    "//h2[contains(text(),'Profile')]|"
                    "//h2[contains(text(),'Profile')]/following-sibling::*[position()<20]|"
                    "//h2[contains(text(),'Profile')]/..|"
                    "//h2[contains(text(),'Profile')]/../following-sibling::*[position()<20]|"
                    "//p[@class='intro']").extract()
                # Fix: the fallback used to be stored in item['career_en'],
                # leaving the overview empty and polluting the career field.
                item['overview_en'] = remove_class(
                    clear_lianxu_space(overview_r))
            if item['overview_en'] == "":
                print("overview_en 为空")

            # modules: text between "Modules" and the first available end
            # heading.
            if "Modules" in allcontent:
                modulesIndex = allcontent.index("Modules")
                for end_marker in ("Explore Careers", "Fees & Funding"):
                    if end_marker in allcontent:
                        modulesIndexEnd = allcontent.index(end_marker)
                        modules = allcontent[modulesIndex + 1:modulesIndexEnd]
                        clear_space(modules)
                        item['modules_en'] = clear_lianxu_space(modules)
                        break
            if len(item['modules_en']) != 0:
                # One <p> per line of the cleaned text.
                modules_tmp = ''.join(
                    "<p>" + m + "</p>"
                    for m in item['modules_en'].split('\n'))
                item['modules_en'] = "<div>" + modules_tmp + "</div>"

            if item['modules_en'] == "":
                item['modules_en'] = None
                print("modules_en 为空")
            print("item['modules_en']: ", item['modules_en'])

            # career
            if "Careers" in allcontent:
                careerIndex = allcontent.index("Careers")
                if "Explore Fees & Funding" in allcontent:
                    careerIndexEnd = allcontent.index("Explore Fees & Funding")
                    career = allcontent[careerIndex + 2:careerIndexEnd]
                    clear_space(career)
                    item['career_en'] = clear_lianxu_space(career)
            elif "Training & Skills" in allcontent:
                careerIndex = allcontent.index("Training & Skills")
                if "Fees & Funding" in allcontent:
                    careerIndexEnd = allcontent.index("Fees & Funding")
                    career = allcontent[careerIndex + 2:careerIndexEnd]
                    clear_space(career)
                    item['career_en'] = clear_lianxu_space(career)
            if len(item['career_en']) != 0:
                item['career_en'] = "<div>" + item['career_en'] + "</div>"
            if item['career_en'] == "":
                career_r = response.xpath(
                    "//h2[contains(text(),'Careers')]/../preceding-sibling::*[1]/following-sibling::*[position()<9]|"
                    "//h3[contains(text(),'Careers')]|//h3[contains(text(),'Careers')]/following-sibling::*[position()<5]|"
                    "//h3[contains(text(),'Accreditation')]|//h3[contains(text(),'Accreditation')]/following-sibling::*[position()<5]|"
                    "//h3[contains(text(),'Your development')]/../..|"
                    "//span[contains(text(),'Your development')]/..|//span[contains(text(),'Your development')]/../following-sibling::*[position()<10]"
                ).extract()
                item['career_en'] = remove_class(clear_lianxu_space(career_r))
            if item['career_en'] == "":
                print("******career")

            work_experience_desc_en = response.xpath(
                "//h3[contains(text(),'Work experience')]/preceding-sibling::*[1]/following-sibling::*[position()<5]"
            ).extract()
            item['work_experience_desc_en'] = remove_class(
                clear_lianxu_space(work_experience_desc_en))

            # tuition_fee
            if "Fees & Funding" in allcontent:
                tuition_feeIndex = allcontent.index("Fees & Funding")
                if "Entry Requirements" in allcontent:
                    tuition_feeIndexEnd = allcontent.index(
                        "Entry Requirements")
                    tuition_fee = allcontent[tuition_feeIndex +
                                             2:tuition_feeIndexEnd]
                    clear_space(tuition_fee)
                    maxfee = getTuition_fee(''.join(tuition_fee))
                    item['tuition_fee'] = maxfee
                    item['tuition_fee_pre'] = "£"
            if item['tuition_fee'] == 0:
                item['tuition_fee'] = None

            # Academic entry requirements ('rntry_requirements' is the item's
            # actual key spelling and must not be corrected here).
            if "Entry Requirements" in allcontent:
                entry_requirementsIndex = allcontent.index(
                    "Entry Requirements")
                if "How to Apply" in allcontent:
                    entry_requirementsIndexEnd = allcontent.index(
                        "How to Apply")
                    entry_requirements = allcontent[
                        entry_requirementsIndex + 1:entry_requirementsIndexEnd]
                    clear_space(entry_requirements)
                    item['rntry_requirements'] = clear_lianxu_space(
                        entry_requirements).replace("Find out How to Apply",
                                                    "").strip()

            # how_to_apply: everything after the "How to Apply" heading.
            if "How to Apply" in allcontent:
                how_to_applyIndex = allcontent.index("How to Apply")
                how_to_apply = allcontent[how_to_applyIndex + 1:]
                clear_space(how_to_apply)
                item['apply_proces_en'] = clear_lianxu_space(how_to_apply)
            if item['apply_proces_en'] != "":
                item['apply_proces_en'] = "<div>" + item[
                    'apply_proces_en'] + "</div>"

            # Application fee. Fix: only the first match is used; joining all
            # matches concatenated the digits of repeated mentions
            # (e.g. two "£50" became 5050).
            apply_fee_re = re.findall(r"application\sfee\sof\s£(\d+)",
                                      item['apply_proces_en'])
            if len(apply_fee_re) > 0:
                item['apply_fee'] = int(apply_fee_re[0])
                item['apply_pre'] = "£"

            # Fall back to the Business School when no department was found.
            department = re.findall(r"Newcastle\sUniversity\sBusiness\sSchool",
                                    ''.join(allcontent))
            if len(department) > 0 and item['department'] == "":
                item['department'] = department[0]

            # IELTS / TOEFL: the language-requirements web service is queried
            # by the page title (spaces percent-encoded by hand).
            prt = programmeDegree_type.replace(" ", "%20")
            ieltsToeflUrl = "http://includes.ncl.ac.uk/cmswebservices/pg/languageRequirements/ws.php?title=" + prt
            # Expected to fill item['ielts_desc'] and item['toefl_desc'].
            self.parse_ieltsToefl(ieltsToeflUrl, item)

            if item['ielts_desc'] == "":
                ielt_dd = response.xpath(
                    "//*[contains(text(),'IELTS')]/..//text()").extract()
                clear_space(ielt_dd)
                # NOTE(review): the literal "65" is stripped from the text;
                # the reason is not visible here -- confirm before changing.
                item['ielts_desc'] = ''.join(ielt_dd).replace("65", "").strip()
                if item['ielts_desc'] != "":
                    item['ielts_desc'] = ''.join(
                        re.findall(r".{1,90}IELTS.{1,90}",
                                   item['ielts_desc'])).strip()

            # Map the numbers found in the description onto overall and
            # per-skill scores, depending on how many numbers were found.
            if item['ielts_desc'] != "":
                ielts_list = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
                if len(ielts_list) == 1:
                    item['ielts'] = ielts_list[0]
                    item['ielts_l'] = ielts_list[0]
                    item['ielts_s'] = ielts_list[0]
                    item['ielts_r'] = ielts_list[0]
                    item['ielts_w'] = ielts_list[0]
                elif len(ielts_list) == 2:
                    item['ielts'] = ielts_list[0]
                    item['ielts_l'] = ielts_list[1]
                    item['ielts_s'] = ielts_list[1]
                    item['ielts_r'] = ielts_list[1]
                    item['ielts_w'] = ielts_list[1]
                elif len(ielts_list) == 3:
                    item['ielts'] = ielts_list[0]
                    item['ielts_l'] = ielts_list[2]
                    item['ielts_s'] = ielts_list[2]
                    item['ielts_r'] = ielts_list[2]
                    item['ielts_w'] = ielts_list[1]
                elif len(ielts_list) > 3:
                    item['ielts'] = ielts_list[0]
                    item['ielts_l'] = ielts_list[1]
                    item['ielts_s'] = ielts_list[1]
                    item['ielts_r'] = ielts_list[1]
                    item['ielts_w'] = ielts_list[1]

            if item['toefl_desc'] != "":
                toefl_list = re.findall(r"\d\d+", item['toefl_desc'])
                if len(toefl_list) == 1:
                    item['toefl'] = toefl_list[0]
                    item['toefl_l'] = toefl_list[0]
                    item['toefl_r'] = toefl_list[0]
                    item['toefl_s'] = toefl_list[0]
                    item['toefl_w'] = toefl_list[0]
                elif len(toefl_list) == 2:
                    item['toefl'] = toefl_list[0]
                    item['toefl_l'] = toefl_list[1]
                    item['toefl_r'] = toefl_list[1]
                    item['toefl_s'] = toefl_list[1]
                    item['toefl_w'] = toefl_list[1]
                elif len(toefl_list) == 4:
                    item['toefl'] = toefl_list[0]
                    item['toefl_l'] = toefl_list[3]
                    item['toefl_r'] = toefl_list[1]
                    item['toefl_s'] = toefl_list[2]
                    item['toefl_w'] = toefl_list[1]
                elif len(toefl_list) == 5:
                    item['toefl'] = toefl_list[0]
                    item['toefl_l'] = toefl_list[1]
                    item['toefl_r'] = toefl_list[2]
                    item['toefl_s'] = toefl_list[3]
                    item['toefl_w'] = toefl_list[4]
            # When TOEFL reading and writing share a score that differs from
            # listening and speaking, IELTS reading is aligned with IELTS
            # writing.
            if item['toefl_w'] == item['toefl_r'] and item['toefl_w'] != item[
                    'toefl_l'] and item['toefl_w'] != item['toefl_s']:
                item['ielts_r'] = item['ielts_w']

            # Entry requirements for Chinese applicants (static text; the
            # Business School has its own variant).
            item[
                'require_chinese_en'] = """<h3>Postgraduate entry requirements (excluding Newcastle University Business School)</h3><p>Typically we recognise:</p><ul><li>75% in a four Year Bachelor degree from a Tier 1 University or 80% from a Tier 2 University as comparable to a <em>2.1</em> and 70% from a Tier 1 University or 75% from a Tier 2 University as comparable to a <em>2.2</em></li><li>80% in an Adult degree or E-learning as comparable to a<em> 2.1</em> and 75% as equivalent to <em>2.2</em></li></ul><p>You may be considered for <em>entry to masters study with a three year university diploma</em> and more than <em>five years of relevant work experience</em>.</p>"""
            if item['department'] == "Newcastle University Business School":
                item[
                    'require_chinese_en'] = """<h3>Newcastle University Business School postgraduate entry requirements</h3><p>Typically we recognise 75% in a four year bachelor degree from a Tier 1 university or 82% from a Tier 2 university as comparable to a<em> 2.1</em> in the following courses:</p><ul><li>Banking and Finance MSc</li><li>Finance MSc</li><li>International Economics and Finance MSc</li><li>Quantitative Finance and Risk Management MSc</li></ul><p>For all other Newcastle University Business School postgraduate courses typically we recognise:</p><ul><li>80% in a four year Bachelor degree from a Tier 1 University as comparable to a <em>2.1</em></li><li>or 85% from a Tier 2 University as comparable to a<em> 2.1</em></li></ul><p>If you <em>do not meet these requirements</em> you may be eligible to apply for entry to a <em>course at INTO Newcastle University</em>. These courses are based on our campus and help you to prepare for study with us.</p>"""

            item['location'] = "Newcastle University, NE1 7RU, United Kingdom"

            # duration
            duration = response.xpath(
                "//div[@class='introTextArea']/p/text()").extract()
            duration_str = '\n'.join(duration).strip()
            item['teach_time'] = getTeachTime(duration_str)
            duration_list = getIntDuration(duration_str)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]

            degree_name_lower = item['degree_name'].lower()
            if degree_name_lower == "phd":
                # Pure PhD: store a single record typed as PhD.
                item['degree_type'] = 3
                item['teach_type'] = 'phd'
                yield item
            elif "phd" in degree_name_lower:
                # PhD combined with a taught degree: store two records.
                # NOTE(review): the same item object is yielded twice and
                # mutated in between; if the consumer keeps a reference
                # instead of processing it immediately, both records end up
                # with the PhD values -- consider yielding a copy first.
                yield item
                duration_phd_re = re.findall(r"PhD.*", duration_str, re.I)
                duration_phd_list = getIntDuration(''.join(duration_phd_re))
                if len(duration_phd_list) == 2:
                    item['duration'] = duration_phd_list[0]
                    item['duration_per'] = duration_phd_list[-1]
                item['degree_type'] = 3
                item['teach_type'] = 'phd'
                yield item
            else:
                item['degree_type'] = 2
                item['teach_type'] = 'taught'
                yield item
        except Exception as e:
            # Deliberate best-effort: a page that fails to parse is logged to
            # a per-university error file and skipped instead of aborting.
            with open("scrapySchool_England/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #2

Показать файл

    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "https://www.bathspa.ac.uk/"
        item['university'] = "Bath Spa University"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        print("===========================")
        print(response.url)
        try:
            item['location'] = 'Bath'
            # 专业、学位类型//div[@class='masthead-inner']/div/div[@class='masthead-content']/h1
            programme = response.xpath(
                "//div[@class='masthead-inner']/div/div[@class='masthead-content']/h1//text()"
            ).extract()
            item['programme_en'] = ''.join(programme)
            print("item['programme_en']: ", item['programme_en'])

            degree_type = response.xpath(
                "//div[@class='masthead-inner']/div/div[@class='masthead-content']/p[1]//text()"
            ).extract()
            item['degree_name'] = ''.join(degree_type)
            # print("item['degree_name']: ", item['degree_name'])
            if item['degree_name'] == "" and "phd" in item[
                    'programme_en'].lower(
                    ) or item['degree_name'] == "" and "doctorate" in item[
                        'programme_en'].lower():
                item['degree_name'] = 'phd'
                item['teach_type'] = 'phd'
                # 学位类型
                item['degree_type'] = 3
            # print("item['teach_type']: ", item['teach_type'])
            # print("item['degree_type']: ", item['degree_type'])
            print("item['degree_name']: ", item['degree_name'])

            # //div[@class='content']/div[@class='collapsible-content'][1]/div[2]/div[1]
            overview = response.xpath(
                "//h3[contains(text(),'Overview')]/..").extract()
            if len(overview) == 0:
                overview = response.xpath(
                    "//h3[contains(text(),'overview')]/..").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en']: ", item['overview_en'])

            modules = response.xpath(
                "//h3[contains(text(),'Course structure')]/..|//h3[contains(text(),'Course modules')]/.."
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            assessment_en = response.xpath(
                "//h3[contains(text(),'How will I be assessed?')]/..|//h3[contains(text(),'How will I be taught?')]/..|//h3[contains(text(),'Assessment')]/.."
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            career = response.xpath(
                "//h3[contains(text(),'Career')]/..|//h3[contains(text(),'career')]/.."
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            feeContent = response.xpath(
                "//h3[contains(text(),'International students full time')]/../div/table[1]//td[contains(text(), '2018/19 entry')]/following-sibling::td//text()"
            ).extract()
            clear_space(feeContent)
            # print(feeContent)
            if len(feeContent) > 0:
                item['tuition_fee'] = int(feeContent[0].replace(
                    "£", "").replace(",", "").strip())
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            # //div[@class='content']/div[@class='collapsible-content highlighted']/div[2]/div[2]
            entry_requirements = response.xpath(
                "//div[@class='content']/div[@class='collapsible-content highlighted']//text()"
            ).extract()
            clear_space(entry_requirements)
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ieltsList = re.findall(r".{1,60}IELTS.{1,60}",
                                   item['rntry_requirements'])
            item['ielts_desc'] = ''.join(ieltsList)
            print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            print(
                "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                % (item['ielts'], item['ielts_l'], item['ielts_s'],
                   item['ielts_r'], item['ielts_w']))

            interview_desc_en = response.xpath(
                "//h3[contains(text(),'Interview and portfolio guidance')]/.."
            ).extract()
            item['interview_desc_en'] = remove_class(
                clear_lianxu_space(interview_desc_en))
            # print("item['interview_desc_en']: ", item['interview_desc_en'])

            # https://www.bathspa.ac.uk/international/country-advice/china/
            item[
                'require_chinese_en'] = "<p><strong>Postgraduate</strong></p><ul><li>Normally a Bachelor's degree with honours and a good passing grade from an internationally recognised university or Higher Education institution</li><li>Other international qualifications to an equivalent standard will also be considered.</li></ul> "

            # https://www.bathspa.ac.uk/applicants/how-to-apply/postgraduate/
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<div class="content">
      <div data-hash-anchor='<a id="d.en.1289"></a>'></div>
<div class="intro-text">
	<p class="intro">You can apply for one of our taught postgraduate courses online from the webpage for the course you're interested in.</p>
</div><div class="rich-text" >
  <div data-hash-anchor='<a id="d.en.1291"></a>'></div>
    <div>
        <h2>How to apply</h2>
<p>To apply simply hit on the "Apply Now" on the course’s webpage. You'll need to create an online account.</p>
<p>Don’t have time to complete your whole application? Don’t worry, you can save your application and come back to it at anytime.</p>
<p>Entry requirements are listed on the course's webpage. If you don’t hold a first degree you may be required to provide additional evidence to support your application.</p>
<p><a href="/courses/">Search for your course</a></p>
<h3>What do I need?</h3>
<p>As part of the online application you’ll need to upload a variety of documents. This may include:</p>
<ul>
<li>Copy of passport</li>
<li>Qualifications</li>
<li>Portfolio</li>
<li>Previous UK visas (if applicable)</li>
<li>Reference.</li>
</ul>
<h3>Contact us</h3>
<p>Please contact us if you have any questions or concerns:&nbsp;<a href="mailto:[email protected]">[email protected]</a></p>
<p>Phone: +44 (0)1225 876180</p>
<h3>Interviews</h3>
<p>You may be required to attend an interview as part of the selection process for a postgraduate course. This is usually a 30 minute discussion of your experience and any work submitted with the application.</p>
<p>Telephone or Skype interviews can usually be arranged for applicants applying from outside of the UK.</p>
    </div>
</div>
"""
                ]))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            #             department_dict = {"arts management":"Bath Business School","accounting and finance":"Bath Business School",
            # "business and management":"Bath Business School",
            # "business and management (accounting)":"Bath Business School",
            # "business and management (entrepreneurship)":"Bath Business School",
            # "business and management (international business)":"Bath Business School",
            # "business and management (marketing)":"Bath Business School",
            # "curatorial practice":"Bath School of Art and Design",
            # "design (ceramics)":"Bath School of Art and Design",
            # "design (fashion and textiles)":"Bath School of Art and Design",
            # "fine art":"Bath School of Art and Design",
            # "visual communication":"Bath School of Art and Design",
            # "children's publishing":"College of Liberal Arts",
            # "classical acting":"College of Liberal Arts",
            # "composition":"College of Liberal Arts",
            # "creative producing":"College of Liberal Arts",
            # "creative writing":"College of Liberal Arts",
            # "creative writing phd":"College of Liberal Arts",
            # "crime and gothic fictions":"College of Liberal Arts",
            # "dance":"College of Liberal Arts",
            # "directing":"College of Liberal Arts",
            # "directing circus":"College of Liberal Arts",
            # "environmental humanities":"College of Liberal Arts",
            # "environmental management":"College of Liberal Arts",
            # "feature filmmaking":"College of Liberal Arts",
            # "heritage management":"College of Liberal Arts",
            # "intercultural musicology":"College of Liberal Arts",
            # "liberal arts":"College of Liberal Arts",
            # "literature, landscape and environment":"College of Liberal Arts",
            # "music performance":"College of Liberal Arts",
            # "performing shakespeare":"College of Liberal Arts",
            # "principles of applied neuropsychology":"College of Liberal Arts",
            # "scriptwriting":"College of Liberal Arts",
            # "songwriting (campus based)":"College of Liberal Arts",
            # "songwriting (distance learning)":"College of Liberal Arts",
            # "sound (arts)":"College of Liberal Arts",
            # "sound (design)":"College of Liberal Arts",
            # "sound (production)":"College of Liberal Arts",
            # "theatre for young audiences":"College of Liberal Arts",
            # "transnational writing":"College of Liberal Arts",
            # "travel and nature writing":"College of Liberal Arts",
            # "writing for young people":"College of Liberal Arts",
            # "counselling and psychotherapy practice":"Institute for Education",
            # "education (education studies)":"Institute for Education",
            # "education (early childhood studies)":"Institute for Education",
            # "education (international education)":"Institute for Education",
            # "education (leadership and management)":"Institute for Education",
            # "inclusive education":"Institute for Education",
            # "professional practice":"Institute for Education",
            # "professional practice in higher education":"Institute for Education",
            # "teaching english to speakers of other languages":"Institute for Education",
            # "specific learning difficulties / dyslexia":"Institute for Education",
            # "national award for special educational needs coordination":"Institute for Education",
            # "professional doctorate in education":"Institute for Education",
            # }
            #             item['department'] = department_dict.get(item['programme_en'].lower())
            #             print("item['department']: ", item['department'])
            department = response.xpath(
                "//dt[contains(text(),'School')]/following-sibling::dd[1]//text()"
            ).extract()
            item['department'] = ''.join(department).strip()
            print("item['department']: ", item['department'])

            location = response.xpath(
                "//dt[contains(text(),'Campus or location')]/following-sibling::dd[1]//text()"
            ).extract()
            item['location'] = ''.join(location).strip()
            print("item['location']: ", item['location'])

            # duration
            durationMode = response.xpath(
                "//dt[contains(text(),'Course length')]/following-sibling::dd[1]//text()"
            ).extract()
            clear_space(durationMode)
            print("durationMode: ", durationMode)
            durationMode = ''.join(durationMode)
            duration_list = getIntDuration(''.join(durationMode))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            item['teach_time'] = getTeachTime(''.join(durationMode))
            print("item['duration']: ", item['duration'])
            print("item['teach_time']: ", item['teach_time'])
            print("item['duration_per']: ", item['duration_per'])
            item['other'] = durationMode
            yield item
        except Exception as e:
            with open("scrapySchool_England/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #3

Показать файл

Файл: SouthamptonSolentUniversity_P.py Проект: histudent/python_spider

    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "Southampton Solent University"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        print("===========================")
        print(response.url)
        try:
            degree_name = response.xpath(
                "//div[@class='row column']/h1/abbr/text()").extract()
            item['degree_name'] = ''.join(degree_name).strip()
            print("item['degree_name']: ", item['degree_name'])

            programme = response.xpath(
                "//div[@class='row column']/h1/text()").extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            # start_date = response.xpath("//dt[contains(text(), 'Start date')]/following-sibling::dd[1]//text()").extract()
            # clear_space(start_date)
            # # print("start_date: ", start_date)
            # item['start_date'] = getStartDate(''.join(start_date))
            # # print("item['start_date']: ", item['start_date'])

            duration = response.xpath(
                "//div[@class='banner__stats']//text()").extract()
            clear_space(duration)
            # print("duration: ", ' '.join(duration))
            duration_list = getIntDuration(' '.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            overview_en = response.xpath(
                "//section[@class='intro intro--courses section']").extract()
            item['overview_en'] = remove_class(
                clear_lianxu_space(overview_en)).replace(
                    "Book an open day", "")
            # print("item['overview_en']: ", item['overview_en'])

            tuition_fee = response.xpath(
                "//html//div[@class='facts-figures__panel panel']/p[contains(text(),'The tuition fees for the 2018/19 academic year are:')]//text()|"
                "//html//div[@class='facts-figures__panel panel']/p[contains(text(),'The tuition fees for the 2018/19 academic year are:')]/following-sibling::p[1]//text()"
            ).extract()
            # if len(tuition_fee) == 0:
            #     tuition_fee = response.xpath(
            #         "//html//div[@class='facts-figures__panel panel']/p[contains(text(),'The tuition fees for the 2018/19 academic year are:')]/following-sibling::p[1]//text()").extract()
            clear_space(tuition_fee)
            # print("tuition_fee: ", ''.join(tuition_fee))
            tuition_fee_re = re.findall(
                r"International\sfull-time\sfees:£\d+,\d+|Internationalfull-timefees:£\d+,\d+|Internationaltotal\scoursefees:£\d+,\d+",
                ''.join(tuition_fee))
            # print("tuition_fee_re: ", tuition_fee_re)
            item['teach_time'] = getTeachTime(''.join(tuition_fee_re))
            # print("item['teach_time']: ", item['teach_time'])

            tuition_fee_re1 = re.findall(r"\d+,\d+", ''.join(tuition_fee_re))
            if len(tuition_fee_re1) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee_re1))
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            entry_requirements = response.xpath(
                "//h3[@class='facts-figures__header'][contains(text(),'Key entry requirements')]/../..//text()"
            ).extract()
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts_desc = response.xpath(
                "//h3[contains(text(),'English language requirements')]/..//*[contains(text(), 'IELTS')]//text()"
            ).extract()
            clear_space(ielts_desc)
            # print("ielts_desc: ", ielts_desc)
            item['ielts_desc'] = ''.join(ielts_desc)
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_list = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
            if len(ielts_list) == 1:
                item['ielts'] = ielts_list[0]
                item['ielts_l'] = ielts_list[0]
                item['ielts_s'] = ielts_list[0]
                item['ielts_r'] = ielts_list[0]
                item['ielts_w'] = ielts_list[0]
            elif len(ielts_list) == 2:
                item['ielts'] = ielts_list[0]
                item['ielts_l'] = ielts_list[1]
                item['ielts_s'] = ielts_list[1]
                item['ielts_r'] = ielts_list[1]
                item['ielts_w'] = ielts_list[1]
            elif len(ielts_list) == 3:
                item['ielts'] = ielts_list[0]
                item['ielts_l'] = ielts_list[2]
                item['ielts_s'] = ielts_list[2]
                item['ielts_r'] = ielts_list[2]
                item['ielts_w'] = ielts_list[1]
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            toefl_desc = response.xpath(
                "//h3[contains(text(),'English language requirements')]/..//*[contains(text(), 'TOEFL')]//text()"
            ).extract()
            clear_space(toefl_desc)
            # print("ielts_desc: ", ielts_desc)
            item['toefl_desc'] = ''.join(toefl_desc)
            # print("item['toefl_desc']: ", item['toefl_desc'])

            toefl_list = re.findall(r"\d\d+", item['toefl_desc'])
            if len(toefl_list) == 1:
                item['toefl'] = toefl_list[0]
                item['toefl_l'] = toefl_list[0]
                item['toefl_r'] = toefl_list[0]
                item['toefl_s'] = toefl_list[0]
                item['toefl_w'] = toefl_list[0]
            elif len(toefl_list) == 2:
                item['toefl'] = toefl_list[0]
                item['toefl_l'] = toefl_list[1]
                item['toefl_r'] = toefl_list[1]
                item['toefl_s'] = toefl_list[1]
                item['toefl_w'] = toefl_list[1]
            elif len(toefl_list) == 3:
                item['toefl'] = toefl_list[0]
                item['toefl_l'] = toefl_list[2]
                item['toefl_r'] = toefl_list[2]
                item['toefl_s'] = toefl_list[2]
                item['toefl_w'] = toefl_list[1]
            elif len(toefl_list) == 5:
                item['toefl'] = toefl_list[0]
                item['toefl_l'] = toefl_list[1]
                item['toefl_r'] = toefl_list[2]
                item['toefl_s'] = toefl_list[3]
                item['toefl_w'] = toefl_list[4]
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #        item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            assessment_en = response.xpath(
                "//*[contains(text(),'Teaching')]/..|//*[contains(text(),'Assessment')]/.."
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            if item['assessment_en'] == "":
                print("***")
            print("item['assessment_en']: ", item['assessment_en'])

            work_experience_desc_en = response.xpath(
                "//*[contains(text(),'Work experience')]/..").extract()
            item['work_experience_desc_en'] = remove_class(
                clear_lianxu_space(work_experience_desc_en))
            # print("item['work_experience_desc_en']: ", item['work_experience_desc_en'])

            how_to_apply = response.xpath(
                "//h3[@class='subheader']/..").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(how_to_apply))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            location = response.xpath(
                "//h4[contains(text(),'Study location')]/following-sibling::*[1]//text()"
            ).extract()
            item['location'] = ''.join(location).strip()
            # print("item['location']: ", item['location'])

            modules = response.xpath(
                "//a[contains(text(),'Programme specification document')]/../../preceding-sibling::*"
            ).extract()
            if len(modules) == 0:
                # //h2[contains(text(),'Support')]/../../preceding-sibling::*
                modules = response.xpath(
                    "//h2[contains(text(),'Support')]/../../preceding-sibling::*"
                ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # if item['modules_en'] == "":
            #     print("***")
            # print("item['modules_en']: ", item['modules_en'])

            career_en = response.xpath(
                "//h2[@class='header'][contains(text(),'Industry links')]/.."
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))
            # print("item['career_en']: ", item['career_en'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<h2>Entry requirements</h2>
<p>As a general guide, we look for qualifications that are equivalent to the British high school A-levels. A portfolio is also required for most of our art and design courses.</p>
<p>Students with a good Senior High School Diploma and an IELTS of minimum 5.5 may be eligible for a foundation year (level 0 of a bachelor's degree) or an HND programme.</p>
<p>For postgraduate courses, we look for qualifications that are equivalent to the British&nbsp;bachelor's degree.</p>
"""
                ]))
            # print("item['require_chinese_en']: ", item['require_chinese_en'])

            # department = response.xpath("//dt[contains(text(), 'Department')]/following-sibling::dd[1]//text()").extract()
            # item['department'] = ''.join(department).strip()
            # print("item['department']: ", item['department'])

            yield item
        except Exception as e:
            with open(item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #4

Показать файл

Файл: UniversityOfChester_P.py Проект: histudent/python_spider

    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "University of Chester"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        print("===========================")
        print(response.url)
        try:
            programme = response.xpath("//h1[@id='main-content']/text()").extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            degree_type = response.xpath("//h1[@id='main-content']/div//text()").extract()
            item['degree_name'] = ''.join(degree_type).strip()
            print("item['degree_name']: ", item['degree_name'])

            if "doctor of" in item['programme_en'].lower() or item['degree_name'].lower() == "mres":
                item['teach_type'] = 'phd'
                item['degree_type'] = 3
            print("item['teach_type']: ", item['teach_type'])
            print("item['degree_type']: ", item['degree_type'])

            start_date = response.xpath("//span[@class='m-facts__fact']//text()|"
                                        "//select[@id='edit-date']//option[@selected='selected']//text()").extract()
            clear_space(start_date)
            print("start_date: ", start_date)
            item['start_date'] = getStartDate(''.join(start_date))
            print("item['start_date']: ", item['start_date'])

            mode = response.xpath("//select[@id='edit-mode']//text()").extract()
            clear_space(mode)
            item['teach_time'] = getTeachTime(''.join(mode))
            print("item['teach_time']: ", item['teach_time'])

            location = response.xpath("//label[@for='edit-compulsory']/following-sibling::*//text()").extract()
            item['location'] = ''.join(location).strip()
            print("item['location']: ", item['location'])

            duration = response.xpath("//dt[@class='m-facts__label']//following-sibling::*//text()").extract()
            clear_space(duration)
            print("duration: ", duration)
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            print("item['duration'] = ", item['duration'])
            print("item['duration_per'] = ", item['duration_per'])

            overview_en = response.xpath(
                "//h3[@class='field-label'][contains(text(),'Course overview')]/../*[position()<last()]|"
                "//div[@class='m-body__margin-bottom t-course__overview']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))
            # print("item['overview_en']: ", item['overview_en'])

            entry_requirements = response.xpath("//div[@id='entry-international']//form[@id='courses-international-form']/preceding-sibling::*//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts_desc = response.xpath("//div[@id='entry-international']//li[contains(text(),'Postgraduate:')]//text()").extract()
            clear_space(ielts_desc)
            # print("ielts_desc: ", ielts_desc)
            item['ielts_desc'] = ''.join(ielts_desc)
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            assessment_en = response.xpath("//h3[@class='field-label'][contains(text(),'How will I be taught?')]/..|"
                                           "//h3[@class='field-label'][contains(text(),'How will I be assessed?')]/..").extract()
            item['assessment_en'] = remove_class(clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            tuition_fee = response.xpath("//div[@class='field-fees-international']/p//text()").extract()
            print("tuition_fee: ", tuition_fee)
            if len(tuition_fee) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            career_en = response.xpath("//div[@id='careers-job-prospects']").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en)).replace("<div></div>", "").strip()
            # print("item['career_en']: ", item['career_en'])

            modules = re.findall(r"function\sinit_drupal_core_settings\(\)\s{jQuery\.extend\(Drupal\.settings,.*}", response.text)
            # print("modules: ", modules)
            modules_str = ''.join(modules).replace("function init_drupal_core_settings() {jQuery.extend(Drupal.settings,", "").strip()
            modules_dict = json.loads(modules_str)
            print("modules_dict: ", modules_dict)
            # groupCode     modulesNid
            print(modules_dict.get("courses"))
            # if modules_dict.get('courses').get('groupCode') is not False:
            if modules_dict.get('courses').get('groupCode') is not None:
                modules_json = "https://www1.chester.ac.uk/courses/modules/ajax/"+modules_dict.get('courses').get('modulesNid')+"/"+modules_dict.get('courses').get('groupCode')+"/389"
                # print("modules_json: ", modules_json)
                mdict = json.loads(requests.get(modules_json).text)
                # print("mdict: ", len(mdict))
                m = mdict[-1].get('data')
                if m != None:
                    item['modules_en'] = remove_class(clear_lianxu_space([m]))
            print("item['modules_en']: ", item['modules_en'])

            item['apply_proces_en'] = remove_class(clear_lianxu_space(["""<div class="content">
    
  <h2>Before You Apply</h2>
<p>Please read the relevant course information carefully. If you would like to know more about a programme or research area, we suggest that you contact the programme leader or centre director, in writing, by telephone or by&nbsp;<a href="mailto:[email protected]">email</a>&nbsp;via Postgraduate Admissions. They will be able to answer your questions in more detail and send you further information.</p>
<h2>How to Apply</h2>
<p>If you are a Home/EU student applying for a postgraduate taught course, you should apply directly via the online application system (AIMS) via the link below.&nbsp; If you are an International student applying for a postgraduate taught course, you should apply via the <a href="http://www1.chester.ac.uk/study/postgraduate/how-apply/applying-taught-courses-international-applicants">International Centre</a>. If you are applying for a PGCE Primary, Secondary or Early Years programmes, please note there is a separate admissions&nbsp;procedure.&nbsp; Please email <a href="mailto:[email protected]">[email protected]</a> and we will forward your details on to PGCE Admissions. All&nbsp;<a href="http://www.chester.ac.uk/research/degrees/application">research degree</a>&nbsp;applicants,&nbsp;whether Home, EU or International, should visit the relevant web pages or follow the links on the right hand side of this page.</p>
<p>Paper application forms are no longer issued, except in cases where an online application would impossible for the candidate. Please complete the relevant online application on our website.&nbsp;</p>
<p>&nbsp;Once you have submitted your application, the system will automatically contact your referees on your behalf. Your application may not be considered without two appropriate references and all additional documents required with your application, which include:&nbsp;</p>
<ul>
<li>Copies of certificates/transcripts</li>
<li>Copy of English language proficiency certificate (if required).&nbsp;<strong>Applicants whose first language is not English must provide evidence of proficiency to IELTS 6.5 with no less than 5.5 in each band or equivalent.</strong></li>
<li>Full curriculum vitae (if required)</li>
<li>You may also be asked to complete a fees assessment in order to determine the level of tuition fee payable.</li>
</ul>
<p>Specific programmes require additional documents to be submitted with your application, e.g. Nutrition and Dietetics, Fine Art.</p>
<p>Before doing so, please ensure that you inform your referees. In most cases the references shall come from independent academic referees, i.e. they are not normally provided by the programme leader of the course you are applying for. Once your application is submitted, we will then forward it to the relevant programme leader for consideration. If your application is successful, an offer of a place will be made in writing by Postgraduate Admissions. This will either be unconditional or conditional, depending on the completeness of your application.</p>
<h2>Entry Requirements</h2>
<p>Usually, postgraduate applicants should have an appropriate first degree, with a minimum of second class honours or equivalent. However, if you do not have appropriate academic qualifications, you may be admitted by virtue of prior work experience or by demonstrating relevant knowledge and skills in a specific field. If you are unsure whether your qualifications are acceptable for admission to your chosen programme of study, contact the programme leader or Postgraduate Admissions for further advice.&nbsp;</p>
<p>If your qualifications or experience are not suitable, we will be able to advise you about further options that might bring you up to the required level necessary to enter the course of your choice.</p>
<p>Each course has its own entry requirements, which are shown on each individual course web page under the 'Entry requirements' tab.</p>
<p>For entry requirements relating to our PGCE<a href="/postgraduate/pgce-in-education-primary" title="PG Primary">&nbsp;Primary</a>,&nbsp;<a href="/postgraduate/pgce-secondary-programme" title="PG Secondary">Secondary&nbsp;</a>and&nbsp;<a href="/postgraduate/pgce-early-years" title="PG Early Years">Early Years</a>&nbsp;courses please refer to the relevant pages.</p>
<p>If you have any queries concerning the applications process please contact us at:</p>
<h4>T: 01244 512456/512474<br />
E:&nbsp;<a href="mailto:[email protected]" title="Postgraduate Enquiries">[email protected]</a></h4>
<p>&nbsp;</p>
<h2>Accreditation of Prior Learning (APL/APEL)</h2>
<p>To be admitted to a postgraduate course, evidence of your prior learning should be equal to higher education Level 3, now referred to as level 6, which is the final year of an undergraduate degree course, or other equivalent, e.g. related professional qualifications. A subject tutor will help you to determine how much of your prior learning can be credited against the course. This may not have been undertaken in an educational environment, but its value may be the same, or more. Information about how this system works and how professional qualification equivalence is available can be obtained from the subject departments.</p>
<p>We may give credit for a course, or part of a course, that would exempt you from having to study that area again. The onus is on you to prove that your learning and experience matches the area for which exemption is claimed.</p>
<p>There may be subject areas for which course attendance is compulsory and credit exemption does not apply, but, equally, there may be areas of study for which credit may be gained purely on the basis of your prior academic achievements or experience.</p>
<p>It is possible to claim credit for up to 66.7% of any award. Please note that this does not apply to MPhil or PhD courses as they have their own process known as 'Advance Standing'. Please contact&nbsp;<a href="mailto:[email protected]">Postgraduate Research Admissions</a>&nbsp;for further details.</p>
<p>If you have any queries or would like to find out more about CATS or APL/APEL, please contact the APL Officer within the relevant faculty.</p>
<h2>When do the programmes start?</h2>
<p>The majority of postgraduate programmes commence in early October each year, although some allow students to enter in January/February or April/May. For specific start dates for your chosen programme, please consult the relevant section of the website, or contact Postgraduate Admissions, who will be able to help you.</p>
<h2>What is the deadline for applications?</h2>
<p>There are no specific deadlines for most applications made directly to us, although there are some exceptions (check your programme details). The University will accept applications throughout the year, but we would generally advise that you send in your application form by the end of July to ensure that you have time to make any funding and/or accommodation arrangements, and for documents such as transcripts and references to be obtained if not submitted with the application. This will also give you more time to meet any conditions we may potentially attach to an offer.&nbsp; Some courses have earlier application deadlines.&nbsp; Please check the deadline that applies to the programme you are interested in before you apply. There is a strict deadline for applications to Nutrition and Dietetics and Social Work. Please refer to the relevant course web pages.</p>
<p>The deadline for PGCE applications is set by the Graduate Teacher Training Register (GTTR).</p>
<h2>Students with Disabilities</h2>
<p>We are committed to a policy of equal opportunities for applicants with disabilities or specific needs. Although applications from all prospective students are considered according to the same entry criteria, those of you who declare a disability or specific need will also be considered on an individual basis. As some of our buildings are old and not purpose-built, they may not be suitable for those of you with restricted mobility.&nbsp;</p>
<p>However, we are continually working to improve access routes and other facilities on campus to assist physically disabled students during their programmes of study. Wherever possible, we try to make arrangements or adaptations as appropriate, within the existing restrictions placed upon us.</p>
<p><strong>Good luck with your application!</strong></p>
<p><a class="m-link m-link--primary" href="https://flow.chester.ac.uk/tkflow_U/Flow.aspx?f=appform1.kdt&amp;template=template5&amp;course=PGT&amp;theme=redmond">Apply Now</a></p>
<div class="m-callout">
<p>If you're interested in a course at University Centre Shrewsbury, <a href="http://ucshrewsbury.ac.uk/postgraduate/apply">find out more about the application process.</a></p>
</div>
  </div>
"""]))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            item['require_chinese_en'] = remove_class(clear_lianxu_space([""" <div class="field-collection-view clearfix view-mode-full">
  <h3 class="field-course-type">
    Postgraduate Study  </h3>

  <ul><li>Bachelor's degree with 68% or above</li>
<li>East and West International Education (EWIE)/ Wiseway Global International Pre-Masters Programme at 60% or above</li>
<li>Dongfang International Centre for Education Exchange Top University Pre-Masters Programme at 60% or above</li>
<li>Applicants for the MBA should have 2 years work experience, although well qualified and motivated individuals without this will be considered</li>
</ul></div>  <div class="field-collection-view clearfix view-mode-full field-collection-view-final">
  <p><strong>Academic Requirements:</strong></p>
<ul><li>Master's degree with a recognised institution</li>
</ul><p><strong>English Requirements:</strong></p>
<ul><li><strong>IELTS: 6.5 (no less than 5.5 in any band)</strong></li>
</ul></div>"""]))
            # print("item['require_chinese_en']: ", item['require_chinese_en'])

            # department = response.xpath("//dt[contains(text(), 'Department')]/following-sibling::dd[1]//text()").extract()
            # item['department'] = ''.join(department).strip()
            # print("item['department']: ", item['department'])

            yield item
        except Exception as e:
            with open("scrapySchool_England/error/" + item['university'] + str(item['degree_type']) + ".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #5

Показать файл

Файл: LiverpoolHopeUniversity_P.py Проект: histudent/python_spider

    def parse_data(self, response):
        """Scrape one Liverpool Hope University taught-postgraduate course page.

        Populates a ``ScrapyschoolEnglandItem1`` from ``response`` and yields
        it. If anything raises during extraction, the error text and the page
        URL are appended to ``<university><degree_type>.txt`` and printed, and
        no item is yielded for that page.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "Liverpool Hope University"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # Degree level code
        item['degree_type'] = 2
        item['location'] = 'Hope Park, Liverpool, L16 9JD'
        print("===========================")
        print(response.url)
        try:
            # Programme title; a short bracketed suffix is taken as the degree name
            title_nodes = response.xpath(
                "//section[@id='pageContent']/div[@class='course_header']/h1//text()"
            ).extract()
            title_text = ''.join(title_nodes)
            bracketed_degree = ''.join(
                re.findall(r"\(.{1,10}\)|\(Postgraduate\sCertificate\)",
                           title_text.strip()))
            item['programme_en'] = title_text.replace(bracketed_degree,
                                                      "").title()
            item['degree_name'] = bracketed_degree.replace("(", "").replace(
                ")", "").strip()
            print("item['programme_en']: ", item['programme_en'])
            print("item['degree_name']: ", item['degree_name'])

            # Duration text feeds both the study mode and the numeric duration
            duration_nodes = response.xpath(
                "//strong[contains(text(),'Duration')]//text()").extract()
            clear_space(duration_nodes)
            duration_text = ''.join(duration_nodes)

            item['teach_time'] = getTeachTime(duration_text)
            duration_parts = getIntDuration(duration_text)
            if len(duration_parts) == 2:
                item['duration'] = duration_parts[0]
                item['duration_per'] = duration_parts[1]

            # Start month(s); several months are separated by '&' on the page
            start_nodes = response.xpath(
                "//strong[contains(text(),'Start month')]//text()").extract()
            clear_space(start_nodes)
            print("start_date: ", start_nodes)
            start_text = ''.join(start_nodes)
            if '&' in start_text:
                month_chunks = start_text.split('&')
                print(month_chunks)
                for chunk in month_chunks:
                    item['start_date'] += getStartDate(chunk.strip()) + ","
            else:
                item['start_date'] = getStartDate(start_text)
            item['start_date'] = item['start_date'].strip().strip(",").strip()
            print("item['start_date'] = ", item['start_date'])

            # HTML sections stored with their markup
            item['overview_en'] = remove_class(
                clear_lianxu_space(
                    response.xpath("//div[@id='overview']").extract()))

            item['modules_en'] = remove_class(
                clear_lianxu_space(
                    response.xpath("//div[@id='curriculum']").extract()))

            # Entry requirements are kept as plain text (text nodes only)
            requirement_nodes = response.xpath(
                "//div[@id='entry_reqs']//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(requirement_nodes)

            item['assessment_en'] = remove_class(
                clear_lianxu_space(
                    response.xpath(
                        "//div[@id='teaching_research']").extract()))

            item['career_en'] = remove_class(
                clear_lianxu_space(
                    response.xpath("//div[@id='careers']").extract()))

            # International tuition fee: first "£..." amount after the heading
            fee_nodes = response.xpath(
                "//h2[contains(text(),'INTERNATIONAL TUITION FEES')]/following-sibling::p//text()"
            ).extract()
            clear_space(fee_nodes)
            fee_amounts = re.findall(r"£[\d,]+", ''.join(fee_nodes))
            if fee_amounts:
                first_amount = fee_amounts[0]
                item['tuition_fee'] = int(
                    first_amount.replace("£", "").replace(",", "").strip())
                item['tuition_fee_pre'] = "£"

            # Lower-cased department/school name -> faculty stored on the item
            faculty_members = (
                ("Faculty of Arts and Humanities", (
                    "liverpool hope business school",
                    "creative and performing arts",
                    "english",
                    "fine and applied art",
                    "history and politics",
                    "law",
                    "media and communication",
                    "social sciences",
                    "theology, philosophy and religion",
                )),
                ("Faculty of Education", (
                    "disability and education",
                    "early childhood",
                    "education studies",
                    "teacher education",
                )),
                ("Faculty of Science", (
                    "geography and environmental science",
                    "mathematics and computer science",
                    "psychology",
                    "health sciences",
                )),
            )
            faculty_by_department = {
                dept_name: faculty
                for faculty, dept_names in faculty_members
                for dept_name in dept_names
            }
            dept_nodes = response.xpath(
                "//div[contains(text(),'Department of')]//text()|//div[contains(text(),'School')]//text()"
            ).extract()
            clear_space(dept_nodes)
            dept_key = ''.join(dept_nodes)
            for prefix in ("Department of", "School of"):
                dept_key = dept_key.replace(prefix, "")
            dept_key = dept_key.lower().strip()
            # Unknown departments leave the field as None
            item['department'] = faculty_by_department.get(dept_key)

            # IELTS: snippet around the keyword, then the numbers inside it
            item['ielts_desc'] = ''.join(
                re.findall(r".{1,20}IELTS.{1,40}",
                           item['rntry_requirements']))
            ielts_scores = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
            ielts_fields = ('ielts', 'ielts_l', 'ielts_s', 'ielts_r',
                            'ielts_w')
            # For each count of numbers found: which number fills each field
            ielts_layouts = {
                1: (0, 0, 0, 0, 0),
                2: (0, 1, 1, 1, 1),
                3: (0, 0, 0, 1, 2),
            }
            layout = ielts_layouts.get(len(ielts_scores))
            if layout is not None:
                for field, score_index in zip(ielts_fields, layout):
                    item[field] = ielts_scores[score_index]

            # TOEFL: same approach, only numbers with two or more digits
            item['toefl_desc'] = ''.join(
                re.findall(r"TOEFL.{1,40}", item['rntry_requirements']))
            toefl_scores = re.findall(r"\d\d+", item['toefl_desc'])
            if len(toefl_scores) in (1, 2):
                item['toefl'] = toefl_scores[0]
            if len(toefl_scores) == 2:
                for field in ('toefl_l', 'toefl_s', 'toefl_r', 'toefl_w'):
                    item[field] = toefl_scores[1]

            # Fixed values that are not read from the page
            item['apply_proces_en'] = "http://www.hope.ac.uk/postgraduate/howtoapply/"
            item['require_chinese_en'] = """<h3>2018 Postgraduate Entry Requirements</h3><ul><li>A degree from a recognised institution equivalent to a UK Honours degree</li></ul>"""
            yield item
        except Exception as e:
            # Record the failure next to the page URL and carry on crawling
            log_name = item['university'] + str(item['degree_type']) + ".txt"
            with open(log_name, 'a', encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #6

Показать файл

    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "University of Reading"
        # item['country'] = 'England'
        # item['website'] = 'http://www.reading.ac.uk/'
        item['url'] = response.url
        # 授课方式
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        print("===========================")
        print(response.url)
        try:
            # 专业、学位类型、ucas_code
            programmeDegree_typeUcascode = response.xpath(
                "//span[@class='text-bg-standout text-nice-wrap']/text() | //h1[@id='heading']//text() | //h1[@class='hero-heading']//text() | //h1[@class='block-heading block-heading-l5 block-heading-b5 block-heading-md-l-reset cell-md-t0']//text()"
            ).extract()
            clear_space(programmeDegree_typeUcascode)
            programmeDegree_typeUcascode = ''.join(
                programmeDegree_typeUcascode).strip()
            # print("programmeDegree_typeUcascode: ", programmeDegree_typeUcascode)

            degree_type = re.findall(r"^\w+/\w+", programmeDegree_typeUcascode)
            if len(degree_type) == 0:
                degree_type = re.findall(r"^\w+", programmeDegree_typeUcascode)
            # print("degree_type: ", degree_type)
            item['degree_name'] = ''.join(degree_type)
            print("item['degree_name']: ", item['degree_name'])

            programme = programmeDegree_typeUcascode.replace(
                item['degree_name'], '').strip()
            item['programme_en'] = programme.title()
            # print("item['programme_en']: ", item['programme_en'])

            # duration
            durationMode = response.xpath(
                "//h2[@class='row-margin-small text-weight-medium text-size-25']/text() | //strong[contains(text(),'Duration')]/../text() | //h3[contains(text(),'Programme length:')]/following-sibling::p[1]//text()"
            ).extract()
            clear_space(durationMode)
            # print("durationMode: ", durationMode)
            durationMode = ''.join(durationMode)
            # if ":" in durationMode:
            #     duration = durationMode.split(":")[-1].strip()
            #     mode = durationMode.split(":")[0].strip()
            #     item['duration'] = duration
            duration_list = getIntDuration(''.join(durationMode))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            item['teach_time'] = getTeachTime(''.join(durationMode))
            # print("item['duration']: ", item['duration'])
            # print("item['teach_time']: ", item['teach_time'])
            # print("item['duration_per']: ", item['duration_per'])

            start_date = response.xpath(
                "//p[@class='headline'][contains(text(), 'Start date')]//text()"
            ).extract()
            # print(start_date)
            item['start_date'] = getStartDate(''.join(start_date))
            # print("item['start_date']: ", item['start_date'])

            overview2 = response.xpath(
                "//div[@class='m-bg-white m-pad-around m-pull-left-normal m-pull-up']//div[@class='theme-editor'] | //div[@id='top-courseOverview'] | //html//div[@id='top-programmeOverview']/h2[1]/following-sibling::div[1] | //div[@id='tc1']"
            ).extract()
            overview = remove_class(clear_lianxu_space(overview2))
            item['overview_en'] = overview
            print("item['overview_en']: ", item['overview_en'])

            # department
            department = response.xpath(
                "//article[@class='pad-around bg-white']//div[@class='theme-editor']//a//text()|//p[@class='paddingtop22 nopaddingbottom']//a//text()|//a[@class='navbar-brand navbar-brand-hbs']//text()"
            ).extract()
            clear_space(department)
            if department == "":
                item['department'] = response.meta.get('department')
            else:
                item['department'] = ', '.join(department).strip()
            item['department'] = item['department'].replace("How to apply", "")
            # print("item['department']: ", item['department'])

            item[
                'location'] = "Whiteknights,PO Box 217,Reading, Berkshire,RG6 6AH"
            # //h2[@id='Panel1Trigger']/../..
            entry_requirements = response.xpath(
                "//h2[@id='Panel1Trigger']/../..//text()|//div[@id='bottom-entryRequirements']/..//text()|//div[@id='tc5']//text()"
            ).extract()
            if len(entry_requirements) == 0:
                entry_requirements = response.xpath(
                    "//h4[contains(text(),'Entry requirements:')]/preceding-sibling::*[1]/following-sibling::*[position()<4]//text()"
                ).extract()
            clear_space(entry_requirements)
            entry = ''.join(entry_requirements)
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            if item['rntry_requirements'] == "":
                print("rntry_requirements 为空")
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts = re.findall(r"IELT.{1,100}", entry)
            # ielts = response.xpath(
            #     "//strong[contains(text(),'IELTS')]/..//text()").extract()
            # # if item['ielts_desc'] == "":
            clear_space(ielts)
            item['ielts_desc'] = ''.join(ielts).strip()
            # if item['ielts_desc'] == "":
            #     print("ielts_desc 为空")
            # print("item['ielts_desc']1: ", item['ielts_desc'])
            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            toefl = re.findall(r"TOEFL[\s\(\)\w:\.]{1,300}", entry)
            # print(ielts)
            if item['toefl_desc'] == "":
                item['toefl_desc'] = ''.join(toefl)
            # print("item['toefl_desc']: ", item['toefl_desc'])
            toeflDict = get_toefl(item['toefl_desc'])
            item['toefl'] = toeflDict.get("TOEFL")
            item['toefl_l'] = toeflDict.get("TOEFL_L")
            item['toefl_s'] = toeflDict.get("TOEFL_S")
            item['toefl_r'] = toeflDict.get("TOEFL_R")
            item['toefl_w'] = toeflDict.get("TOEFL_W")
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #         item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            # //h2[@id='Panel1Trigger']/../..
            modules = response.xpath(
                "//h2[@id='Panel2Trigger']/../..|//div[@id='bottom-courseContent']/..|//div[@id='page_content_wrap']/following-sibling::div[position()<3]|//strong[contains(text(),'Programme structure')]/../following-sibling::*"
            ).extract()
            if len(modules) == 0:
                modules = response.xpath(
                    "//h4[contains(text(),'Programme structure and content')]/preceding-sibling::*[1]/following-sibling::*[position()<11]"
                ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            print("item['modules_en']: ", item['modules_en'])

            # //h2[@id='Panel1Trigger']/../..
            career = response.xpath(
                "//h2[@id='Panel4Trigger']/../following-sibling::div[1]|//div[@id='bottom-careers']/..|//div[@id='careers']|//h3[contains(text(),'Careers')]/.."
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            print("item['career_en']: ", item['career_en'])

            # //h3[@class='row-margin-small text-weight-medium'][contains(text(),'How much will it cost?')]/following-sibling::p[2]
            tuition_fee = response.xpath(
                "//h3[@class='row-margin-small text-weight-medium'][contains(text(),'How much will it cost?')]/following-sibling::p[2]//text()|"
                "//html//div[@id='bottom-feesFunding']//tr[2]/td[3]//text()|"
                "//html//div[@id='bottom-feesFunding']//tr[2]/td[2]//text()|"
                "//html//div[@id='tc2']//h3[1]/following-sibling::p[1]//text()|"
                "//*[contains(text(),'Programme fee')]/following-sibling::*[1]//text()|"
                "//h2[contains(text(),'Fees')]/following-sibling::p[1]//h2[contains(text(),'Fees')]/following-sibling::p[1]|"
                "//h2[contains(text(),'Fees')]/following-sibling::p[position()<3]//text()|"
                "//p[contains(text(),'New international students:')]//text()"
            ).extract()
            clear_space(tuition_fee)
            # print(tuition_fee)
            # item['tuition_fee'] = ''.join(tuition_fee).strip()
            tuition_fee_re = re.findall(r"\d+,\d+", ''.join(tuition_fee))
            # print(tuition_fee_re)
            if len(tuition_fee_re) == 1:
                item['tuition_fee'] = int(''.join(tuition_fee_re[0]).replace(
                    "£", "").replace(",", "").strip())
                item['tuition_fee_pre'] = "£"
            if len(tuition_fee_re) >= 2:
                item['tuition_fee'] = int(''.join(tuition_fee_re[1]).replace(
                    "£", "").replace(",", "").strip())
                item['tuition_fee_pre'] = "£"
            # if item['tuition_fee'] is None:
            #     print("tuition_fee 为空")
            # print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            # //div[@id='top-howWeTeachYou']
            assessment_en = response.xpath(
                "//div[@id='top-howWeTeachYou']").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            item[
                'apply_proces_en'] = """<div><h1><span>How to apply for postgraduate courses</span></h1></div><div><h4>Postgraduate taught courses</h4><p>The quickest and easiest away to apply for postgraduate study at the University of Reading is through our <a>online application service</a>. The online service allows you to complete your application form and attach electronic copies of your academic transcripts, certificates and other supporting information. It also provides a tool for sending an email request to your referees, enabling them to send your supporting references directly to us.</p><p><span>If you are unable to apply online you can request a paper application form by telephoning </span><a>+44 (0) 118 378 5289</a><span> or writing to:</span></p><p>Admissions Office<br>University of Reading<br>Miller Building<br>Whiteknights<br>Reading, RG6 6AB<br>UK</p><h4>PGCE and School Direct</h4><p>Candidates for the PGCE and School Direct courses should submit an application via <a>UCAS Teacher Training</a>.</p><strong> Postgraduate research </strong><p>For more information on applying for postgraduate research opportunities, please visit our <a>graduate school website</a>.</p></div><div><div><div><h4>Entry requirements</h4><p>Please visit our <a>postgraduate entry requirements</a> page for information on academic qualifications and English language requirements.</p><h4>When to apply</h4><p>There is no specific deadline date for most courses and applications will be considered until the course is full. However, to allow time for us to process your application we recommend that you apply by the following dates for admission in September:</p><div><strong>UK applicants</strong> by 1 August</div><div><strong>International applicants</strong> by 1 June</div><div><br></div><p>Please note that the MSc Speech and Language Therapy has an earlier application deadline of 1 December. 
Applications for PGCE courses are made through UCAS (see above) and the deadline is 15 September of the year of entry though early applications are recommended.</p><p>Most of our taught courses start at the beginning of the autumn term (in September) but there are a number that also have a start at a different time of the year or have multiple starts throughout the year. Please see the individual subject pages for further details.</p><h4>After you apply</h4><p>As soon as you have submitted your&nbsp;completed application we will send&nbsp;you an email acknowledgement.&nbsp;We will also create an applicant&nbsp;account for you which will allow&nbsp;you to check on the progress of&nbsp;your application online and access&nbsp;other useful information about&nbsp;the University of Reading.</p><p> We aim to reach a decision on&nbsp;your application within 4 weeks.&nbsp;The length of time taken to reach&nbsp;a decision will vary as each&nbsp;application is considered on an individual basis according to your&nbsp;relevant strengths and merits. Once your application has been&nbsp;considered you will receive an&nbsp;email from the Admissions Office&nbsp;informing you of the decision. If&nbsp;your application has been successful,&nbsp;our email will explain the offer and&nbsp;any conditions attached to it and also&nbsp;give further details of the fees and&nbsp;other expenses associated with your&nbsp;course.&nbsp;</p><p>Our team of experienced&nbsp;admissions staff is here to help you&nbsp;throughout the application process&nbsp;so please do not hesitate to get in&nbsp;touch with us if you need any help&nbsp;with completing your application or&nbsp;have a question about the progress&nbsp;of your application. You can contact&nbsp;us at <a>[email protected]</a>.</p></div></div>"""

            yield item
        except Exception as e:
            with open("scrapySchool_England/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a+',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #7

Показать файл

Файл: UniversityOfPlymouth_P.py Проект: histudent/python_spider

    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "https://www.plymouth.ac.uk/"
        item['university'] = "University of Plymouth"
        item['url'] = response.url
        # 授课方式
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        print("===============================")
        print(response.url)
        try:
            # //span[@class='course-title']
            programme = response.xpath(
                "//span[@class='course-title']//text()").extract()
            clear_space(programme)
            item['programme_en'] = ''.join(programme).strip()
            # print("item['programme_en'] = ", item['programme_en'])

            degree_type = response.xpath(
                "//h1[@class='hero-heading']/text()").extract()
            clear_space(degree_type)
            item['degree_name'] = ''.join(degree_type).strip()
            # print("item['degree_name'] = ", item['degree_name'])

            degree_name_lower = item['degree_name'].lower()
            # print("degree_name_lower: ", degree_name_lower)
            if "phd" in degree_name_lower:
                item['teach_type'] = 'phd'
                item['degree_type'] = 3
            elif "res" in degree_name_lower:
                item['teach_type'] = 'research'
                item['degree_type'] = 3
            # print("item['teach_type'] = ", item['teach_type'])
            # print("item['degree_type'] = ", item['degree_type'])

            department = response.xpath(
                "//h2[@class='school-title']//text()").extract()
            clear_space(department)
            item['department'] = ''.join(department)
            # print("item['department'] = ", item['department'])

            # 课程长度
            duration = response.xpath("//td[contains(text(),'Duration')]/following-sibling::td//text()").extract()
            clear_space(duration)
            # print(duration)
            duration_list = getIntDuration(''.join(duration))
            # print("duration_list: ", duration_list)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            # mode
            mode = response.xpath("//td[contains(text(),'Course type')]/following-sibling::td//text()").extract()
            clear_space(mode)
            # print("mode: ", mode)
            item['teach_time'] = getTeachTime(''.join(mode))
            # print("item['teach_time'] = ", item['teach_time'])

            # location
            location = response.xpath("//td[contains(text(),'Location')]/following-sibling::td//text()").extract()
            clear_space(location)
            item['location'] = ''.join(location).strip()
            # print("item['location'] = ", item['location'])

            # overview
            overview1 = response.xpath("//div[@class='overview']").extract()
            overview2 = response.xpath("//div[@id='key-features-accordion']").extract()
            overview = remove_class(clear_lianxu_space(overview1)) + remove_class(clear_lianxu_space(overview2))
            item['overview_en'] = overview
            # print("item['overview_en'] = ", item['overview_en'])

            # modules
            modules = response.xpath("//div[@id='structure-accordion']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en'] = ", item['modules_en'])

            # entry_requirements
            entry_requirements = response.xpath("//div[@id='entry-requirements-accordion']//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements'] = ", item['rntry_requirements'])

            # .{1,150}IELTS.{1,150}
            IELTS = re.findall(r"(.{1,80}IELTS.{1,80})|(.{1,80}ILETS.{1,80})|(.{1,80}IELTs.{1,80})", item['rntry_requirements'])
            # print(IELTS)
            if len(IELTS) != 0:
                ielts = ''.join(list(IELTS[0])).strip()
                item['ielts_desc'] = ielts
            print("item['ielts_desc'] = ", item['ielts_desc'])

            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            if item['ielts'] != None:
                item['ielts'] = item['ielts'].strip(".").strip()
            if item['ielts_l'] != None:
                item['ielts_l'] = item['ielts_l'].strip(".").strip()
            if item['ielts_s'] != None:
                item['ielts_s'] = item['ielts_s'].strip(".").strip()
            if item['ielts_r'] != None:
                item['ielts_r'] = item['ielts_r'].strip(".").strip()
            if item['ielts_w'] != None:
                item['ielts_w'] = item['ielts_w'].strip(".").strip()
            print("item['IELTS'] = %sitem['IELTS_L'] = %sitem['IELTS_S'] = %sitem['IELTS_R'] = %sitem['IELTS_W'] = %s==" % (
                    item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            # how_to_apply
            how_to_apply = response.xpath("//div[@id='how-to-apply-accordion']").extract()
            item['apply_proces_en'] = remove_class(clear_lianxu_space(how_to_apply))
            # print("item['apply_proces_en'] = ", item['apply_proces_en'])

            # //html//div[@class='course-accordions']//tr[3]/td[3]
            # how_to_apply
            tuition_fee = response.xpath("//strong[contains(text(),'International')]/../following-sibling::*[2]//text()").extract()
            clear_space(tuition_fee)
            # print(tuition_fee)
            tuition_fee_str = ''.join(tuition_fee)
            if tuition_fee_str == "To be confirmed" or tuition_fee_str == "":
                item['tuition_fee'] = None
            else:
                item['tuition_fee'] = int(tuition_fee_str.replace("£", "").replace(",", "").strip())
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee'] = ", item['tuition_fee'])
            # print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])

            # https://www.plymouth.ac.uk/international/study/international-students-country-guides/asia/china
            item['require_chinese_en'] = """<p><b>Postgraduate</b></p><p>For postgraduate programmes, you'll need either a bachelor's degree (with high grades), a masters degree from a ranked Chinese university or a good honours degree from a British university.&nbsp;</p><p><div class="table-responsive">
<table>
<tr>
<td><b>Chinese degree classification - prestigious i</b><b>nstitution</b></td>
<td><b>Chinese degree classification - non-prestigious institution</b></td>
<td><b>Chinese degree classification - college institution</b></td>
<td><b>UK degree equivalent</b></td>
<td></td>
</tr>
<tr>
<td>80%</td>
<td>85%</td>
<td>90%</td>
<td>1st</td>
<td></td>
</tr>
<tr>
<td>75%</td>
<td>80%</td>
<td>85%</td>
<td>2:1</td>
<td></td>
</tr>
<tr>
<td>70%</td>
<td>75%</td>
<td>80%</td>
<td>2:2</td>
<td></td>
</tr>
</table></div>"""
            yield item
        except Exception as e:
            with open(item['university'] + str(item['degree_type']) + ".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #8

Показать файл

    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "University of Surrey"
        item['url'] = response.url
        # 授课方式
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        print("===============================")
        print(response.url)
        try:

            # 专业、学位类型
            programme_en = response.xpath(
                "//h1[@class='text-center my-0']//text()").extract()
            programme_en_list = ''.join(programme_en).split("\n")
            # print(programme_en_list)
            if len(programme_en_list) > 1:
                item['programme_en'] = programme_en_list[0].strip()
                item['degree_name'] = ''.join(programme_en_list[1:]).strip()
            print("item['programme_en'] = ", item['programme_en'])
            print("item['degree_name'] = ", item['degree_name'])

            overview = response.xpath(
                "//h3[@class='px-3 pt-1 text-white'][contains(text(),'Course facts')]/../preceding-sibling::*"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en'] = ", item['overview_en'])

            teach_time = response.xpath(
                "//td[@headers='view-field-study-mode-table-column'][contains(text(),'Full-time')]//text()"
            ).extract()
            item['teach_time'] = getTeachTime(''.join(teach_time))
            # print("item['teach_time'] = ", item['teach_time'])

            duration = response.xpath(
                "//td[@headers='view-field-study-mode-table-column'][contains(text(),'Full-time')]/following-sibling::*[1]//text()"
            ).extract()
            clear_space(duration)
            # print(duration)
            if len(duration) != 0:
                duration_list = getIntDuration(''.join(duration))
                # print("duration_list: ", duration_list)
                if len(duration_list) == 2:
                    item['duration'] = duration_list[0]
                    item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            start_date = response.xpath(
                "//td[@headers='view-field-study-mode-table-column'][contains(text(),'Full-time')]/following-sibling::*[last()]//text()"
            ).extract()
            # print("start_date: ", start_date)
            item['start_date'] = getStartDate(''.join(start_date))
            # print("item['start_date'] = ", item['start_date'])

            item[
                'location'] = '01SE01, Senate House, University of Surrey, Guildford, Surrey GU2 7XH'
            # print("item['location'] = ", item['location'])

            career = response.xpath(
                "//h2[contains(text(),'Professional development')]/preceding-sibling::*[1]/following-sibling::*[position()<last()-1]|"
                "//h2[contains(text(),'Professional recognition')]|//h2[contains(text(),'Professional recognition')]/following-sibling::*[position()<3]|"
                "//h2[contains(text(),'Careers')]|//h2[contains(text(),'Careers')]/following-sibling::*[position()<3]|"
                "//h2[contains(text(),'Industrial placement')]|//h2[contains(text(),'Industrial placement')]/following-sibling::*[position()<4]"
            ).extract()
            if len(career) == 0:
                career = response.xpath(
                    "//h2[contains(text(),'Career prospects')]/preceding-sibling::*[1]/following-sibling::*[position()<last()-1]"
                ).extract()
                if len(career) == 0:
                    career = response.xpath(
                        "//h2[contains(text(),'Graduate prospects')]/preceding-sibling::*[1]/following-sibling::*[position()<last()-1]"
                    ).extract()
            # print(career)
            item['career_en'] = remove_class(clear_lianxu_space(career))
            print("item['career_en'] = ", item['career_en'])

            modules = response.xpath(
                "//div[@class='module-list']/preceding-sibling::*").extract()
            modules1 = response.xpath("//div[@id='modules-ft']").extract()
            item['modules_en'] = remove_class(
                clear_lianxu_space(modules)) + remove_class(
                    clear_lianxu_space(modules1))
            if item['modules_en'] == "":
                item['modules_en'] = remove_class(
                    clear_lianxu_space(
                        response.xpath(
                            "//h2[contains(text(),'Modules')]/following-sibling::p[position()<3]"
                        ).extract()))
            # print("item['modules_en'] = ", item['modules_en'])

            entry_requirements = response.xpath(
                "//div[@id='entry-collapse']/*//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements'] = ", item['rntry_requirements'])

            ielts_str = response.xpath(
                "//h2[contains(text(),'English language requirements')]/following-sibling::p[position()<4]//text()"
            ).extract()
            ielts_re = re.findall(r"^IELTS.{1,80}", ''.join(ielts_str))
            # print(ielts_re)
            item['ielts_desc'] = ''.join(ielts_re)

            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['IELTS'] = %sitem['IELTS_L'] = %sitem['IELTS_S'] = %sitem['IELTS_R'] = %sitem['IELTS_W'] = %s==" % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            tuition_fee = response.xpath(
                "//div[@id='fees-collapse']//td[@headers='view-field-study-mode-table-column--2'][contains(text(),'Full-time')]/following-sibling::*[last()]//text()"
            ).extract()
            # print(tuition_fee)
            if len(tuition_fee) > 0 and "£" in "".join(tuition_fee):
                item['tuition_fee'] = int(''.join(tuition_fee[0]).replace(
                    "£", "").replace(",", "").strip())
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee'] = ", item['tuition_fee'])
            print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])

            how_to_apply_url = response.xpath(
                "//span[@class='studymode'][contains(text(), 'Full-time')]/following-sibling::span[@class='applink']/a/@href"
            ).extract()
            # print(how_to_apply_url)
            if len(how_to_apply_url) > 0:
                how_to_apply_url = ''.join(how_to_apply_url[0])
                # print(how_to_apply_url)
                item['apply_proces_en'] = self.parse_apply_proces_en(
                    how_to_apply_url)
            print("item['apply_proces_en'] = ", item['apply_proces_en'])

            # https://www.surrey.ac.uk/china/entry-requirements
            item['require_chinese_en'] = """<h2>Postgraduate</h2>
<p>To apply for one of our postgraduate courses that require a UK 2:1, you must achieve between 75-85% overall.</p>
<p>For courses that require a UK 2:2, you must achieve between 70-80% overall.</p>
<p>For courses that require a UK first-class degree to be eligible for a scholarship, you must achieve between 80-90% overall.</p>
"""

            department_dict = {}
            department1_list = [
                "Criminology",
                "Criminology and Sociology",
                "Law with Criminology",
                "Media, Culture and Society",
                "Media Studies with Film Studies",
                "Politics and Sociology",
                "Sociology",
                "Criminology and Social Research",
                "Criminology and Social Research (Corporate Crime and Corporate Responsibility)",
                "Criminology and Social Research (Cybercrime and Cybersecurity)",
                "Social Research Methods",
                "Sociology",
                "Economics",
                "Business Economics",
                "Economics and Finance",
                "Economics and thetics",
                "Economics",
                "Business Economics and Finance",
                "Economics",
                "Economics and Finance",
                "International Economics, Finance and Development",
                "Economics (Four Year)",
                "Law",
                "Law with Criminology",
                "Law with International Relations",
                "International Commercial Law",
                "Law",
                "Accounting and Finance",
                "Business and Retail nagement",
                "Business nagement",
                "Business nagement (Entrepreneurship)",
                "Business nagement (HRM)",
                "Business nagement (rketing)",
                "International Business nagement",
                "Accounting and Finance",
                "Business Administration",
                "Business Analytics",
                "Corporate Finance",
                "Entrepreneurship",
                "Hun Resources nagement",
                "International Business nagement",
                "International Financial nagement",
                "International rketing nagement",
                "International Retail rketing in the Digital Environment",
                "Investment nagement",
                "nagement Education",
                "rketing nagement",
                "Occupational and Organizational Psychology",
                "Operations and Supply Chain in the Digital Era",
                "nagement and Business",
                "Creative Music Technology",
                "Digital Media Arts",
                "Film and Video Production Technology",
                "Music",
                "Music and Sound Recording (Tonmeister)",
                "Music (Composition)",
                "Music (Conducting)",
                "Music (Creative Practice)",
                "Music (Musicology)",
                "Music (Perfornce)",
                "Digital Media Arts",
                "Music",
                "Sound Recording",
                "English Literature with Politics",
                "International Relations",
                "Politics",
                "Politics and Economics",
                "Politics and Sociology",
                "Public Affairs",
                "International Relations",
                "Public Affairs",
                "International Event nagement",
                "International Hospitality and Tourism nagement",
                "International Hospitality nagement",
                "International Tourism nagement",
                "Air Transport nagement",
                "International Events nagement",
                "International Events nagement (Eurosters)",
                "Eurosters",
                "International Hospitality nagement (Eurosters)",
                "International Hotel nagement",
                "International Tourism nagement",
                "International Tourism nagement (Eurosters)",
                "Eurosters",
                "Strategic Hotel nagement",
                "Strategic Tourism nagement and rketing",
                "Hospitality and Tourism nagement",
                "English Literature",
                "English Literature and French",
                "English Literature and Gern",
                "English Literature and Spanish",
                "English Literature with Creative Writing",
                "English Literature with Film Studies",
                "English Literature with Politics",
                "English Literature with Sociology",
                "Creative Writing",
                "Creative Writing",
                "English Literature",
                "Creative Writing",
                "English Literature",
                "Business nagement and French",
                "Business nagement and Gern",
                "Business nagement and Spanish",
                "English Literature and French",
                "English Literature and Gern",
                "English Literature and Spanish",
                "Modern Languages (French and Gern)",
                "Modern Languages (French and Spanish)",
                "Modern Languages (Gern and Spanish)",
                "Communication and International rketing",
                "Intercultural Communication with International Business",
                "Interpreting",
                "Interpreting (Chinese Pathway)",
                "Teaching English to Speakers of Other Languages (TESOL)",
                "Translation",
                "Translation and Interpreting",
                "Translation and Interpreting Studies",
                "Film Studies",
                "Linguistics",
                "Literary and Cultural Studies",
                "Translation and Interpreting",
                "Acting",
                "Actor-Musician",
                "Dance",
                "Musical Theatre",
                "Theatre",
                "Theatre and Perfornce",
                "Theatre Production",
                "Acting",
                "Musical Theatre",
                "Stage and Production nagement",
                "Theatre",
                "Acting",
                "Musical Theatre",
                "Dance",
                "Theatre",
            ]
            department1_list = list(set(department1_list))
            department1_value = "Faculty of Arts and Social Sciences"
            for d in department1_list:
                department_dict[d.lower()] = department1_value

            department2_list = [
                "Practitioner Doctorate in Sustainability",
                "Environment and Sustainability",
                "Corporate Environmental Management",
                "Environmental Strategy",
                "Sustainable Development",
                "Chemistry",
                "Chemistry",
                "Chemistry",
                "Chemistry with Forensic Investigation",
                "Medicinal Chemistry",
                "Mathematics",
                "Mathematics with Statistics",
                "Mathematics with Music",
                "Financial Mathematics",
                "Mathematics and Physics",
                "Economics and Mathematics",
                "Mathematics",
                "Mathematics and Physics",
                "Physics",
                "Physics with Astronomy",
                "Physics with Nuclear Astrophysics",
                "Physics with Quantum Technologies",
                "Medical Physics",
                "Nuclear Science and Applications",
                "Physics",
                "Radiation and Environmental Protection",
                "Physics",
                "Information Systems",
                "Information Security",
                "Advanced Materials",
                "Biomedical Engineering",
            ]
            department2_list = list(set(department2_list))
            department2_value = "Faculty of Engineering and Physical Sciences"
            for d in department2_list:
                department_dict[d.lower()] = department2_value

            department3_list = [
                "Nutrition",
                "Nutrition and Dietetics",
                "Nutrition and Food Science",
                "Human Nutrition",
                "Nutritional Medicine",
                "International English Language Testing System (IELTS)",
                "Developmental Psychology in Research and Practice",
                "Health Psychology",
                "Psychology (Conversion)",
                "Primary and Community Care (SPA Community Children's Nursing)",
                "Primary and Community Care (SPA District Nursing)",
                "Primary and Community Care (SPA General Practice Nursing)",
                "Public Health Practice (SCPHN Health Visiting)",
                "Public Health Practice (SCPHN School Nursing)",
                "Advanced Clinical Practice",
                "Advanced Practitioner (Primary and Community Care)",
                "Advanced Practitioner (Public Health Practice)",
                "Education for Health Professionals",
                "Education for Professional Practice",
                "Healthcare Practice",
                "Leadership and Healthcare",
                "Physician Associate",
                "Primary and Community Care (SPA Community Children's Nursing)",
                "Primary and Community Care (SPA District Nursing)",
                "Primary and Community Care (SPA General Practice Nursing)",
                "Public Health Practice (SCPHN Health Visiting)",
                "Public Health Practice (SCPHN School Nursing)",
            ]
            department3_list = list(set(department3_list))
            department3_value = "Faculty of Health and Medical Sciences"
            for d in department3_list:
                department_dict[d.lower()] = department3_value

            item['department'] = department_dict.get(
                item['programme_en'].lower())
            print("item['department: ", item['department'])
            yield item
        except Exception as e:
            with open(item['university'] + str(item['degree_type']) + ".txt",
                      'a+',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #9

Показать файл

    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "https://www.strath.ac.uk/"
        item["university"] = "University of Strathclyde"
        item['url'] = response.url
        # 授课方式
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        item['location'] = "16 Richmond Street, Glasgow, G1 1XQ"
        print("===========================")
        print(response.url)
        try:
            # 学位类型
            degree_type = response.xpath(
                "//main[@id='content']/section[@class='PGtPage']/header[@class='page-summary has-img']/div[@class='wrap']/h1/span/text()"
            ).extract()
            item['degree_name'] = ''.join(degree_type).strip()
            print("item['degree_name'] = ", item['degree_name'])

            # 专业名
            programme = response.xpath(
                "//main[@id='content']/section[@class='PGtPage']/header[@class='page-summary has-img']/div[@class='wrap']/h1/text()"
            ).extract()
            # print("programme = ", programme)
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en'] = ", item['programme_en'])

            if "Engineering" in item['programme_en']:
                item['department'] = "Faculty of Engineering"
            elif "Science" in item['programme_en']:
                item['department'] = "Faculty of Science"
            elif "Business" in item['programme_en'] or "Finance" in item[
                    'programme_en'] or "Marketing" in item['programme_en']:
                item['department'] = "Strathclyde Business School"
            print("item['department'] = ", item['department'])

            # 课程长度、开学时间、截止日期
            durationTeachtime = response.xpath(
                "//b[contains(text(),'Study mode and duration')]/../text()"
            ).extract()
            clear_space(durationTeachtime)
            # print("durationTeachtime: ", durationTeachtime)
            durationTeachtimeStr = ''.join(durationTeachtime)

            item['teach_time'] = getTeachTime(durationTeachtimeStr)
            duration_list = getIntDuration(durationTeachtimeStr)
            # print(duration_list)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])
            # print("item['teach_time'] = ", item['teach_time'])

            start_date = response.xpath(
                "//b[contains(text(),'Start date')]/../text()").extract()
            start_date_str = ''.join(start_date).replace(":", "")
            print("start_date_str = ", start_date_str)
            item['start_date'] = getStartDate(start_date_str)
            if item['start_date'] != "" and item[
                    'start_date'] > "06" and "201" not in item['start_date']:
                item['start_date'] = "2018-" + item['start_date']
            elif item['start_date'] != "" and item[
                    'start_date'] <= "06" and "201" not in item['start_date']:
                item['start_date'] = "2019-" + item['start_date']
            print("item['start_date'] = ", item['start_date'])

            # 截止日期
            deadline = response.xpath(
                "//b[contains(text(),'Application deadline')]/../text()"
            ).extract()
            # print("deadline1 = ", deadline)
            deadline = ''.join(deadline).replace(":", "").strip()
            print("deadline = ", deadline)
            item['deadline'] = getStartDate(deadline)
            if item['deadline'] == '2':
                item['deadline'] = ""
            print("item['deadline'] = ", item['deadline'])

            # 专业描述
            overview = response.xpath(
                "//article[@id='why-this-course']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en'] = ", item['overview_en'])

            # 课程设置、评估方式
            modules = response.xpath(
                "//h3[contains(text(),'Learning & teaching')]/preceding-sibling::*"
            ).extract()
            if len(modules) == 0:
                modules = response.xpath(
                    "//article[@id='course-content']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            if item['modules_en'] == "":
                print("modules_en 为空")
            # else:
            #     print("item['modules_en'] = ", item['modules_en'])

            assessment_en = response.xpath(
                "//h3[contains(text(),'Learning & teaching')]/preceding-sibling::*[1]/following-sibling::*"
            ).extract()
            item["assessment_en"] = remove_class(
                clear_lianxu_space(assessment_en))
            if item['assessment_en'] == "":
                print("assessment_en 为空")
            # else:
            #     print("item['assessment_en'] = ", item['assessment_en'])

            # 学术要求、英语要求
            rntry_requirements = response.xpath(
                "//article[@id='entry-requirements']//text()").extract()
            item["rntry_requirements"] = clear_lianxu_space(rntry_requirements)
            # print("item['rntry_requirements'] = ", item['rntry_requirements'])

            # ielts = response.xpath("//h3[contains(text(),'English language requirements')]/following-sibling::*[position()<4]//text()").extract()
            # print("ielts: ", ielts)
            ielts_re = re.findall(r"IELTS.{1,80}", ''.join(rntry_requirements))
            # print("ielts_re = ", ielts_re)
            item["ielts_desc"] = ''.join(ielts_re)
            print("item['ielts_desc'] = ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            # print(ieltlsrw)
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            if item['ielts'] != None:
                item['ielts'] = item['ielts'].strip('.').strip()
            if item['ielts_l'] != None:
                item['ielts_l'] = item['ielts_l'].strip('.').strip()
            if item['ielts_s'] != None:
                item['ielts_s'] = item['ielts_s'].strip('.').strip()
            if item['ielts_r'] != None:
                item['ielts_r'] = item['ielts_r'].strip('.').strip()
            if item['ielts_w'] != None:
                item['ielts_w'] = item['ielts_w'].strip('.').strip()
            print(
                "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                % (item['ielts'], item['ielts_l'], item['ielts_s'],
                   item['ielts_r'], item['ielts_w']))

            # 学费    //article[@id='fees-and-funding']/ul[3]/li
            tuition_fee = response.xpath(
                "//html//article[@id='fees-and-funding']/*[contains(text(),'International')]/following-sibling::*[1]//text()"
            ).extract()
            # print("tuition_fee: ", tuition_fee)
            tuition_fee_re = re.findall(r"£[\d,]+", ''.join(tuition_fee))
            # print(tuition_fee_re)
            if len(tuition_fee_re) > 0:
                item['tuition_fee'] = ''.join(tuition_fee_re[0]).replace(
                    "£", "").replace(",", "")
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee'] = ", item['tuition_fee'])

            # 就业    //article[@id='careers']
            career = response.xpath("//article[@id='careers']").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en'] = ", item['career_en'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<h3>Postgraduate</h3>
<div>GPA from a four-year undergraduate degree must be:</div>
<div>
<ul>
<li>over an average of 70% for 211/985 universities</li>
<li>over an average of 75% for the rest of Chinese universities</li>
</ul>
</div>
<div>Students interested in PhD must usually have a Masters and must include a proposal in their application.</div>
<div>For further information on entry requirements, you can contact our representative Lexy Docwra (<a href="mailto:[email protected]">[email protected]</a>).</div>"""
                ]))
            print("item['require_chinese_en'] = ", item['require_chinese_en'])

            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<h2>Postgraduate application process</h2>
	<ul>
<li>choose the course you want to apply for &ndash; <a href="http://www.strath.ac.uk/courses/?level_ug=false&amp;level_pgt=true&amp;level_pgr=false">search our postgraduate taught courses</a></li>
<li>check the entry requirements for the course on the course page or in the prospectus</li>
<li>start your application online by clicking on the Apply button on the course page</li>
<li>submit your application along with all supporting documentation &ndash; see our document checklist below. Your application may be delayed if you fail to provide all the required documents</li>
<li>to help you fill in the application form please read our <a href="/media/ps/registry/Applicant_Guide_to_Postgraduate_Taught_Admissions.pdf.pagespeed.ce.p3pCAoLRJ3.pdf" title="" rel="external">Applicant Guide to Postgraduate Taught Admissions</a></li>
<li>once you&rsquo;ve submitted your personal information, you&rsquo;ll receive an email which contains your username and password. Please keep these in a safe place as you&rsquo;ll need them to progress with your application and view any decisions</li>
<li>your application will be considered by the relevant selection team. If they need any further information you&rsquo;ll be contacted</li>
<li>a decision will be made on your application &ndash; we try to make a decision on your application as quickly as possible. In most cases this will be within a minimum of 10 working days (two weeks)</li>
<li>you&rsquo;ll receive an email telling you that a decision has been made on your application. You&rsquo;ll be asked to log in to our online application system (PEGASUS) to view the outcome of your application</li>
</ul>"""
                ]))
            print("item['apply_proces_en'] = ", item['apply_proces_en'])

            item['apply_documents_en'] = remove_class(
                clear_lianxu_space([
                    """<h2>Document checklist</h2>
<p>Your application may be delayed if you fail to provide the following documents (where appropriate):</p>
<ul>
<li>certified copies of qualifications you&rsquo;ve gained, eg degree certificate and transcripts (showing the subjects taken and your grades). If you&rsquo;re still studying, provide a transcript of your results so far</li>
<li>if your qualifications are in a language other than English, please provide official translations in addition to the copies of the original documents</li>
<li>if English is not your first language, please provide a suitable English language test certificate (if appropriate), for example IELTS</li>
<li>a copy of your passport (if you are a non EU overseas applicant). Your passport is required in order to obtain your Certificate of Acceptance for Studies (CAS) statement which allows you to apply for your Tier 4 visa to study</li>
<li>a copy of your sponsor letter/scholarship award (if appropriate/available)&nbsp;</li>
<li>copies of any other documentation to support your application such as a CV, Personal Statement, Portfolio (for certain programmes)</li>
</ul>"""
                ]))
            print("item['apply_documents_en'] = ", item['apply_documents_en'])

            yield item
        except Exception as e:
            with open("scrapySchool_England/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a+',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #10

Показать файл

    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "https://www.uwl.ac.uk/"
        item['university'] = "University of West London"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        print("===========================")
        print(response.url)
        try:
            programmeDegreetype = response.xpath(
                "//h1[@id='page-title']//text()").extract()
            # print("programmeDegreetype: ", programmeDegreetype)
            programmeDegreetypeStr = ''.join(programmeDegreetype)

            degree_type = re.findall(r"^(\w+\s?/\s?\w+|\w+)\s",
                                     programmeDegreetypeStr)
            # print("degree_type: ", degree_type)
            item['degree_name'] = ''.join(degree_type).strip()
            print("item['degree_name']: ", item['degree_name'])

            if item['degree_name'].lower() == "phd":
                item['teach_type'] = 'phd'
                item['degree_type'] = 3
            print("item['teach_type']: ", item['teach_type'])
            print("item['degree_type']: ", item['degree_type'])

            programme = programmeDegreetypeStr.strip(''.join(degree_type))
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            mode = response.xpath(
                "//dt[contains(text(), 'Study mode')]/following-sibling::dd[1]//text()"
            ).extract()
            clear_space(mode)
            item['teach_time'] = getTeachTime(''.join(mode))
            # print("item['teach_time']: ", item['teach_time'])

            location = response.xpath(
                "//dt[contains(text(), 'Location')]/following-sibling::dd[1]//text()"
            ).extract()
            item['location'] = ''.join(location).replace(
                "See location information", "").strip()
            print("item['location']: ", item['location'])

            start_date = response.xpath(
                "//dt[contains(text(), 'Start date')]/following-sibling::dd[1]//text()"
            ).extract()
            clear_space(start_date)
            # print("start_date: ", start_date)
            item['start_date'] = getStartDate(''.join(start_date))
            # print("item['start_date']: ", item['start_date'])

            duration = response.xpath(
                "//dt[contains(text(), 'Duration')]/following-sibling::dd[1]//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            department = response.xpath(
                "//dt[contains(text(), 'Department')]/following-sibling::dd[1]//text()"
            ).extract()
            item['department'] = ''.join(department).strip()
            print("item['department']: ", item['department'])

            tuition_fee = response.xpath(
                "//h4[contains(text(),'Overseas students')]/following-sibling::dl[1]//dt[contains(text(), 'Main fee')]/following-sibling::dd[1]//text()"
            ).extract()
            # print("tuition_fee: ", tuition_fee)
            if len(tuition_fee) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            # //div[@id='course-detail']
            modules = response.xpath("//div[@id='course-detail']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            # //div[@id='course-detail']
            entry_requirements = response.xpath(
                "//div[@id='entry-requirements']//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts_desc = response.xpath(
                "//div[@id='entry-requirements']//*[contains(text(),'IELTS')]/text()"
            ).extract()
            clear_space(ielts_desc)
            # print("ielts_desc: ", ielts_desc)
            ielts_desc_re = re.findall(r'.{1,50}IELTS.{1,50}',
                                       ''.join(ielts_desc))
            # print("ielts_desc_re: ", ielts_desc_re)
            if len(ielts_desc_re) > 0:
                item['ielts_desc'] = ielts_desc_re[-1]
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            how_to_apply = response.xpath(
                "//div[@id='how-to-apply']").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(how_to_apply))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            assessment_en = response.xpath(
                "//h3[contains(text(),'Teaching methods')]/preceding-sibling::*[1]/following-sibling::*[position()<5]|"
                "//*[contains(text(),'Assessment')]/preceding-sibling::*[1]/following-sibling::*[position()<5]|"
                "//html//div/strong[contains(text(),'How will I be taught?')]/.."
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            career_en = response.xpath(
                "//div[@id='career-progression-and-study']|"
                "//div[@id='jobs-and-placements']|"
                "//html//*[contains(text(),'Career and study progression')]/../following-sibling::*[position()<5]"
            ).extract()
            item['career_en'] = remove_class(
                clear_lianxu_space(career_en)).replace("<div></div>",
                                                       "").strip()
            print("item['career_en']: ", item['career_en'])

            overview_en = response.xpath(
                "//div[@id='course-summary']/*[position()<last()]").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))
            print("item['overview_en']: ", item['overview_en'])

            item['require_chinese_en'] = """<h3>Postgraduate entry</h3>
<p>Applicants with the followingqualiﬁcationswill be considered for entry on a postgraduate course:</p>
<p>Bachelor's degree from a national university with a GPA 2.6 / 4.0 or an overall average of 65% or higher</p>
<p>Bachelor's degree from a high-ranking private college with an average of 85% or higher</p>
<p>Honours degree from any university in the UK or Republic of Ireland with a minimum of 2:2 or above</p>
<p>To study a PhD: a proposal is required in addition to a Masters qualification in a related subject area.</p>"""
            yield item
        except Exception as e:
            with open(item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #11

Показать файл

    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "https://www.salford.ac.uk/"
        item['university'] = "University of Salford"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        item['location'] = 'The Crescent, Salford, M5 4WT, UK'
        print("===========================")
        print(response.url)
        try:
            # 专业、学位类型
            programme = response.xpath(
                "//div[@id='content']/div[@class='col-md-12']/div[@class='course-title']/div[@class='row']/div[@class='col-sm-8 col-md-8']/h1//text()"
            ).extract()
            item['programme_en'] = ''.join(programme)
            print("item['programme_en']: ", item['programme_en'])

            degree_type = response.xpath(
                "//div[@id='content']/div[@class='col-md-12']/div[@class='course-title']/div[@class='row']/div[@class='col-sm-8 col-md-8']/h2//text()"
            ).extract()
            item['degree_name'] = ''.join(degree_type)
            print("item['degree_name']: ", item['degree_name'])

            # //div[@id='content']/div[@class='col-md-12']/div[@class='course-title']/div[@class='row']/div[@class='col-sm-8 col-md-8']/p
            department = response.xpath(
                "//strong[contains(text(), 'School -')]/../text()|"
                "//p[contains(text(),'This course is a collaboration between the followi')]/../following-sibling::*[1]//text()"
            ).extract()
            clear_space(department)
            item['department'] = ', '.join(department).replace(
                ', , ', ', ').strip().strip(',').strip()
            if item['department'] == "":
                print("***")
            print("item['department']: ", item['department'])

            start_date = response.xpath(
                "//strong[contains(text(), 'Start Date(s):')]/../text()"
            ).extract()
            clear_space(start_date)
            print("start_date: ", start_date)
            start_date = ''.join(start_date)
            if ";" in start_date:
                start_date_list = start_date.split(";")
                print(start_date_list)
                for s in start_date_list:
                    item['start_date'] += getStartDate(s.strip().lower()) + ","
            else:
                item['start_date'] = getStartDate(''.join(start_date).lower())
            item['start_date'] = item['start_date'].strip().strip(",").strip()
            print("item['start_date']: ", item['start_date'])

            duration = response.xpath(
                "//strong[contains(text(), 'Duration')]/../following-sibling::*[position()<3]//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_str = ''.join(duration)

            item['teach_time'] = getTeachTime(duration_str)
            duration_list = getIntDuration(duration_str)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['teach_time'] = ", item['teach_time'])
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            # //strong[contains(text(), 'Fees')]/../following-sibling::p[contains(text(), 'International -')]
            tuition_fee = response.xpath(
                "//strong[contains(text(), 'Fees')]/../following-sibling::p[contains(text(), 'International')]//text()"
            ).extract()
            clear_space(tuition_fee)
            print("tuition_fee: ", tuition_fee)
            tuition_fee_re = re.findall(r"£\d+,\d+", ''.join(tuition_fee))
            # print("tuition_fee_re: ", tuition_fee_re)
            if len(tuition_fee_re) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee_re))
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee']: ", item['tuition_fee'])
            print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            # //div[@id='content']/div[@class='col-md-12']/div[@class='row']/div[1]
            overview = response.xpath(
                "//div[@id='content']/div[@class='col-md-12']/div[@class='row']/div[1] | //div[@id='content']/div[@class='row']/div[1]"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en']: ", item['overview_en'])

            # //section[@id='about']/div[@id='content']
            modules_en = response.xpath(
                "//div[@id='courseaccordion']").extract()
            if len(modules_en) == 0:
                # print("********")
                modules_en = response.xpath(
                    "//h2[contains(text(),'Course Details')]/following-sibling::*"
                ).extract()
            item['modules_en'] = remove_class(
                clear_lianxu_space(modules_en))  # .replace("&nbsp;", "")
            item['modules_en'] = item['modules_en'].encode('utf-8').decode(
                "unicode-escape").replace("Â ", "")
            # print("item['modules_en']: ", item['modules_en'])

            # //section[@id='requirements']/div
            entry_requirements = response.xpath(
                "//section[@id='requirements']/div//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            # 申请材料
            apply_documents_en = response.xpath(
                "//h3[contains(text(),'Applicant profile')]/preceding-sibling::*[1]/following-sibling::*"
            ).extract()
            item['apply_documents_en'] = remove_class(
                clear_lianxu_space(apply_documents_en))
            # print("item['apply_documents_en']: ", item['apply_documents_en'])

            # //h3[contains(text(),'English Language Requirements')]/following-sibling::*[1]
            ielts_desc = response.xpath(
                "//h3[contains(text(),'English Language Requirements')]/following-sibling::*[position()<3]//text()"
            ).extract()
            clear_space(ielts_desc)
            item['ielts_desc'] = ''.join(ielts_desc).replace(
                "Suitable For", "").strip()
            # print("item['ielts_desc']: ",item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            if item['url'] == "https://www.salford.ac.uk/pgt-courses/journalism-news-broadcast-sport":
                item['ielts_l'] = 6.0
                item['ielts_s'] = 6.0
                item['ielts_r'] = 6.0
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            # //section[@id='teaching']/div[@class='container main']/div[@class='col-md-12']/div[@id='teaching_0a19']
            assessment_en = response.xpath(
                "//section[@id='teaching']/div").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            # //section[@id='employability']/div[@class='container main']/div[@class='col-md-12']/div[@id='employ_0a19']
            career = response.xpath(
                "//section[@id='employability']/div").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<div id="content_div_43743">
<h1>How to apply for a postgraduate taught degree</h1><p>You should complete your application online. Click the button below to get started. There is plenty of helpful information throughout the application process.</p><p>If you have all your supporting documents ready, it will only take about 20 minutes to complete the process. However, you can save your application at any stage and come back to it as many times as you like.</p>
</div>

<div id="new_content_container_1410668">
<div class="moneybox" id="new_div_48503">
<p><a href="http://webapps.ascentone.com/login.aspx?key=5D4B012A-BB6C-495B-B2E4-B5A56B3CCF00" class="btn btn-primary btn-large">Apply online here</a></p>
</div>
</div>

<div id="new_content_container_1410670">

</div>

<div id="new_div_48505">
<h2>What documents will I need?</h2><p>To complete the application process, you will need to upload scanned copies of your supporting documents. These documents vary from course to course, but usually include:</p><ul><li>One reference&nbsp;</li> <li>Transcripts or certificates demonstrating that you meet, or are likely to meet, the entry requirements for your course&nbsp;&nbsp;</li> <li>Evidence, <a href="http://www.salford.ac.uk/__data/assets/pdf_file/0018/104841/18-02-23-Vouch-List-Equivalent-qualifications-to-English-GCSE-Grade-C.pdf">if English is not your first language, that your command of English meets the standards required for postgraduate study</a> (an IELTS score of 6.5, or the equivalent, is the norm)&nbsp;&nbsp;</li> <li>A copy of your passport, if you are coming to us from outside the EU and will <a href="http://www.advice.salford.ac.uk/page/visa">require a student visa</a>.&nbsp;&nbsp;</li> <li>If you are applying for Applied Social Work Practice (MSc, PgDip or PgCert)&nbsp;you will also need to complete the <a href="http://www.salford.ac.uk/__data/assets/word_doc/0010/448768/Agency-Agreement.docx">Agency Sponsorship Form</a> and send it to <a href="mailto:[email protected]">[email protected]</a></li> <li>For the MA courses in Media Production you will be required to submit a project proposal related to your chosen specialist field, to support your application.&nbsp;&nbsp;A brief written synopsis (max. 
500 words) of your ideas would also be required.&nbsp;&nbsp;Please note that this would be for discussion&nbsp;&nbsp;&nbsp;&nbsp;purposes at the interview only.&nbsp;&nbsp;</li></ul><p>You must ensure that you upload all the documents that are needed to support your application.&nbsp;&nbsp;If you do not provide us with the information we require to make a complete assessment your application this will delay our response to you.</p><h2>What if my documents aren't ready?</h2><p>If you have not yet finished a course, if you are currently studying towards a qualification and receive a conditional offer from us, once you have taken your exams, please ensure that you send copies of your transcripts and certificates to us as soon as possible to allow us to update your admission&nbsp;&nbsp;record.</p><p>Once you have completed your application form and submitted it, you will receive an email from us acknowledging receipt of your application. We aim to consider your application as soon as we can but this can vary depending on whether you are required to attend an interview.</p><h2>Deadlines</h2><p>Postgraduate courses may start at varying times throughout the year. 
You should&nbsp;&nbsp;submit your application at least one month prior to your chosen course starting date.</p><h2>Course application exceptions</h2><div><p>Applications that are&nbsp;<strong>an exception</strong> to our online application process are:&nbsp;&nbsp;</p> <ul><li><a href="http://www.ucas.com"><strong>MA Social Work full-time study via UCAS</strong></a></li> <li><strong><a href="http://www.salford.ac.uk/study/postgraduate/applying/applying-for-taught-courses/post-qualifying-applications-pg">Post qualifying Health and Social Care single modules</a></strong></li> <li><a href="http://www.unigis.org/uk-courses-introduction/uk-courses-how-apply"><strong>Geographical Information Systems are via our partners for this course Manchester Metropolitan University</strong></a></li> </ul></div><div><h2>Policy statement on equality and diversity&nbsp;&nbsp;</h2></div><p><a href="http://www.salford.ac.uk/study/postgraduate/applying/policy-statement-on-equality-and-diversity">Read our policy statement on equality and diversity</a></p>
</div>"""
                ]))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<p><strong>Postgraduate</strong></p><p>(4 year) Bachelor degrees with a GPA 2.7/4.0 or 70% from a National University; or from a Project 211 University with a GPA 2.6/4.0 or 65%; or from a Private University with GPA 2.75/4.0 or 75%.</p>"""
                ]))
            # print("item['require_chinese_en']: ", item['require_chinese_en'])
            yield item
        except Exception as e:
            with open(item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #12

Показать файл

Файл: CardiffMetropolitanUniversity_P.py Проект: histudent/python_spider

    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "https://www.cardiffmet.ac.uk/"
        item['university'] = "Cardiff Metropolitan University"
        item['url'] = response.url
        # 授课方式
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        item['location'] = 'Llandaff Campus, Western Avenue, Cardiff, CF5 2YB'
        # print("item['location'] = ", item['location'])
        print("===========================")
        print(response.url)
        try:
            # 专业、学位类型
            programmeDegreetype = response.xpath("//div[@id='ordercontainer']/span[@id='DeltaPlaceHolderMain']/div[@class='coursefullwidth']/div/h1//text()|//div[@class='cstcoursetitle']/h1/text()").extract()
            # print("programmeDegreetype: ", programmeDegreetype)
            clear_space(programmeDegreetype)
            programmeDegreetypeStr = ''.join(programmeDegreetype).replace("–", "-").strip()
            print("programmeDegreetypeStr: ", programmeDegreetypeStr)
            programmeDegreetypesplit = programmeDegreetypeStr.split("-")
            print(programmeDegreetypesplit)
            if len(programmeDegreetypesplit) > 1:
                degreetype = programmeDegreetypesplit[-1]
                # print(degreetype)
                item['degree_name'] = degreetype.strip()
                programme = programmeDegreetypesplit[0].strip()
                # print(programme)
                item['programme_en'] = ''.join(programme)
            else:
                programme = programmeDegreetypesplit[0]
                # print(programme)
                item['programme_en'] = item['degree_name'] = ''.join(programme).strip()

            if item['degree_name'] == "" and "mba" in item['programme_en'].lower():
                item['degree_name'] = "MBA"
            elif item['degree_name'] == "" and "MA " in item['programme_en']:
                item['degree_name'] = "MA"
                item['programme_en'] = item['programme_en'].replace("MA", "").strip()
            print("item['degree_name']: ", item['degree_name'])
            print("item['programme_en']: ", item['programme_en'])

            department = response.xpath(
                "//div[@class='crumbcontainer']/span/span[1]/a[1]//text()").extract()
            clear_space(department)
            item['department'] = ''.join(department)
            print("item['department'] = ", item['department'])

            duration = response.xpath(
                "//strong[contains(text(),'Course Length:')]/..//text()").extract()
            clear_space(duration)
            # print("duration: ", duration)
            item['teach_time'] = getTeachTime(''.join(duration))
            duration_list = getIntDuration(''.join(duration))
            # print("duration_list: ", duration_list)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['teach_time'] = ", item['teach_time'])
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            # //div[@id='ordercontainer']/span[@id='DeltaPlaceHolderMain']/div[@class='coursefullwidth']/div[@class='rightcontainer']/div[@class='coursefacts']/div/div//p
            overview = response.xpath(
                "//div[@id='ordercontainer']/span[@id='DeltaPlaceHolderMain']/div[@class='coursefullwidth']/div[@class='coursecontentarea']/div[@class='courseoverview']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en']: ", item['overview_en'])

            modules_en = response.xpath(
                "//h3[contains(text(),'Course Content')]/following-sibling::div[1]").extract()
            if len(modules_en) == 0:
                modules_en = response.xpath("//h3[contains(text(),'Course content')]/..").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules_en))
            if item['modules_en'] != "":
                item['modules_en'] = "<h3>Course Content</h3>" + item['modules_en']
            # print("item['modules_en']: ", item['modules_en'])

            assessment_en = response.xpath(
                "//h3[contains(text(),'Learning & Teaching')]/following-sibling::div[1]").extract()
            item['assessment_en'] = remove_class(clear_lianxu_space(assessment_en))
            if item['assessment_en'] != "":
                item['assessment_en'] = "<h3>Learning & Teaching</h3>" + item['assessment_en']

            assessment_en1 = response.xpath("//h3[contains(text(),'Assessment')]/following-sibling::div[1]").extract()
            # print(len(assessment_en1))
            if len(assessment_en1) != 0:
                item['assessment_en'] += "<h3>Assessment</h3>" + remove_class(clear_lianxu_space(assessment_en1))
            # print("item['assessment_en']: ", item['assessment_en'])

            career_en = response.xpath(
                "//h3[contains(text(),'Employability & Careers')]/following-sibling::div[1]").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))
            if item['career_en'] != "":
                item['career_en'] = "<h3>Employability & Careers</h3>" + item['career_en']
            # print("item['career_en']: ", item['career_en'])

            rntry_requirements = response.xpath(
                "//h3[contains(text(),'Entry Requirements & How to Apply')]/following-sibling::div[1]//strong[contains(text(),'How to Apply:')]/../preceding-sibling::*//text()|//div[@id='ui-accordion-accordion-panel-1']//text()").extract()
            if len(rntry_requirements) == 0:
                rntry_requirements = response.xpath(
                    "//h3[contains(text(),'Entry Requirements')]/following-sibling::div[1]//text()").extract()
                if len(rntry_requirements) == 0:
                    rntry_requirements = response.xpath(
                        "//h3[contains(text(),'Entry Requirements & How to Apply')]/following-sibling::div[1]//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(rntry_requirements)
            if item['rntry_requirements'] != "":
                item['rntry_requirements'] = "Entry Requirements " + item['rntry_requirements'].strip()
            # print("item['rntry_requirements']: ", item['rntry_requirements'])


            ielts = re.findall(r"IELTS.{1,80}", item['rntry_requirements'])
            clear_space(ielts)
            # print("ielts: ", ielts)
            if len(ielts) > 0:
                item['ielts_desc'] = ielts[0]
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            # if len(ielts_list) == 1:
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            apply_proces_en = ["""<p style="text-align: center;"> 
      <strong><span style="color: #d44831;">Application Stages</span>  </strong></p>
   <p> 
      <strong>1. </strong>Choose<strong> programme of study</strong><strong>.</strong></p>
   <p> 
      <strong>2.</strong>&#160;Check if the programme has a 
      <strong>specific deadline for applications</strong>.</p>
   <p> 
      <strong>3.</strong>&#160;It is <strong>essential</strong> that&#160;you check the 
      <strong>Compulsory Supporting Documents</strong>. Some programmes require specific information e.g. a Personal Statement template and/or additional information.</p>
   <p> 
      <strong>4.</strong> If applying for 
      <strong>RPL</strong> please see information under <em>Make an Online Application</em>.</p>
   <p> 
      <strong>5.</strong> Make 
      online application, ensuring <strong>all </strong><strong>required documents are uploaded</strong>.</p>
   <p> 
      <strong>6.</strong> Applications can take 
      <strong>2-4 weeks </strong>for&#160;consideration.</p> 
   <p> 
      <strong>7. </strong>If required you will be invited for<strong>&#160;interview</strong>.&#160;</p>
   <p> 
      <strong>8.</strong>&#160;If successful, o<span style="font-size: 12.825px; background-color: #f0eded;">fficial 
         <strong>offer letter</strong>&#160;will be&#160;provided by Admissions.</span></p>
   <p><strong>9.</strong>&#160;<strong>Accept&#160;place on programme.</strong>&#160;</p><p><strong>10.</strong>&#160;<span style="font-size: 12.825px; background-color: #f0eded;">Once any conditions have been met (if applicable)&#160;you will be provided&#160;with <strong>joining and&#160;</strong></span><span style="font-size: 12.825px; background-color: #f0eded;"><strong>enrolment&#160;</strong></span><span style="font-size: 12.825px; background-color: #f0eded;"><strong>information </strong>a month prior&#160;to programme&#160;commencement. <strong>It is essential that you accept your place to enable you to receive your joining information (status&#160;Unconditional Firm).</strong></span></p>
"""]
            item['apply_proces_en'] = remove_class(clear_lianxu_space(apply_proces_en)).strip()
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            interview_desc_en = response.xpath(
                "//*[contains(text(),'interview')]").extract()
            item['interview_desc_en'] = remove_class(clear_lianxu_space(interview_desc_en)).strip()
            # print("item['interview_desc_en']: ", item['interview_desc_en'])

            # http://www.cardiffmet.ac.uk/study/finance/Documents/2018-2019-Fee-Tables.pdf
            item['tuition_fee'] = '13000'
            if "mba" in item['programme_en'].lower():
                item['tuition_fee'] = '13500'
            item['tuition_fee_pre'] = "£"

            item['require_chinese_en'] = remove_class(clear_lianxu_space(["<h4>Postgraduate Courses</h4><p>Students with a Bachelor degree from a recognised institution in China or UK. </p><p>OR students with a Masters degree from a recognised institution in China</p>"]))
            yield item
        except Exception as e:
            with open(item['university']+str(item['degree_type'])+".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #13

Показать файл

Файл: NottinghamTrentUniversity_P.py Проект: histudent/python_spider

    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "https://www.ntu.ac.uk/"
        item['university'] = "Nottingham Trent University"
        item['url'] = response.url
        # 授课方式
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        print("===============================")
        print(response.url)
        try:
            # 专业、学位类型
            programme = response.xpath(
                "//h1[@class='course-heading page-heading']//text()").extract(
                )
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en'] = ", item['programme_en'])

            degree_type = response.xpath(
                "//h2[@class='js_qualification']/strong//text()").extract()
            item['degree_name'] = ''.join(degree_type)
            print("item['degree_name'] = ", item['degree_name'])

            #
            mode = response.xpath(
                "//strong[contains(text(),'Study mode(s):')]/following-sibling::span//text()"
            ).extract()
            clear_space(mode)
            # print("mode: ", mode)
            item['teach_time'] = getTeachTime(''.join(mode))
            # print("item['teach_time'] = ", item['teach_time'])

            # //div[@id='tabs-key-info']/div[@class='tab tab-1 active-tab']/p[3]/span
            location = response.xpath(
                "//span[@class='location save']//text()").extract()
            item['location'] = ''.join(location)
            # print("item['location'] = ", item['location'])

            start_date = response.xpath(
                "//strong[contains(text(),'Starting:')]/following-sibling::span//text()"
            ).extract()
            # print(start_date)
            item['start_date'] = ''.join(start_date)
            # print("item['start_date'] = ", item['start_date'])
            item['start_date'] = getStartDate(item['start_date'])
            # print("item['start_date']1 = ", item['start_date'])

            duration = response.xpath(
                "//strong[contains(text(),'Course duration:')]/following-sibling::span//text()"
            ).extract()
            # print("duration: ", duration)
            duration_list = getIntDuration(''.join(duration))
            # print("duration_list: ", duration_list)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            # //html//div[@class='content']/div[1]/div  专业描述
            overview = response.xpath(
                "//div[@id='what-you-will-study']/preceding-sibling::*"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en'] = ", item['overview_en'])

            # modules   课程设置
            modules = response.xpath(
                "//div[@id='what-you-will-study']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en'] = ", item['modules_en'])

            # teaching_assessment   评估方式
            teaching_assessment = response.xpath(
                "//div[@id='how-youre-taught']").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(teaching_assessment))
            # print("item['assessment_en'] = ", item['assessment_en'])

            # career   评估方式
            career = response.xpath(
                "//div[@id='careers-and-employability']").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en'] = ", item['career_en'])

            # //div[@id='entry-requirements-1']
            entry_requirements = response.xpath(
                "//div[@id='entry-requirements-0']//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            print("item['rntry_requirements'] = ", item['rntry_requirements'])

            # //div[@id='entry-requirements-1']
            how_to_apply = response.xpath(
                "//div[@id='how-to-apply-1']").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(how_to_apply))
            # print("item['apply_proces_en'] = ", item['apply_proces_en'])

            # //div[@id='how-to-apply-1']//h3[contains(text(),'Interview')]/following-sibling::p[position()<3]
            interview_desc_en = response.xpath(
                "//div[@id='how-to-apply-1']//h3[contains(text(),'Interview')]/following-sibling::p[position()<3]"
            ).extract()
            item['interview_desc_en'] = remove_class(
                clear_lianxu_space(interview_desc_en))
            # print("item['interview_desc_en'] = ", item['interview_desc_en'])

            # deadline
            deadline = response.xpath(
                "//div[@id='how-to-apply-1']//p//strong[contains(text(),'Application closing date')]/../following-sibling::p[1]//text()|//div[@id='how-to-apply-1']//h3[contains(text(),'Application deadline')]/following-sibling::p[1]//text()"
            ).extract()
            clear_space(deadline)
            # print("deadline: ", deadline)
            deadline_str = ''.join(deadline)
            item['deadline'] = getStartDate(deadline_str)
            # print("item['deadline'] = ", item['deadline'])

            # //html//div[@id='fees-and-funding-1']//tr/td[3]
            tuition_fee = response.xpath(
                "//html//div[@id='fees-and-funding-1']//*[contains(text(),'Full')]/..//text()|"
                "//html//div[@id='fees-and-funding-1']//*[contains(text(),'full')]/../..//text()"
            ).extract()
            if len(tuition_fee) == 0:
                tuition_fee = response.xpath(
                    "//html//div[@id='fees-and-funding-1']//text()").extract()
            clear_space(tuition_fee)
            # print("tuition_fee: ", tuition_fee)
            tuition_fee = ''.join(tuition_fee)
            tuition_fee = getTuition_fee(tuition_fee)
            item['tuition_fee'] = tuition_fee
            if item['tuition_fee'] == 0:
                item['tuition_fee'] = None
            else:
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee'] = ", item['tuition_fee'])
            # print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])

            departmentDict = {
                "MSc / MRes Animal Health and Welfare":
                "School of Animal, Rural and Environmental Sciences",
                "MRes Applied Anthrozoology":
                "School of Animal, Rural and Environmental Sciences",
                "MSc / MRes Biodiversity Conservation":
                "School of Animal, Rural and Environmental Sciences",
                "MSc / MRes Endangered Species Recovery and Conservation":
                "School of Animal, Rural and Environmental Sciences",
                "MRes Equine Health and Welfare":
                "School of Animal, Rural and Environmental Sciences",
                "MRes Equine Performance":
                "School of Animal, Rural and Environmental Sciences",
                "MSc Equine Performance, Health and Welfare":
                "School of Animal, Rural and Environmental Sciences",
                "MSc / MRes Global Food Security and Development":
                "School of Animal, Rural and Environmental Sciences",
                "Architecture - Professional Certificate in":
                "School of Architecture, Design and the Built Environment",
                "Architecture (ARB/RIBA Part 2) - MArch":
                "School of Architecture, Design and the Built Environment",
                "Building Surveying - MSc":
                "School of Architecture, Design and the Built Environment",
                "Civil Engineering - MSc":
                "School of Architecture, Design and the Built Environment",
                "Construction Management - MSc":
                "School of Architecture, Design and the Built Environment",
                "Construction Project Management (Online) - MSc":
                "School of Architecture, Design and the Built Environment",
                "Interior Architecture and Design - MA":
                "School of Architecture, Design and the Built Environment",
                "International Real Estate Investment and Finance - MSc":
                "School of Architecture, Design and the Built Environment",
                "Planning and Development - MSc":
                "School of Architecture, Design and the Built Environment",
                "Project Management (Construction) - MSc":
                "School of Architecture, Design and the Built Environment",
                "Quantity Surveying - MSc":
                "School of Architecture, Design and the Built Environment",
                "Real Estate - MSc":
                "School of Architecture, Design and the Built Environment",
                "Structural Engineering with Management - MSc":
                "School of Architecture, Design and the Built Environment",
                "Structural Engineering with Materials - MSc":
                "School of Architecture, Design and the Built Environment",
                "MA Animation":
                "School of Art & Design",
                "MA Commercial Photography":
                "School of Art & Design",
                "MA Culture, Style and Fashion":
                "School of Art & Design",
                "MA Branding and Identity":
                "School of Art & Design",
                "MA Fashion Communications":
                "School of Art & Design",
                "MA Fashion Design":
                "School of Art & Design",
                "MA Fashion Knitwear Design":
                "School of Art & Design",
                "MA Fashion Marketing":
                "School of Art & Design",
                "MFA Fine Art":
                "School of Art & Design",
                "MA Graphic Design":
                "School of Art & Design",
                "MA Illustration":
                "School of Art & Design",
                "MA International Fashion Management":
                "School of Art & Design",
                "MA Luxury Fashion Brand Management":
                "School of Art & Design",
                "MA Photography":
                "School of Art & Design",
                "MA Textile Design Innovation":
                "School of Art & Design",
                "MA Culture, Style and Fashion":
                "School of Art & Design",
                "MA Fashion Communications":
                "School of Art & Design",
                "MA Fashion Marketing":
                "School of Art & Design",
                "MA Fashion and Textile Design":
                "School of Art & Design",
                "MFA Fine Art":
                "School of Art & Design",
                "MA Graphic Design Theory and Practice":
                "School of Art & Design",
                "MA International Fashion Management":
                "School of Art & Design",
                "MA Luxury Fashion Brand Management":
                "School of Art & Design",
                "MA Photography":
                "School of Art & Design",
                "PG Cert Creative Pattern Cutting (15 weeks)":
                "School of Art & Design",
                "Art and Design Professional Doctorate":
                "School of Art & Design",
                "Art and Design PhD / MPhil":
                "School of Art & Design",
                "MA / PGDip Broadcast Journalism":
                "School of Arts and Humanities",
                "MA / PGDip Digital and Newspaper Journalism":
                "School of Arts and Humanities",
                "MA / PGDip Magazine Journalism":
                "School of Arts and Humanities",
                "MA / PGDip Documentary Journalism":
                "School of Arts and Humanities",
                "MA Media and Globalisation":
                "School of Arts and Humanities",
                "MA Creative Writing":
                "School of Arts and Humanities",
                "MRes English Literary Research":
                "School of Arts and Humanities",
                "MA Linguistics (by research)":
                "School of Arts and Humanities",
                "MA Philosophy (by research)":
                "School of Arts and Humanities",
                "MA History":
                "School of Arts and Humanities",
                "MA/ PGDip/ PGCert Museum and Heritage Development":
                "School of Arts and Humanities",
                "MA Holocaust and Genocide (by research)":
                "School of Arts and Humanities",
                "MA International Development":
                "School of Arts and Humanities",
                "MA English Language Teaching":
                "School of Arts and Humanities",
                "MA TESOL (Teaching English to Speakers of Other Languages)":
                "School of Arts and Humanities",
                "MSc Management":
                "Nottingham Business School",
                "MSc Management and Finance":
                "Nottingham Business School",
                "MSc Management and Global Supply Chain Management":
                "Nottingham Business School",
                "MSc Management and Innovation and Enterprise":
                "Nottingham Business School",
                "MSc Management and International Business":
                "Nottingham Business School",
                "MSc Management and Marketing":
                "Nottingham Business School",
                "MSc Marketing":
                "Nottingham Business School",
                "MSc Branding and Advertising":
                "Nottingham Business School",
                "MSc Digital Marketing":
                "Nottingham Business School",
                "MSc Management and Marketing":
                "Nottingham Business School",
                "fees, funding and scholarships":
                "Nottingham Business School",
                "Return to all courses":
                "Nottingham Business School",
                "MSc Human resource Management (full-time)":
                "Nottingham Business School",
                "MSc Economics":
                "Nottingham Business School",
                "MSc Economics and Investment Banking":
                "Nottingham Business School",
                "MSc International Business":
                "Nottingham Business School",
                "MSc International Business (Dual Award) ":
                "Nottingham Business School",
                "MSc Management and International Business":
                "Nottingham Business School",
                "MSc Management and International Publishing":
                "Nottingham Business School",
                "MSc Management and Global Supply Chain Management":
                "Nottingham Business School",
                "MSc Finance":
                "Nottingham Business School",
                "MSc Finance and Accounting":
                "Nottingham Business School",
                "MSc Finance and Investment Banking":
                "Nottingham Business School",
                "MSc Management and Finance":
                "Nottingham Business School",
                "MSc Economics and Investment Banking":
                "Nottingham Business School",
                "MSc Entrepreneurship":
                "Nottingham Business School",
                "MSc Project Management":
                "Nottingham Business School",
                "MSc Management":
                "Nottingham Business School",
                "MSc Management and International Business":
                "Nottingham Business School",
                "MSc Marketing":
                "Nottingham Business School",
                "MSc Branding and Advertising":
                "Nottingham Business School",
                "MSc Finance":
                "Nottingham Business School",
                "MSc International Business":
                "Nottingham Business School",
                "Assessment Only Route to QTS (Primary) - Non-NTU Award":
                "Nottingham Institute of Education",
                "Assessment Only Route to QTS (Secondary) - Non-NTU Award":
                "Nottingham Institute of Education",
                "Early Years Initial Teacher Training - PGCE":
                "Nottingham Institute of Education",
                "Early Years Initial Teacher Training (Assessment Only) - Non-NTU Award":
                "Nottingham Institute of Education",
                "Education, full-time - MA / PGCert / PGDip":
                "Nottingham Institute of Education",
                "Education, Part-time - MA / PGCert / PGDip":
                "Nottingham Institute of Education",
                "English Language Teaching - MA":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training - Cert Ed / PGCE / ProfGCE in":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training (with English and Literacy) - Cert Ed / PGCE / ProfGCE in":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training (with Mathematics and Numeracy) - Cert Ed / PGCE / ProfGCE in":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training (with Science, Engineering and Technology) - Cert Ed / PGCE / ProfGCE in":
                "Nottingham Institute of Education",
                "Post-Compulsory Education and Training (with Special and Inclusive Practice) - Cert Ed / PGCE / ProfGCE in":
                "Nottingham Institute of Education",
                "Primary Education - PGCE":
                "Nottingham Institute of Education",
                "Primary: School-Centred Initial Teacher Training (SCITT) - PGCE":
                "Nottingham Institute of Education",
                "School Direct Training Programme (Primary salaried) - PGCE":
                "Nottingham Institute of Education",
                "School Direct Training Programme (Primary) - PGCE":
                "Nottingham Institute of Education",
                "School Direct Training Programme (Secondary salaried) - PGCE":
                "Nottingham Institute of Education",
                "School Direct Training Programme (Secondary) - PGCE":
                "Nottingham Institute of Education",
                "Secondary Biology - PGCE":
                "Nottingham Institute of Education",
                "Secondary Business Education - PGCE":
                "Nottingham Institute of Education",
                "Secondary Chemistry - PGCE":
                "Nottingham Institute of Education",
                "Secondary Computer Science with ICT - PGCE":
                "Nottingham Institute of Education",
                "Secondary Education (Design and Technology) - PGCE":
                "Nottingham Institute of Education",
                "Secondary Education (Physics) - PGCE":
                "Nottingham Institute of Education",
                "Secondary English - PGCE":
                "Nottingham Institute of Education",
                "Secondary Mathematics - PGCE":
                "Nottingham Institute of Education",
                "Secondary Music - PGCE":
                "Nottingham Institute of Education",
                "Special Educational Needs Coordination - National Award":
                "Nottingham Institute of Education",
                "Teaching English to Speakers of Other Languages (TESOL) - MA":
                "Nottingham Institute of Education",
                "Corporate and Insolvency Law":
                "Nottingham Law School",
                "Dual LLM in Corporate and Insolvency Law / European and Insolvency Law":
                "Nottingham Law School",
                "General Law":
                "Nottingham Law School",
                "Health Law and Ethics":
                "Nottingham Law School",
                "Human Rights and Justice":
                "Nottingham Law School",
                "Intellectual Property Law":
                "Nottingham Law School",
                "International Financial Law":
                "Nottingham Law School",
                "International Trade and Commercial Law":
                "Nottingham Law School",
                "Oil, Gas and Mining Law":
                "Nottingham Law School",
                "Sports Law":
                "Nottingham Law School",
                "Corporate and Insolvency Law":
                "Nottingham Law School",
                "International Trade and Commercial Law":
                "Nottingham Law School",
                "Legal Practice":
                "Nottingham Law School",
                "Oil, Gas and Mining Law":
                "Nottingham Law School",
                "MSc Biomedical Science":
                "School of Science and Technology",
                "MSc Biomedical Science (Flexible Learning)":
                "School of Science and Technology",
                "MSc Neuropharmacology":
                "School of Science and Technology",
                "MSc Pharmacology":
                "School of Science and Technology",
                "MSc Molecular Microbiology":
                "School of Science and Technology",
                "MSc Biotechnology":
                "School of Science and Technology",
                "MSc Molecular Cell Biology":
                "School of Science and Technology",
                "MSc by Research Environmental Management":
                "School of Science and Technology",
                "MRes Biotechnology":
                "School of Science and Technology",
                "MRes Cancer Biology":
                "School of Science and Technology",
                "MRes Cell Biology":
                "School of Science and Technology",
                "MRes Molecular Biology":
                "School of Science and Technology",
                "MRes Molecular Microbiology":
                "School of Science and Technology",
                "MRes Neuropharmacology":
                "School of Science and Technology",
                "MRes Pharmacology":
                "School of Science and Technology",
                "MRes Environmental Management":
                "School of Science and Technology",
                "MSc Biomedical Science (Flexible Learning)":
                "School of Science and Technology",
                "MRes Environmental Management":
                "School of Science and Technology",
                "MSc Chemistry / MSc Chemistry (Professional Practice)":
                "School of Science and Technology",
                "MRes Pharmaceutical and Medicinal Science":
                "School of Science and Technology",
                "MRes Pharmaceutical Analysis":
                "School of Science and Technology",
                "MRes Analytical Chemistry":
                "School of Science and Technology",
                "MRes Chemistry":
                "School of Science and Technology",
                "MRes Advanced Materials Engineering":
                "School of Science and Technology",
                "MSc Forensic Science":
                "School of Science and Technology",
                "MSc Computer Science":
                "School of Science and Technology",
                "MSc Cloud and Enterprise Computing":
                "School of Science and Technology",
                "MSc IT Security":
                "School of Science and Technology",
                "MSc Engineering (Electronics)":
                "School of Science and Technology",
                "MSc Engineering (Cybernetics and Communications)":
                "School of Science and Technology",
                "MSc Engineering Management":
                "School of Science and Technology",
                "MSc Computing Systems":
                "School of Science and Technology",
                "MSc Data Analytics for Business":
                "School of Science and Technology",
                "MRes Computer Science":
                "School of Science and Technology",
                "MRes Electronic Systems":
                "School of Science and Technology",
                "Online MBA with Data Analytics":
                "School of Science and Technology",
                "MRes Mathematical Sciences":
                "School of Science and Technology",
                "MSc Data Analytics for Business":
                "School of Science and Technology",
                "Online MBA with Data Analytics":
                "School of Science and Technology",
                "MRes Medical and Materials Imaging":
                "School of Science and Technology",
                "MRes Medical Imaging":
                "School of Science and Technology",
                "MSc Physics":
                "School of Science and Technology",
                "MRes Physics":
                "School of Science and Technology",
                "MRes Sport Science":
                "School of Science and Technology",
                "MRes Exercise Physiology":
                "School of Science and Technology",
                "MRes Performance Nutrition":
                "School of Science and Technology",
                "MRes Performance Analysis":
                "School of Science and Technology",
                "MRes Biomechanics":
                "School of Science and Technology",
                "MRes Sport and Exercise Psychology":
                "School of Science and Technology",
                "MSc / PGDip Psychology":
                "School of Social Sciences",
                "MSc Applied Child Psychology":
                "School of Social Sciences",
                "MRes / MSc Psychological Research Methods":
                "School of Social Sciences",
                "MSc Forensic Mental Health":
                "School of Social Sciences",
                "MSc Forensic Psychology (BPS accredited)":
                "School of Social Sciences",
                "MSc Cyberpsychology":
                "School of Social Sciences",
                "MSc Psychology in Clinical Practice":
                "School of Social Sciences",
                "MSc Psychological Wellbeing and Mental Health":
                "School of Social Sciences",
                "MA Criminology":
                "School of Social Sciences",
                "MA Sociology":
                "School of Social Sciences",
                "MA Politics":
                "School of Social Sciences",
                "MA International Relations":
                "School of Social Sciences",
                "Online MA International Relations (Distance learning)":
                "School of Social Sciences",
                "MA Public Health":
                "School of Social Sciences",
                "PG Cert / MA Career Development":
                "School of Social Sciences",
                "MA Social Work (January 2019 entry)":
                "School of Social Sciences",
            }
            departmentKey = item['degree_name'] + " " + item['programme_en']
            departmentKey1 = item['programme_en'] + " - " + item['degree_name']
            # print("departmentKey = ", departmentKey)
            # print("departmentKey1 = ", departmentKey1)
            item['department'] = departmentDict.get(departmentKey)
            if item['department'] == None:
                item['department'] = departmentDict.get(departmentKey1)
                if item['department'] == None:
                    item['department'] = departmentDict.get(
                        item['programme_en'])
                    if item['department'] == None:
                        item['department'] = departmentDict.get(
                            item['programme_en'].replace(" ", " "))
            # print("item['department'] = ", item['department'])

            if item['department'] == "School of Art & Design" or item[
                    'department'] == "School of Animal, Rural and Environmental Sciences" or item[
                        'department'] == "School of Science and Technology":
                item['ielts'] = 6.5
                item['ielts_l'] = 5.5
                item['ielts_s'] = 5.5
                item['ielts_r'] = 5.5
                item['ielts_w'] = 5.5
            elif item['department'] == "Nottingham Business School":
                item[
                    'ielts_desc'] = """For 1-year Masters you'll need a 6.5 grade overall with minimum of 5.5 in each component.
For 2-year Masters you'll need a 6.5 grade overall with a minimum of 6.0 in speaking and listening, and 5.5 in reading and writing."""
            elif item['department'] == "School of Architecture, Design and the Built Environment" or item[
                    'department'] == "School of Arts and Humanities" or item[
                        'department'] == "Nottingham Institute of Education" or item[
                            'department'] == "Nottingham Law School" or item[
                                'department'] == "School of Social Sciences" or item[
                                    'department'] == "School of Art & Design":
                item['ielts'] = 6.5
                item['ielts_l'] = 5.5
                item['ielts_s'] = 5.5
                item['ielts_r'] = 5.5
                item['ielts_w'] = 5.5
            # print("item['IELTS'] = %s item['IELTS_L'] = %s item['IELTS_S'] = %s item['IELTS_R'] = %s item['IELTS_W'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            entry_requirements = response.xpath(
                "//div[@id='entry-requirements-1']//text()").extract()
            entry_requirementsStr = ''.join(entry_requirements)
            ielts = re.findall(r"IELTS.{1,200}", entry_requirementsStr)
            item['ielts_desc'] = ''.join(ielts)
            print("item['ielts_desc']: ", item['ielts_desc'])

            if item['ielts'] == None:
                ieltsDict = get_ielts(''.join(ielts))
                item['ielts'] = ieltsDict.get("IELTS")
                item['ielts_l'] = ieltsDict.get("IELTS_L")
                item['ielts_s'] = ieltsDict.get("IELTS_S")
                item['ielts_r'] = ieltsDict.get("IELTS_R")
                item['ielts_w'] = ieltsDict.get("IELTS_W")
            print(
                "item['IELTS'] = %sitem['IELTS_L'] = %sitem['IELTS_S'] = %sitem['IELTS_R'] = %sitem['IELTS_W'] = %s=="
                % (item['ielts'], item['ielts_l'], item['ielts_s'],
                   item['ielts_r'], item['ielts_w']))

            item[
                'require_chinese_en'] = """<div>Bachelors degree (four years or six years in medicine / dentistry) from recognised institution in China. 
Grades of 75% or above
Grades of 70% or above from 211 universities</div>"""
            yield item
        except Exception as e:
            with open("scrapySchool_England/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a+',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #14

Показать файл

    def parse_data(self, response):
        """Scrape one University of Portsmouth course page into an item.

        A fresh item from ``get_item1(ScrapyschoolEnglandItem1)`` is filled
        with fixed metadata and with the fields extracted from ``response``,
        then yielded.  Any exception raised inside the ``try`` block is
        appended to ``<university><degree_type>.txt`` and printed; nothing is
        yielded in that case.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "University of Portsmouth"
        item['url'] = response.url
        item['teach_type'] = 'phd'
        # Degree type.
        item['degree_type'] = 3
        item['location'] = 'University House, Winston Churchill Avenue, Portsmouth PO1 2UP'

        def texts(xpath):
            # Evaluate the XPath on the page and return the extracted list.
            return response.xpath(xpath).extract()

        print("===========================")
        print(response.url)
        try:
            # Programme title: two alternative page layouts are queried.
            title_parts = texts("|".join((
                "//div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1//text()",
                "//div[@class='onscreen-area']/div/div[@class='section'][1]/div[@class='page-title above-page-nav course-page-title']/div[@class='wrap']/h1//text()",
            )))
            item['programme_en'] = ''.join(title_parts).strip()
            print("item['programme_en']: ", item['programme_en'])

            # Degree name, taken from the element next to the title.
            award_parts = texts("|".join((
                "//div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/span//text()",
                "//div[@class='onscreen-area']/div/div[@class='section'][1]/div[@class='page-title above-page-nav course-page-title']/div[@class='wrap']/p//text()",
            )))
            item['degree_name'] = ''.join(award_parts).strip()
            print("item['degree_name']: ", item['degree_name'])

            # Department: first extracted text, unless it is the funding
            # blurb, in which case the last extracted text is used instead.
            department_texts = texts("|".join((
                "//dt[contains(text(), 'Department')]/following-sibling::dd[1]//text()",
                "//strong[contains(text(), 'Department')]/following-sibling::a//text()",
                "//strong[contains(text(), 'Department')]/../following-sibling::p//text()",
                "//span[contains(text(), 'Department')]/../following-sibling::*[1]//text()",
            )))
            clear_space(department_texts)
            if department_texts:
                department_name = department_texts[0].strip()
                if department_name == "This course is eligible for the":
                    department_name = department_texts[-1].strip()
                item['department'] = department_name

            # Duration text feeds both the study mode and the numeric duration.
            duration_texts = texts(
                "//dt[contains(text(), 'Duration')]/following-sibling::dd[1]//text()"
                "|//dt[contains(text(), 'duration')]/following-sibling::dd[1]//text()")
            clear_space(duration_texts)
            duration_text = ''.join(duration_texts)
            item['teach_time'] = getTeachTime(duration_text)
            parsed_duration = getIntDuration(duration_text)
            if len(parsed_duration) == 2:
                item['duration'], item['duration_per'] = parsed_duration

            # Tuition fee: keep only the labelled full-time / international
            # amounts, then take the first "digits,digits" figure among them.
            fee_texts = texts("|".join((
                "//strong[contains(text(),'International students')]/../following-sibling::p//text()",
                "//strong[contains(text(),'2018/19 entry')]/../following-sibling::p[1]//text()",
                "//dt[contains(text(),'Fees')]/following-sibling::dd[1]//text()",
            )))
            clear_space(fee_texts)
            labelled_fees = re.findall(
                r"Full\stime:\s£\d+,\d+|Full\stime\s£\d+,\d+|International\sfull-time\sstudents:\s£\d+,\d+",
                ''.join(fee_texts))
            fee_amounts = re.findall(r"\d+,\d+", ''.join(labelled_fees))
            if fee_amounts:
                # A match contains only digits and one comma.
                item['tuition_fee'] = int(fee_amounts[0].replace(",", ""))
                item['tuition_fee_pre'] = "£"

            # Entry requirements: the slice of the "Key Facts" texts between
            # the requirements heading (either capitalisation) and "Fees".
            # When both headings are present the lower-case one wins.
            key_facts = texts("//h3[contains(text(),'Key Facts')]/..//text()")
            clear_space(key_facts)
            for heading in ("2018 ENTRY REQUIREMENTS", "2018 entry requirements"):
                if heading in key_facts and "Fees" in key_facts:
                    first = key_facts.index(heading)
                    stop = key_facts.index("Fees")
                    item['rntry_requirements'] = clear_lianxu_space(
                        key_facts[first:stop])

            # Fallback layout: definition list with an "Entry..." term.
            if item['rntry_requirements'] == "":
                entry_texts = texts(
                    "//dt[contains(text(),'Entry')]/following-sibling::dd[1]//text()")
                item['rntry_requirements'] = clear_lianxu_space(entry_texts)

            # IELTS: up to 45 characters of context on each side of the word.
            ielts_snippets = re.findall(r".{1,45}IELTS.{1,45}",
                                        item['rntry_requirements'])
            item['ielts_desc'] = ''.join(ielts_snippets).strip()

            ielts_scores = get_ielts(item['ielts_desc'])
            for field, key in (('ielts', 'IELTS'),
                               ('ielts_l', 'IELTS_L'),
                               ('ielts_s', 'IELTS_S'),
                               ('ielts_r', 'IELTS_R'),
                               ('ielts_w', 'IELTS_W')):
                item[field] = ielts_scores.get(key)

            overview_html = texts("|".join((
                "//h3[contains(text(),'Why take this course?')]/../*[not(@class='blockquote-img')]",
                "//div[@class='onscreen-area']/div/div[@class='section'][1]/div[@class='wrap']/div[@class='group third']/div[@class='column twothirds']",
            )))
            item['overview_en'] = remove_class(clear_lianxu_space(overview_html))

            modules_html = texts("|".join((
                "//h3[@id='structure']/../../following-sibling::div[1]",
                "//div[@class='onscreen-area']/div/div[@class='section slate dark']/div[@class='wrap']",
            )))
            item['modules_en'] = remove_class(clear_lianxu_space(modules_html))

            # Teaching/assessment: second XPath is tried only when the first
            # one extracts nothing.
            assessment_html = texts(
                "//div[@class='pure-g purple content']/div[1]/div[@class='box']")
            if not assessment_html:
                assessment_html = texts(
                    "//h3[contains(text(), 'Teaching')]/preceding-sibling::*[1]/following-sibling::*[position()<3]")
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_html))

            career_html = texts(
                "//div[@class='box container content pure-g']"
                "|//div[@class='onscreen-area']/div/div[@class='section teal dark']/div[@class='wrap']")
            item['career_en'] = remove_class(clear_lianxu_space(career_html))
            print("item['career_en']: ", item['career_en'])

            item['apply_proces_en'] = "http://www.port.ac.uk/application-fees-and-funding/applying-postgraduate/#mastersCourses"
            yield item
        except Exception as e:
            # Record the failure next to the failing URL, then report it.
            error_log = item['university'] + str(item['degree_type']) + ".txt"
            with open(error_log, 'a', encoding="utf-8") as f:
                f.write("\n".join(
                    (str(e), response.url, "========================\n")))
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #15

Показать файл

Файл: StGeorge's,UniversityOfLondon_P.py Проект: histudent/python_spider

    def parse_data(self, response):
        """Scrape one St George's, University of London course page.

        A fresh item from ``get_item1(ScrapyschoolEnglandItem1)`` is filled
        with fixed metadata, with the fields extracted from ``response`` and
        with content fetched from the linked sub-pages (entry requirements,
        modules, assessment, careers, application), then yielded.

        Any exception raised inside the ``try`` block is appended to
        ``<university><degree_type>.txt`` (relative path) and printed; nothing
        is yielded in that case.
        """
        # Function-scope import so the block carries its own dependency.
        from urllib.parse import urljoin

        site_root = "https://www.sgul.ac.uk"

        def absolute(href):
            # Resolve a link against the site root.  Root-relative hrefs give
            # the same URL as plain concatenation; absolute hrefs and hrefs
            # without a leading slash no longer produce a malformed URL.
            return urljoin(site_root, href)

        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "St George's, University of London"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # Degree type.
        item['degree_type'] = 2
        item['location'] = "Cranmer Terrace, London SW17 0RE"
        print("===========================")
        print(response.url)
        try:
            title_parts = response.xpath("//div[@class='inner']/h1//text()").extract()
            title = ''.join(title_parts).strip()

            # The award is read out of the title: "A/B/C", a word followed by
            # " (" or a word followed by " -"; failing that, the last word.
            degree_name = re.findall(r"\w+/\w+/\w+|\w+\s\(|\w+\s-", title)
            if not degree_name:
                degree_name = re.findall(r"\w+$", title)
            item['degree_name'] = ''.join(degree_name).replace("(", "").replace("-", "").strip()
            print("item['degree_name']: ", item['degree_name'])

            # Programme name is the title with the award text removed.
            item['programme_en'] = title.replace(item['degree_name'], "").strip()
            print("item['programme_en']: ", item['programme_en'])

            # Duration cell feeds both the study mode and numeric duration.
            duration = response.xpath("//table[1]/tbody/tr[1]/td[2]//text()").extract()
            clear_space(duration)
            duration_text = ''.join(duration)
            item['teach_time'] = getTeachTime(duration_text)

            duration_list = getIntDuration(duration_text)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]

            deadline = response.xpath(
                "//p[contains(text(),'Non-UK/EU (International) application deadline')]//text()").extract()
            clear_space(deadline)
            item['deadline'] = getStartDate(''.join(deadline).replace("Non-UK/EU (International) application deadline", "").replace(":", "").strip())

            tuition_fee = response.xpath("//p[contains(text(),'Non-EU (international): ')]//text()|"
                                         "//table//p[contains(text(),'2018 entry Non-EU')]//text()|"
                                         "//table[2]/tbody/tr[4]/td/p[contains(text(),'2018 Non-EU')]/following-sibling::*/*[1]//text()|"
                                         "//table//p[contains(text(),'2018 Non-EU')]/following-sibling::*[1]/*[1]//text()").extract()
            clear_space(tuition_fee)
            tuition_fee_re = re.findall(r"\d+,\d+", ''.join(tuition_fee))
            if tuition_fee_re:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee_re))
                item['tuition_fee_pre'] = "£"

            overview_en = response.xpath("//p[@class='first']|//table[1]/following-sibling::*[position()<last()-1]").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en)).replace("<p><button>Make an enquiry</button></p>", "").strip()

            # Entry requirements and the IELTS description come from a
            # separate page linked from this one.
            entry_url = response.xpath("//a[contains(text(),'Entry')]/@href").extract()
            if entry_url:
                entry_dict = self.parse_rntry_requirements(absolute(entry_url[0]))
                item['rntry_requirements'] = entry_dict.get('rntry_requirements')
                item['ielts_desc'] = entry_dict.get('ielts_desc')

            # ``.get`` above may return None; searching None would raise
            # TypeError and discard the whole item, so search "" instead.
            ielts_list = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'] or "")

            # Index into ``ielts_list`` for each field, keyed by how many
            # numbers were found: one score for everything, overall + one
            # shared band score, or overall + four separate band scores.
            ielts_fields = ('ielts', 'ielts_l', 'ielts_s', 'ielts_r', 'ielts_w')
            positions_by_count = {
                1: (0, 0, 0, 0, 0),
                2: (0, 1, 1, 1, 1),
                5: (0, 1, 4, 2, 3),
            }
            positions = positions_by_count.get(len(ielts_list))
            if positions is not None:
                for field, position in zip(ielts_fields, positions):
                    item[field] = ielts_list[position]

            # Modules, assessment and careers each live on a linked sub-page
            # and are processed identically, in this order.
            for field, link_xpath, parse_page in (
                    ('modules_en', "//a[contains(text(),'Module')]/@href", self.parse_modules),
                    ('assessment_en', "//a[contains(text(),'Studying')]/@href", self.parse_assessment_en),
                    ('career_en', "//a[contains(text(),'Career')]/@href", self.parse_career_en),
            ):
                hrefs = response.xpath(link_xpath).extract()
                if hrefs:
                    page_content = parse_page(absolute(hrefs[0]))
                    item[field] = remove_class(clear_lianxu_space(page_content)).replace("<p><img></p>", "").strip()

            apply_proces_en_url = response.xpath("//a[contains(text(),'Apply')]/@href|//a[contains(text(),'Application and interview')]/@href").extract()
            print("apply_proces_en_url: ", apply_proces_en_url)
            if apply_proces_en_url:
                parse_apply_proces_en_url = absolute(apply_proces_en_url[0])
                print("parse_apply_proces_en_url: ", parse_apply_proces_en_url)
                item['apply_proces_en'] = remove_class(clear_lianxu_space(self.parse_apply_proces_en(parse_apply_proces_en_url))).replace("<p><img></p>", "").strip()
            print("item['apply_proces_en']: ", item['apply_proces_en'])

            yield item
        except Exception as e:
            # Record the failure next to the failing URL, then report it.
            with open(item['university'] + str(item['degree_type']) + ".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #16

Показать файл

    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "University of Bradford"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        if "pg" in response.url:
            print("===========================")
            print(response.url)
            try:
                key_url = response.url.split("/")[-2].strip()

                programme = response.xpath(
                    "//div[@id='course-key-info']//div[@class='col-xs-12']/h1//text()"
                ).extract()
                item['programme_en'] = ''.join(programme).strip()
                print("item['programme_en']: ", item['programme_en'])

                degree_type = response.xpath(
                    "//p[@id='cAward']//text()").extract()
                item['degree_name'] = ''.join(degree_type).strip()
                print("item['degree_name']: ", item['degree_name'])

                if "phd" in item['programme_en'].lower(
                ) or item['degree_name'].lower() == "doctorate":
                    item['teach_type'] = 'phd'
                    item['degree_type'] = 3
                print("item['teach_type']: ", item['teach_type'])
                print("item['degree_type']: ", item['degree_type'])

                mode = response.xpath(
                    "//option[@value='fulltime']//text()|//span[@id='cAttendance']//text()"
                ).extract()
                clear_space(mode)
                item['teach_time'] = getTeachTime(''.join(mode))
                print("item['teach_time']: ", item['teach_time'])

                start_date_url = "https://www.bradford.ac.uk/courses/pg/pgapi.php?uri=/courses/pg/" + key_url + "/&startMonth=startMonth&level=pg&year=y2018&attendance=fulltime"
                print("start_date_url: ", start_date_url)
                start_date = json.loads(
                    requests.get(start_date_url).text).get("data")
                print("start_date: ", start_date)
                if start_date != None:
                    if "," in start_date:
                        start_date_list = start_date.split(",")
                        for s in start_date_list:
                            item['start_date'] += getStartDate(s.lower()) + ","
                    else:
                        item['start_date'] = getStartDate(
                            ''.join(start_date).lower())
                item['start_date'] = item['start_date'].strip().strip(
                    ",").strip()
                print("item['start_date']: ", item['start_date'])
                # start_date_year = response.xpath(
                #     "//div[@class='col-xs-5']//span[@id='displayYear']//text()").extract()
                # if len(start_date_year) != 0 and item['start_date'] != "":
                #     item['start_date'] = ''.join(start_date_year).strip() + "-" + item['start_date']
                # else:
                #     item['start_date'] = ''.join(start_date_year).strip()
                # print("item['start_date']: ", item['start_date'])

                item['location'] = 'Bradford West Yorkshire BD7 1DP UK'
                # print("item['location']: ", item['location'])

                duration_url = "https://www.bradford.ac.uk/courses/pg/pgapi.php?uri=/courses/pg/" + key_url + "/&duration=duration&level=pg&year=y2018&attendance=fulltime"
                # print("duration_url: ", duration_url)
                duration = json.loads(
                    requests.get(duration_url).text).get("data")
                # print("duration: ", duration)
                if duration != None:
                    duration_list = getIntDuration(''.join(duration))
                    if len(duration_list) == 2:
                        item['duration'] = duration_list[0]
                        item['duration_per'] = duration_list[-1]
                # print("item['duration'] = ", item['duration'])
                # print("item['duration_per'] = ", item['duration_per'])

                overview_en = response.xpath(
                    "//div[@id='overviewStripe']").extract()
                item['overview_en'] = remove_class(
                    clear_lianxu_space(overview_en))
                # print("item['overview_en']: ", item['overview_en'])

                entry_requirements = response.xpath(
                    "//div[@id='course-entry']//text()|//div[@id='nav-course-entry']//text()"
                ).extract()
                entry_requirements_str = ''.join(entry_requirements).strip()
                item['rntry_requirements'] = clear_lianxu_space(
                    entry_requirements)
                # print("item['rntry_requirements']: ", item['rntry_requirements'])

                ielts_desc = response.xpath(
                    "//div[@id='course-entry']//*[contains(text(),'IELTS')]//text()|"
                    "//div[@id='nav-course-entry']//*[contains(text(),'IELTS')]//text()"
                ).extract()

                # print("ielts_desc: ", ielts_desc)
                item['ielts_desc'] = ''.join(ielts_desc).strip()
                # print("item['ielts_desc']: ", item['ielts_desc'])

                ielts_dict = get_ielts(item['ielts_desc'])
                item['ielts'] = ielts_dict.get('IELTS')
                item['ielts_l'] = ielts_dict.get('IELTS_L')
                item['ielts_s'] = ielts_dict.get('IELTS_S')
                item['ielts_r'] = ielts_dict.get('IELTS_R')
                item['ielts_w'] = ielts_dict.get('IELTS_W')
                # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                # item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

                if item['ielts'] is None:
                    ielts_desc = re.findall(r"IELTS.{1,100}",
                                            entry_requirements_str)
                    clear_space(ielts_desc)
                    item['ielts_desc'] = ''.join(ielts_desc).strip()
                print("item['ielts_desc']: ", item['ielts_desc'])

                ielts_dict = get_ielts(item['ielts_desc'])
                item['ielts'] = ielts_dict.get('IELTS')
                item['ielts_l'] = ielts_dict.get('IELTS_L')
                item['ielts_s'] = ielts_dict.get('IELTS_S')
                item['ielts_r'] = ielts_dict.get('IELTS_R')
                item['ielts_w'] = ielts_dict.get('IELTS_W')
                print(
                    "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                    % (item['ielts'], item['ielts_l'], item['ielts_s'],
                       item['ielts_r'], item['ielts_w']))

                toefl_desc = re.findall(r"TOEFL.{1,250}",
                                        entry_requirements_str)
                clear_space(toefl_desc)
                item['toefl_desc'] = ''.join(toefl_desc).strip()
                # print("item['toefl_desc']: ", item['toefl_desc'])

                toefl_list = re.findall(r"\d\d+", item['toefl_desc'])
                # print(toefl_list)
                if len(toefl_list) == 1:
                    item['toefl'] = toefl_list[0]
                    # item['toefl_l'] = toefl_list[0]
                    # item['toefl_s'] = toefl_list[0]
                    # item['toefl_r'] = toefl_list[0]
                    # item['toefl_w'] = toefl_list[0]
                elif len(toefl_list) == 2:
                    item['toefl'] = toefl_list[0]
                    item['toefl_l'] = toefl_list[1]
                    item['toefl_s'] = toefl_list[1]
                    item['toefl_r'] = toefl_list[1]
                    item['toefl_w'] = toefl_list[1]
                elif len(toefl_list) == 5:
                    item['toefl'] = toefl_list[0]
                    item['toefl_l'] = toefl_list[1]
                    item['toefl_s'] = toefl_list[3]
                    item['toefl_r'] = toefl_list[2]
                    item['toefl_w'] = toefl_list[4]
                # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
                #                             item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

                modules = response.xpath(
                    "//div[@id='course-curriculum']").extract()
                item['modules_en'] = remove_class(clear_lianxu_space(modules))
                # print("item['modules_en']: ", item['modules_en'])

                assessment_en = response.xpath(
                    "//div[@class='row stripe background--green']").extract()
                item['assessment_en'] = remove_class(
                    clear_lianxu_space(assessment_en))
                # print("item['assessment_en']: ", item['assessment_en'])

                tuition_fee = response.xpath(
                    "//div[@id='tuitionFees']//p[contains(text(),'International:')]//text()"
                ).extract()
                if len(tuition_fee) == 0:
                    tuition_fee = response.xpath(
                        "//div[@id='tuitionFees']//text()").extract()
                clear_space(tuition_fee)
                # print("tuition_fee: ", tuition_fee)
                tuition_fee_re = re.findall(r"£\d+,\d+", ''.join(tuition_fee))

                if len(tuition_fee_re) > 0:
                    item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
                    item['tuition_fee_pre'] = "£"
                else:
                    print("***")
                print("item['tuition_fee']: ", item['tuition_fee'])
                print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

                career_en = response.xpath(
                    "//div[@id='nav-course-career']").extract()
                item['career_en'] = remove_class(
                    clear_lianxu_space(career_en)).replace("<div></div>",
                                                           "").strip()
                # print("item['career_en']: ", item['career_en'])

                # apply_url_key = response.url.split("/")
                # print(apply_url_key)
                apply_url = "https://www.bradford.ac.uk/courses/pg/pgapi.php?uri=/courses/pg/" + key_url + "/&applyCTAModal=applyCTAModal&level=pg&year=y2018&attendance=fulltime"
                # print("apply_url: ", apply_url)
                apply = json.loads(requests.get(apply_url).text).get("data")
                if apply != None:
                    item['apply_proces_en'] = remove_class(
                        clear_lianxu_space([apply]))
                # print("item['apply_proces_en']: ", item['apply_proces_en'])

                item['require_chinese_en'] = remove_class(
                    clear_lianxu_space([
                        """<div class="entryReq __postgraduate"><h3>Postgraduate</h3><p>The entry requirement for a postgraduate taught course is typically equivalent to a UK Second Class Honours Second Division (2:2). For individual course requirements, please see the course details in the <a href="/courses/pg/">postgraduate course listings</a>.</p>
<p>The table below shows how the University equates qualifications from your country to UK degree classifications:</p>
<table>
<tbody>
<tr><th>Qualification&nbsp;</th><th>UK 1st Class&nbsp;</th><th>UK 2:1&nbsp;</th><th>UK 2:2&nbsp;</th></tr>
<tr>
<td>Bachelor Degree 学士学位</td>
<td>85%</td>
<td>80%</td>
<td>70%</td>
</tr>
</tbody>
</table></div>
"""
                    ]))
                # print("item['require_chinese_en']: ", item['require_chinese_en'])

                department_dict = {
                    "Advanced Biomedical Engineering":
                    "Engineering & Informatics",
                    "Advanced Chemical and Petroleum Engineering":
                    "Engineering & Informatics",
                    "Advanced Civil and Structural Engineering":
                    "Engineering & Informatics",
                    "Advanced Mechanical Engineering":
                    "Engineering & Informatics",
                    "Big Data Science and Technology":
                    "Engineering & Informatics",
                    "Cyber Security": "Engineering & Informatics",
                    "Filmmaking": "Engineering & Informatics",
                    "Internet of Things (IoT)": "Engineering & Informatics",
                    "Nursing Studies (International)": "Health Studies",
                    "PhD (Faculty of Health Studies)": "Health Studies",
                    "Public Health": "Health Studies",
                    "Analytical Sciences": "Life Sciences",
                    "Analytical Sciences": "Life Sciences",
                    "Archaeological Sciences": "Life Sciences",
                    "Archaeological Sciences": "Life Sciences",
                    "Bioinformatics and Computational Biosciences":
                    "Life Sciences",
                    "Cancer Drug Discovery": "Life Sciences",
                    "Cancer Pharmacology": "Life Sciences",
                    "Doctorate in Medicine": "Life Sciences",
                    "Drug Toxicology and Safety Pharmacology": "Life Sciences",
                    "Forensic Archaeology and Crime Scene Investigation":
                    "Life Sciences",
                    "Forensic Archaeology and Crime Scene Investigation":
                    "Life Sciences",
                    "Human Osteology and Palaeopathology": "Life Sciences",
                    "Human Osteology and Palaeopathology": "Life Sciences",
                    "Materials Chemistry": "Life Sciences",
                    "Medical Bioscience": "Life Sciences",
                    "Optometry Progression to Pre-registration Period":
                    "Life Sciences",
                    "Pharmaceutical Technology and Medicines Control":
                    "Life Sciences",
                    "PhD (School of Pharmacy and Medical Sciences)":
                    "Life Sciences",
                    "Skin Sciences and Regenerative Medicine": "Life Sciences",
                    "Applied Management and Entrepreneurship":
                    "Management & Law",
                    "European and International Business Management":
                    "Management & Law",
                    "Finance and Investment": "Management & Law",
                    "Finance, Accounting and Management": "Management & Law",
                    "Financial Management": "Management & Law",
                    "MSc Human Resource Management (CIPD Accreditation)":
                    "Management & Law",
                    "International Business and Management":
                    "Management & Law",
                    "International Commercial Law": "Management & Law",
                    "International Human Rights and Development":
                    "Management & Law",
                    "International Legal Studies": "Management & Law",
                    "International Strategic Marketing": "Management & Law",
                    "Logistics, Data Analytics and Supply Chain Management":
                    "Management & Law",
                    "Management": "Management & Law",
                    "Marketing and Management": "Management & Law",
                    "Natural Resources and Environmental Law and Policy":
                    "Management & Law",
                    "PhD (School of Law)": "Management & Law",
                    "PhD (School of Management)": "Management & Law",
                    "Advanced Practice in Peacebuilding and Conflict Resolution":
                    "Social Sciences",
                    "Economics and Finance for Development": "Social Sciences",
                    "International Development Management": "Social Sciences",
                    "International Relations and Security Studies":
                    "Social Sciences",
                    "Peace, Conflict and Development": "Social Sciences",
                    "Peace, Resilience and Social Justice": "Social Sciences",
                    "PhD (Faculty of Social Sciences)": "Social Sciences",
                    "Project Planning and Management": "Social Sciences",
                    "Psychology": "Social Sciences",
                    "Psychology of Health and Wellbeing": "Social Sciences",
                    "Social Work": "Social Sciences",
                    "Sociology, Social Policy and Crime": "Social Sciences",
                    "Sustainable Development": "Social Sciences",
                }
                item['department'] = department_dict.get(
                    item['programme_en'].strip())
                # print("item['department']: ", item['department'])
                if item['teach_time'] == "fulltime":
                    yield item
            except Exception as e:
                with open(item['university'] + str(item['degree_type']) +
                          ".txt",
                          'a',
                          encoding="utf-8") as f:
                    f.write(
                        str(e) + "\n" + response.url +
                        "\n========================\n")
                print("异常：", str(e))
                print("报错url：", response.url)

Пример #17

Показать файл

Файл: TheUniversityOfEdinburgh_R.py Проект: histudent/python_spider

    def parse_data(self, response):
        """Parse a University of Edinburgh research-programme page into an item.

        Fills the programme name, award, study mode, department, location,
        start date, overview, duration, modules, career text, entry
        requirements, IELTS/TOEFL scores and tuition fee, then yields the
        item.  If anything raises during extraction, the error and the page
        URL are appended to "<university><degree_type>.txt" and printed, and
        no item is yielded.

        :param response: response object of the programme page; queried
            with ``response.xpath(...)``.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "The University of Edinburgh"
        item['url'] = response.url
        # Teaching mode
        item['teach_type'] = 'phd'
        # Degree type
        item['degree_type'] = 3
        print("===========================")
        print(response.url)
        try:
            # Programme name
            programme = response.xpath(
                "//h1[@itemprop='headline']//text()").extract()
            clear_space(programme)
            item['programme_en'] = ''.join(programme)
            print("item['programme_en']: ", item['programme_en'])

            degree_name = response.xpath(
                "//span[contains(text(),'Awards:')]/../text()").extract()
            if degree_name:
                item['degree_name'] = degree_name[0]
            print("item['degree_name']: ", item['degree_name'])

            teach_time = response.xpath(
                "//span[contains(text(),'Study modes:')]/../text()").extract()
            item['teach_time'] = getTeachTime(''.join(teach_time))

            # NOTE(review): the innermost class below is 'col-xs 12' (with a
            # space) whereas the location query uses 'col-xs-12' -- possibly a
            # typo; confirm against the live page before changing it.
            department = response.xpath(
                "//div[@class='col-xs-12']//div[@class='row']//div[@class='col-xs 12']//ul//li//span[contains(text(),'College:')]/following-sibling::*//text()").extract()
            if not department:
                # Fall back to the "School:" entry when no "College:" is listed.
                department = response.xpath(
                    "//div[@class='col-xs-12']//div[@class='row']//div[@class='col-xs 12']//ul//li//span[contains(text(),'School:')]/following-sibling::a[1]/text()").extract()
            clear_space(department)
            item['department'] = ''.join(department).strip()

            location = response.xpath(
                "//div[@class='col-xs-12']//div[@class='row']//div[@class='col-xs-12']//ul[@class='addressList']//li[@class='contactCampus']/text()").extract()
            clear_space(location)
            item['location'] = ''.join(location).strip()

            start_date = response.xpath(
                "//select[@name='code2']//option//text()").extract()
            clear_space(start_date)
            if start_date:
                # Only the first option is used as the start date.
                item['start_date'] = getStartDate(start_date[0].strip())

            overview = response.xpath(
                "//div[@id='proxy_collapseresearch_profile']/..").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))

            duration = response.xpath(
                "//table[@class='table table-striped']//tbody//tr[1]/td[3]//text()").extract()
            clear_space(duration)
            duration = ''.join(duration).strip()
            duration_int = re.findall(r"\d+", duration)
            if duration_int:
                # Take the first number only; concatenating every digit group
                # would turn a text such as "3-4 years" into 34.
                item['duration'] = int(duration_int[0])

            # Unit code: 1 for years, 3 for months (months win if both appear).
            if "year" in duration or "Year" in duration:
                item['duration_per'] = 1
            if "month" in duration or "Month" in duration:
                item['duration_per'] = 3

            modules1 = response.xpath(
                "//div[@id='proxy_collapsehow_taught']/div/*[position()<=last()]").extract()
            modules2url = response.xpath(
                "//html//tr[1]/td[5]/a/@href").extract()
            modules2 = ""
            if modules2url:
                modules2 = self.get_modules2(''.join(modules2url))
            item['modules_en'] = remove_class(clear_lianxu_space(list(modules1)))
            if modules2 != "":
                item['modules_en'] += "\n" + modules2

            career = response.xpath(
                "//div[@id='proxy_collapsecareer_opp']/..").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))

            entry_requirements = response.xpath(
                "//div[@id='proxy_collapseentry_req']/..//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)

            IELTS = response.xpath("//abbr[contains(text(),'IELTS')]/..//text()").extract()
            item['ielts_desc'] = ''.join(IELTS)
            print("item['ielts_desc']: ", item['ielts_desc'])

            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            TOEFL = response.xpath("//abbr[contains(text(),'TOEFL')]/..//text()").extract()
            item['toefl_desc'] = ''.join(TOEFL)
            print("item['toefl_desc']: ", item['toefl_desc'])

            toeflDict = get_toefl(item['toefl_desc'])
            item['toefl'] = toeflDict.get("TOEFL")
            item['toefl_l'] = toeflDict.get("TOEFL_L")
            item['toefl_s'] = toeflDict.get("TOEFL_S")
            item['toefl_r'] = toeflDict.get("TOEFL_R")
            item['toefl_w'] = toeflDict.get("TOEFL_W")
            print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
                    item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            # The fee lives on a separate page linked from the fees section.
            tuition_fee_url = response.xpath("//div[@id='proxy_collapsefees']//ul/li/a[contains(text(),'Full')]/@href").extract()
            if tuition_fee_url:
                fee = self.parse_tuition_fee(tuition_fee_url[0])
                clear_space(fee)
                fee_re = re.findall(r"£\d+,\d+", ''.join(fee))
                item['tuition_fee'] = getTuition_fee(''.join(fee_re))
                item['tuition_fee_pre'] = "£"

            item['require_chinese_en'] = "https://www.ed.ac.uk/studying/international/postgraduate-entry/asia/china"
            item['apply_proces_en'] = "https://www.ed.ac.uk/studying/postgraduate/applying"

            yield item
        except Exception as e:
            # Best-effort scraping: record the failure and move on.  The
            # trailing newline keeps successive log entries on separate lines.
            with open(item['university']+str(item['degree_type'])+".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #18

Показать файл

Файл: UniversityOfLincoln_P.py Проект: histudent/python_spider

    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "https://www.lincoln.ac.uk/"
        item['university'] = "University of Lincoln"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        item[
            'location'] = 'University of Lincoln, Brayford Pool, Lincoln, LN6 7TS'
        print("===========================")
        print(response.url)
        try:
            # //table[@id='newTitle']/tbody[@id='newTitleBody']/tr/td/h1[1]/a
            programmeDegreetype = response.xpath(
                "//div[@id='CourseTitleApms']//h1[@class='nd_2019-20']//text()"
            ).extract()
            clear_space(programmeDegreetype)
            # print("programmeDegreetype: ", programmeDegreetype)
            programmeDegreetypeStr = ''.join(programmeDegreetype)

            degree_type = re.findall(
                r"^(M\w+\sby\sResearch\s/[/\w]+\s|M\w+\sby\sResearch|PG\s\w+|\w+/\w+|\w+)",
                programmeDegreetypeStr)
            # print("degree_type: ", degree_type)
            item['degree_name'] = ''.join(degree_type)
            print("item['degree_name']: ", item['degree_name'])

            if "phd" in item['degree_name'].lower():
                item['teach_type'] = 'phd'
                item['degree_type'] = 3
            if "by research" in item['degree_name'].lower(
            ) or item['degree_name'] == "MRes":
                item['teach_type'] = 'research'
                item['degree_type'] = 3
            print("item['teach_type']: ", item['teach_type'])
            # print("item['degree_type']: ", item['degree_type'])

            programme = programmeDegreetypeStr.replace(''.join(degree_type),
                                                       '')
            # if len(programme) > 0:
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            # //span[@id='durationFT']
            duration = response.xpath(
                "//span[contains(text(),'Full-time Duration')]/..//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_str = ''.join(duration)

            item['teach_time'] = getTeachTime(duration_str)
            duration_list = getIntDuration(duration_str)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['teach_time'] = ", item['teach_time'])
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            department = response.xpath(
                "//span[contains(text(),'School:')]/following-sibling::a//text()"
            ).extract()
            clear_space(department)
            if len(department) > 0:
                item['department'] = department[0]
            print("item['department']: ", item['department'])

            dep_dict = {
                "lincoln school of architecture and the built environment":
                "College of Arts",
                "lincoln school of design":
                "College of Arts",
                "lincoln school of film and media":
                "College of Arts",
                "school of english and journalism":
                "College of Arts",
                "school of fine and performing arts":
                "College of Arts",
                "school of history and heritage":
                "College of Arts",
                "school of chemistry":
                "College of Science",
                "school of computer science":
                "College of Science",
                "school of engineering":
                "College of Science",
                "school of geography":
                "College of Science",
                "school of life sciences":
                "College of Science",
                "school of mathematics and physics":
                "College of Science",
                "school of pharmacy":
                "College of Science",
                "national centre for food manufacturing":
                "College of Science",
                "lincoln institute for agri-tech":
                "College of Science",
                "school of education":
                "College of Social Science",
                "school of health and social care":
                "College of Social Science",
                "professional development centre":
                "College of Social Science",
                "lincoln law school":
                "College of Social Science",
                "school of psychology":
                "College of Social Science",
                "school of social and political sciences":
                "College of Social Science",
                "school of sport and exercise science":
                "College of Social Science",
            }
            if item['department'] != "Lincoln Business School":
                item['department'] = dep_dict.get(item['department'].lower())
            print("item['department']1: ", item['department'])

            if item['department'] == None:
                item['department'] = ''.join(
                    response.xpath(
                        "//div[@class='breadcrumb-list']//span//a[@href='/home/collegeofsocialscience/']//text()"
                    ).extract()).strip()
                print("item['department']2: ", item['department'])

            # //div[@id='feesTables']/table
            fee = response.xpath(
                "//td[contains(text(),'International')]/following-sibling::td//text()"
            ).extract()
            clear_space(fee)
            # print("fee: ", fee)
            feeStr = ''.join(fee)
            tuitionfee = getTuition_fee(feeStr)
            item['tuition_fee'] = tuitionfee
            if item['tuition_fee'] == 0:
                item['tuition_fee'] = None
            else:
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            # //h2[contains(text(),'The Course')]/..
            overview = response.xpath(
                "//h2[contains(text(),'The Course')]/..").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en']: ", item['overview_en'])

            modules_en = response.xpath(
                """//body/section[@class='container basic-accordion']/div[@class='row']/div[@class='col-md-9 no-gutters']/div[@id='accordion']/div[@class="nd_2019-20"]//a[contains(text(),'How you study')]/../../..|
                                        //body/section[@class='container basic-accordion']/div[@class='row']/div[@class='col-md-9 no-gutters']/div[@id='accordion']/div[@class="nd_2019-20"]//a[contains(text(),'How You Study')]/../../..|
                                        //body/section[@class='container basic-accordion']/div[@class='row']/div[@class='col-md-9 no-gutters']/div[@id='accordion']/div[@class="nd_2019-20"]//a[contains(text(),'Modules')]/../../..|
                                        //body/section[@class='container basic-accordion']/div[@class='row']/div[@class='col-md-9 no-gutters']/div[@id='accordion']/div[@class="nd_2019-20"]//a[contains(text(),'Research Areas, Projects & Topics')]/../../.."""
            ).extract()
            if len(modules_en) == 0:
                modules_en = response.xpath(
                    """//a[contains(text(),'How you study')]/../../..|
                                        //a[contains(text(),'How You Study')]/../../..|
                                        //a[contains(text(),'Modules')]/../../..|
                                        //a[contains(text(),'Research Areas, Projects & Topics')]/../../.."""
                ).extract()
            # 需要去除的多余的内容
            del_modules_en = response.xpath(
                "//div[@id='collapse62019-20']//div[@id='modulePanelPrint']"
            ).extract()
            del_modules_en_str = remove_class(
                clear_lianxu_space(del_modules_en))
            print(modules_en)
            item['modules_en'] = remove_class(
                clear_lianxu_space(modules_en)).replace(
                    del_modules_en_str, '').strip()
            if item['modules_en'] == "":
                item['modules_en'] = None
                # print("*** modules_en")
            else:
                print("===", item['modules_en'])
                del_cont = re.findall(
                    r"<br>Find out more</p><div><span>.*?</em></span>",
                    item['modules_en'])
                print("del_cont==", del_cont)
                if len(del_cont) > 0:
                    for delc in del_cont:
                        item['modules_en'] = item['modules_en'].replace(
                            delc, '<div>').strip()
            print("item['modules_en']: ", item['modules_en'])

            assessment_en = response.xpath(
                "//a[contains(text(),'How You Are Assessed')]/../../..|//a[contains(text(),'How you are assessed')]/../../.."
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            interview_desc_en = response.xpath(
                "//a[contains(text(),'Interviews & Applicant Days')]/../../.."
            ).extract()
            item['interview_desc_en'] = remove_class(
                clear_lianxu_space(interview_desc_en))
            # print("item['interview_desc_en']: ", item['interview_desc_en'])

            rntry_requirements = response.xpath(
                "//a[contains(text(),'Entry Requirements')]/../../..//text()|//a[contains(text(),'Entry requirements')]/../../..//text()"
            ).extract()
            item['rntry_requirements'] = clear_lianxu_space(rntry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts = re.findall(r"IELTS.{1,80}", item['rntry_requirements'])
            item['ielts_desc'] = ''.join(ielts).strip()
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            career = response.xpath("//div[@id='CourseCareersApms']").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            # http://www.lincoln.ac.uk/home/studywithus/internationalstudents/entryrequirementsandyourcountry/china/
            item["require_chinese_en"] = remove_class(
                clear_lianxu_space([
                    """<p><strong>Master's</strong></p>
<p>Prospective students require one of the following qualifications:</p>
<ul>
<li>A Chinese degree from a recognised institution with a minimum average grade of 70% (GPA 2.5), some programmes may require 80% or a GPA 3.0</li>
<li>Successful completion of a UK Bachelor degree with a minimum grade of 2:2</li>
<li>Students with a three year Chinese Diploma who have gained at least 3 years full-time relevant work experience may be considered for our MBA programme on a case by case basis</li>
</ul>"""
                ]))
            if item['teach_type'] == "phd":
                item['require_chinese_en'] = remove_class(
                    clear_lianxu_space([
                        """<p><strong>PhD</strong></p>
<p><span>Successful completion of a Master's Degree from a recognised institution.</span></p>
"""
                    ]))
            # print("item['require_chinese_en']: ", item['require_chinese_en'])

            if item['ielts_desc'] == "":
                item[
                    'ielts_desc'] = "Prospective students require IELTS 6.0 (with no less than 5.5 in each band score) or an equivalent qualification. Please note that some courses require a higher score."
                item['ielts'] = 6.0
                item['ielts_l'] = 5.5
                item['ielts_s'] = 5.5
                item['ielts_r'] = 5.5
                item['ielts_w'] = 5.5
            # print("******item['ielts_desc']: ", item['ielts_desc'])
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            # http://www.lincoln.ac.uk/home/studywithus/internationalstudents/englishlanguagerequirementsandsupport/englishlanguagerequirements/
            if item['ielts'] == "6.5":
                item['toefl'] = 90
                item['toefl_l'] = 20
                item['toefl_s'] = 22
                item['toefl_r'] = 21
                item['toefl_w'] = 22
            elif item['ielts'] == "7.0":
                item['toefl'] = 100
                item['toefl_l'] = 22
                item['toefl_s'] = 23
                item['toefl_r'] = 23
                item['toefl_w'] = 23
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #         item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<h4 class="h2">Follow these five simple steps to apply for a postgraduate course at Lincoln:</h4>
<p class="h2">1. Find your course</p>
<p>On this website you will find an overview of the <a href="/home/studywithus/postgraduatestudy/">postgraduate courses</a> available at the University of Lincoln.</p>
<p>Choose the course you wish to study, making sure you check the entry requirements.</p>
<p>We strongly recommend you attend a <a href="/home/studywithus/opendaysandvisits/postgraduatetasterdays/">Postgraduate Taster Day</a> to find out more.</p>
<p class="h2">2. Check for a closing date</p>
<p>Most of our postgraduate courses have no official closing date for applications. The majority of our taught courses start in September, although some courses have intakes in January or February. Please allow enough time for your application to be considered prior to the start date. If you are an international student you may need to factor in time for your visa application. We would advise you to apply as soon as possible.</p>
<p class="h2">3. Are you eligible for a postgraduate loan or scholarship?</p>
<p>The government has announced a new system of Postgraduate Loans where eligible full-time and part-time students could borrow up to &pound;10,609 towards the cost of a taught postgraduate Master&rsquo;s qualification. <a href="/home/studywithus/postgraduatestudy/feesandfunding/">Visit our Postgraduate Fees and Funding page</a> to find out more. The University of Lincoln also offers a range of postgraduate <a href="/home/studywithus/scholarshipsandbursaries/">scholarships</a>.</p>
<p class="h2">4. Research candidates only - compose your research proposal</p>
<p>If you are applying for a research programme, you will need to draft your research proposal. In your application you will be asked to give a description of the topic or theme you intend to research.</p>
<p class="h2">5. Apply online</p>
<p>When you have found the course you are interested in, go to the course page and click <a href="https://my.lincoln.ac.uk/welcome/pages/login.aspx" target="_blank">&lsquo;Apply Online&rsquo;</a>.</p>
<p>You will need to register with us first to proceed.</p>"""
                ]))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])
            yield item
        except Exception as e:
            with open(item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #19

Показать файл

Файл: UniversityOfPortsmouth_P.py Проект: histudent/python_spider

    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "https://www.port.ac.uk/"
        item['university'] = "University of Portsmouth"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        item[
            'location'] = 'University House, Winston Churchill Avenue, Portsmouth PO1 2UP'
        print("===========================")
        print(response.url)
        try:
            # //div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1
            programme = response.xpath(
                "//div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1//text()|"
                "//h1[@class='Title']/text()").extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            # //div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1
            degree_type = response.xpath(
                "//div[@class='course_title']/span//text()|"
                "//h1[@class='Title']/small//text()").extract()
            item['degree_name'] = ''.join(degree_type).strip()
            print("item['degree_name']: ", item['degree_name'])

            # //div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1
            department = response.xpath(
                "//div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1//text()|"
                "//dt[contains(text(), 'Department')]/following-sibling::dd[1]//text()"
            ).extract()
            item['department'] = ''.join(department)
            # print("item['department']: ", item['department'])

            item['start_date'] = response.meta.get(response.url)
            print("item['start_date']1 = ", item['start_date'])
            if item['start_date'] is not None:
                if "," in item['start_date']:
                    start_date_re = item['start_date'].split(',')
                    start_date_str = ""
                    for s in start_date_re:
                        start_date_str += getStartDate(s) + ","
                    item['start_date'] = start_date_str.strip().strip(
                        ',').strip()
            print("item['start_date'] = ", item['start_date'])

            # //div[@class='video']/div[@class='video_title']/div/div[@class='course_title']/h1
            duration = response.xpath(
                "//div[contains(text(),'Duration')]/following-sibling::*//text()|"
                "//dt[contains(text(), 'Duration')]/following-sibling::dd[1]//text()|"
                "//dt[contains(text(), 'duration')]/following-sibling::dd[1]//text()"
            ).extract()
            clear_space(duration)
            print("duration: ", duration)
            duration_str = ''.join(duration)
            item['other'] = duration_str
            item['teach_time'] = getTeachTime(duration_str)

            duration_list = getIntDuration(duration_str)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            print("item['teach_time'] = ", item['teach_time'])
            print("item['duration'] = ", item['duration'])
            print("item['duration_per'] = ", item['duration_per'])

            location = response.xpath(
                "//div[contains(text(),'Location')]/following-sibling::*//text()"
            ).extract()
            item['location'] = ''.join(location)
            # print("item['location']: ", item['location'])

            # //strong[contains(text(),'International students')]/../following-sibling::p[1]
            tuition_fee = response.xpath(
                " //strong[contains(text(),'International students')]/../following-sibling::p//text()"
            ).extract()
            clear_space(tuition_fee)
            # print("tuition_fee: ", tuition_fee)
            tuition_fee_re = re.findall(
                r"Full\stime:\s£\d+,\d+|Full\stime\s£\d+,\d+",
                ''.join(tuition_fee))
            # print("tuition_fee_re: ", tuition_fee_re)
            if len(tuition_fee_re) > 0:
                item['tuition_fee'] = int(tuition_fee_re[0].replace(
                    "Full time",
                    "").replace(":", "").replace(",", "").replace("£",
                                                                  "").strip())
            # print("item['tuition_fee']: ", item['tuition_fee'])

            if item['tuition_fee'] == None:
                # //strong[contains(text(),'International students')]/../following-sibling::p[1]
                tuition_fee = response.xpath(
                    "//h3[contains(text(),'Tuition fees')]/..//*[contains(text(),'International students')]/following-sibling::*//*[contains(text(),'Full')]//text()|"
                    "//h3[contains(text(),'Tuition fees')]/..//*[contains(text(),'International students')]/../following-sibling::*//*[contains(text(),'Full')]//text()|"
                    "//h3[contains(text(),'Tuition fees')]/..//h4[contains(text(),'Full-time')]/following-sibling::*[position()<3]//*[contains(text(),'International students')]/../text()"
                ).extract()
                clear_space(tuition_fee)
                # print("tuition_fee: ", tuition_fee)
                tuition_fee_re = re.findall(r"£\d+,\d+", ''.join(tuition_fee))
                # print("tuition_fee_re: ", tuition_fee_re)
                if len(tuition_fee_re) > 0:
                    item['tuition_fee'] = int(tuition_fee_re[0].replace(
                        ",", "").replace("£", "").strip())
                # print("item['tuition_fee']: ", item['tuition_fee'])

            rntry_requirements_content = response.xpath(
                "//h3[contains(text(),'Key Facts')]/..//text()").extract()
            clear_space(rntry_requirements_content)
            # print("rntry_requirements_content: ", rntry_requirements_content)
            if "2018 ENTRY REQUIREMENTS" in rntry_requirements_content:
                rntry_requirements_index = rntry_requirements_content.index(
                    "2018 ENTRY REQUIREMENTS")
                if "Fees" in rntry_requirements_content:
                    rntry_requirements_indexEnd = rntry_requirements_content.index(
                        "Fees")
                    item['rntry_requirements'] = clear_lianxu_space(
                        rntry_requirements_content[rntry_requirements_index:
                                                   rntry_requirements_indexEnd]
                    )
            if "2018 entry requirements" in rntry_requirements_content:
                rntry_requirements_index = rntry_requirements_content.index(
                    "2018 entry requirements")
                if "Fees" in rntry_requirements_content:
                    rntry_requirements_indexEnd = rntry_requirements_content.index(
                        "Fees")
                    item['rntry_requirements'] = clear_lianxu_space(
                        rntry_requirements_content[rntry_requirements_index:
                                                   rntry_requirements_indexEnd]
                    )
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            if item['rntry_requirements'] == "":
                rntry_requirements_content = response.xpath(
                    "//div[contains(text(),'Entry Requirements')]/../../..//div[contains(text(),'2018 start')]/../../../..//text()|"
                    "//div[contains(text(),'Entry requirements')]/../../..//div[contains(text(),'2018 start')]/../../../..//text()"
                ).extract()
                item['rntry_requirements'] = clear_lianxu_space(
                    rntry_requirements_content)
            print("item['rntry_requirements']: ", item['rntry_requirements'])

            ieltsList = re.findall(r".{1,45}IELTS.{1,45}",
                                   item['rntry_requirements'])
            item['ielts_desc'] = ''.join(ieltsList).strip()
            # print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            overview = response.xpath(
                """//h2[@id='overview']/..|//h3[contains(text(),'What you’ll experience')]/..|//*[contains(text(),"What you'll experience")]/..|
                                    //h4[contains(text(),"On this course, you'll:")]/../..|//h3[contains(text(),"What you'll experience")]/../preceding-sibling::*[2]|
                                    //h3[contains(text(),'Why take this course?')]/../*[not(@class='blockquote-img')]"""
            ).extract()
            item['overview_en'] = remove_class(
                clear_lianxu_space(overview)
            ).replace(
                "<div><div>Get a prospectus</div><div>Book an Open Evening</div><div>Apply Now</div></div>",
                "").strip()
            print("item['overview_en']: ", item['overview_en'])

            modules = response.xpath(
                "//h2[@id='What youll study']/..|//h2[@id='What youll study']/../following-sibling::div[1]|//div[contains(text(),'Units currently being studied')]/../../..|"
                "//h3[@id='structure']/../../following-sibling::div[1]"
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])

            teaching_assessment = response.xpath(
                "//h2[@id='Teaching']/..|//h2[@id='Teaching']/../following-sibling::*[1]|"
                "//h2[@id='How youre assessed']/..|//h2[@id='How youre assessed']/../following-sibling::*[1]|"
                "//div[@class='pure-g purple content']/div[1]/div[@class='box']"
            ).extract()
            if len(teaching_assessment) == 0:
                teaching_assessment = response.xpath(
                    "//h3[contains(text(), 'Teaching')]/preceding-sibling::*[1]/following-sibling::*[position()<3]"
                ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(teaching_assessment))
            # print("item['assessment_en']: ", item['assessment_en'])

            career = response.xpath(
                "//h3[contains(text(),'Careers and opportunities')]/..|"
                "//div[@class='box container content pure-g']").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<h3>Postgraduate courses</h3>
<p>For entry to our postgraduate Master's programmes, you'll usually need to have one of the following from a recognised Higher Education institution:</p>
<ul>
    <li>a Bachelor's degree (normally from a four year undergraduate programme)</li>
    <li>a Bachelor's degree from Higher Education Self-Study Examinations (full time)</li>
    <li>a top-up degree or university-recognised Pre-Master&rsquo;s Foundation programme</li>
</ul>
<p>Typical minimum Grade Point Average (GPA) requirements:</p>
<ul>
    <li>From 2.8 on a scale of 1-4</li>
    <li>From 7 on a scale of 1-10</li>
</ul>
<p>If you don't meet the postgraduate entry requirements, you can do a pre-Master's programme at<a rel="noopener noreferrer" href="http://www.icp.navitas.com/"></a><a rel="noopener noreferrer" href="https://www.icp.navitas.com/" target="_blank">International College Portsmouth (ICP)</a>&nbsp;for many of our courses.</p>"""
                ]))
            item[
                'apply_proces_en'] = "https://www.port.ac.uk/study/international-students/how-to-apply"
            yield item
        except Exception as e:
            with open("scrapySchool_England/error/" + item['university'] +
                      str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #20

Показать файл

Файл: UniversityOfBristol_P.py Проект: histudent/python_spider

    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "University of Bristol"
        # items['country'] = "England"
        # items["website"] = "https://www.bristol.ac.uk/"
        item['url'] = response.url
        # 授课方式
        # item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        print("===========================")
        print(response.url)
        try:
            # 专业
            course = response.xpath("//h1[@id='pagetitle']/span//text()").extract()
            # print("course = ", course)
            item['programme_en'] = ''.join(course).replace("\n", " ").replace("\r", " ").strip()
            print("item['programme_en']: ", item['programme_en'])

            # degreeaward
            degreeaward = response.xpath("//th[contains(text(),'Awards available')]/following-sibling::td[1]//text()").extract()
            # print("degreeaward = ", degreeaward)
            item['degree_name'] = clear_space_str(''.join(degreeaward))
            print("item['degree_name']: ", item['degree_name'])

            if "phd" in item['degree_name'].lower() or "md" in item['degree_name'].lower():
                item['teach_type'] = "phd"
                if "research" in item['degree_name'].lower():
                    item['teach_type'] += " " + "research"
                item['degree_type'] = 3
            elif "research" in item['degree_name'].lower():
                item['teach_type'] = "research"
                item['degree_type'] = 3
            else:
                item['teach_type'] = "taught"
                item['degree_type'] = 2
            # print("item['degree_type']: ", item['degree_type'])
            # print("item['teach_type']: ", item['teach_type'])

            # duration
            duration = response.xpath("//th[@scope='row'][contains(text(),'Programme length')]/following-sibling::td[1]//text()").extract()
            clear_space(duration)
            # print("duration: ", duration)
            item['teach_time'] = getTeachTime(''.join(duration))
            # print("item['teach_time']: ", item['teach_time'])

            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[1]
            # print("item['duration']: ", item['duration'])
            # print("item['duration_per']: ", item['duration_per'])

            # location
            location = response.xpath("//th[@scope='row'][contains(text(),'Location of programme')]/following-sibling::td[1]//text()").extract()
            # print("location = ", location)
            item['location'] = clear_space_str(''.join(location))
            # print("item['location']: ", item['location'])

            # startdate
            startdate = response.xpath("//th[@scope='row'][contains(text(),'Start date')]/following-sibling::td[1]//text()").extract()
            clear_space(startdate)
            print("startdate = ", startdate)
            if len(startdate) > 0:
                # item['start_date'] = startdate[-1].strip()
                # print("item['start_date']: ", item['start_date'])
                item['start_date'] = getStartDate(''.join(startdate[-1]))
            print("item['start_date'] = ", item['start_date'])

            # deadline
            deadline = response.xpath("//div[@id='apply']/div[@class='apply-deadline']/p[1]//text()").extract()
            # print("deadline = ", deadline)
            item['deadline'] = getStartDate(''.join(deadline))
            # print("item['deadline']: ", item['deadline'])

            # department
            department = response.xpath("//div[@id='contact']/p[@class='pg-contact-address']/text()").extract()
            clear_space(department)
            # print("department1 = ", department)
            for d in department:
                if "School" in d or "Faculty" in d:
                    item['department'] = d
            # print("item['department']: ", item['department'])
            if item['department'] == "":
                allcontent = response.xpath("//main[@class='content']//text()").extract()
                clear_space(allcontent)
                department_re = re.findall(r"School\sof.{1,30}", ''.join(allcontent), re.I)
                # print("department_re: ", department_re)
                if len(department_re) > 0:
                    item['department'] = department_re[0].strip()
            # print("item['department']1: ", item['department'])

            # overview  //div[@id='programme-overview']//text()
            overview = response.xpath("//div[@id='programme-overview']|//div[@id='pgr-overview']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en']: ", item['overview_en'])

            # tuitionFee   //div[@id='fees']
            tuitionFee = response.xpath("//dt[contains(text(),'Overseas: full-time')]/following-sibling::dd[1]//text()").extract()
            clear_space(tuitionFee)
            print("tuitionFee = ", tuitionFee)
            if len(tuitionFee) > 0:
                item['tuition_fee_pre'] = "£"
                item['tuition_fee'] = int(''.join(tuitionFee[0]).replace("£", "").replace(",", "").strip())

            if item['tuition_fee'] is None:
                tuitionFee1 = response.xpath(
                    "//dl//dt[contains(text(),'Overseas:')]/following-sibling::dd[1]//text()").extract()
                clear_space(tuitionFee1)
                print("tuitionFee1 = ", tuitionFee1)
                if len(tuitionFee1) > 0:
                    item['tuition_fee_pre'] = "£"
                    item['tuition_fee'] = getTuition_fee(''.join(tuitionFee1))
                if item['tuition_fee'] == 0:
                    item['tuition_fee_pre'] = ""
                    item['tuition_fee'] = None
            if item['tuition_fee'] is None:
                print("tuition_fee 为空")
            print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])
            print("item['tuition_fee']: ", item['tuition_fee'])

            # modules   //div[@id='programme-structure']
            modules = response.xpath("//div[@id='programme-structure']|//div[@id='pgr-research-groups']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            print("item['modules_en']: ", item['modules_en'])

            # 学术要求本科特殊专业要求、IELTS
            entryRequirements = response.xpath("//div[@id='entry-requirements']//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(entryRequirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts = response.xpath("//*[contains(text(),'Profile')]//text()|//div[contains(text(),'IELTS')]//text()").extract()
            item['ielts_desc'] = clear_lianxu_space(ielts)
            # print("item['ielts_desc']: ", item['ielts_desc'])

            if item['ielts_desc'] == "Profile A":
                item['ielts'] = 7.5
                item['ielts_l'] = 7.0
                item['ielts_s'] = 7.0
                item['ielts_r'] = 7.0
                item['ielts_w'] = 7.0
                item['toefl'] = 109
                item['toefl_l'] = 25
                item['toefl_r'] = 25
                item['toefl_s'] = 25
                item['toefl_w'] = 29
            elif item['ielts_desc'] == "Profile B":
                item['ielts'] = 7.0
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.5
                item['ielts_w'] = 6.5
                item['toefl'] = 100
                item['toefl_l'] = 24
                item['toefl_r'] = 24
                item['toefl_s'] = 24
                item['toefl_w'] = 24
            elif item['ielts_desc'] == "Profile C":
                item['ielts'] = 6.5
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.5
                item['ielts_w'] = 6.5
                item['toefl'] = 92
                item['toefl_l'] = 23
                item['toefl_r'] = 23
                item['toefl_s'] = 23
                item['toefl_w'] = 24
            elif item['ielts_desc'] == "Profile D":
                item['ielts'] = 6.5
                item['ielts_l'] = 6.0
                item['ielts_s'] = 6.0
                item['ielts_r'] = 7.0
                item['ielts_w'] = 7.0
                item['toefl'] = 92
                item['toefl_l'] = 21
                item['toefl_r'] = 21
                item['toefl_s'] = 21
                item['toefl_w'] = 27
            elif item['ielts_desc'] == "Profile E":
                item['ielts'] = 6.5
                item['ielts_l'] = 6.0
                item['ielts_s'] = 6.0
                item['ielts_r'] = 6.0
                item['ielts_w'] = 6.0
                item['toefl'] = 90
                item['toefl_l'] = 20
                item['toefl_r'] = 20
                item['toefl_s'] = 20
                item['toefl_w'] = 20
            elif item['ielts_desc'] == "Profile F":
                item['ielts'] = 6.0
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.0
                item['ielts_w'] = 6.0
                item['toefl'] = 86
                item['toefl_l'] = 20
                item['toefl_r'] = 20
                item['toefl_s'] = 20
                item['toefl_w'] = 23
            elif "Profile" not in item['ielts_desc']:
                ieltsDict = get_ielts(item['ielts_desc'])
                item['ielts'] = ieltsDict.get("IELTS")
                item['ielts_l'] = ieltsDict.get("IELTS_L")
                item['ielts_s'] = ieltsDict.get("IELTS_S")
                item['ielts_r'] = ieltsDict.get("IELTS_R")
                item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #       item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            # 就业    //div[@id='careers']
            career = response.xpath("//div[@id='careers']").extract()
            # print("department = ", department)
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            require_chinese_en = """<h2 id="pgentryreqs">Entry requirements for postgraduate programmes</h2>
<p>You should&nbsp;<a href="/pg-howtoapply/">apply online</a>&nbsp;for all our postgraduate programmes.</p>
<p>To be considered for admission to postgraduate study at the University of Bristol, the minimum requirement for entry is an undergraduate (Bachelor&rsquo;s) degree that is equivalent to a UK Upper Second Class degree (also known as a 2:1). Please refer to the <a href="http://www.bristol.ac.uk/study/postgraduate/admissions-statements/%20%20%20" target="_blank">Postgraduate Admissions Statements</a> for each programme for individual entry requirements.</p>
<ul>
<li>Applicants who hold a 4-year Bachelor's (Honours) degree from a prestigious university with a minimum of 80% will be considered for admission to a Master's degree.</li>
<li>Applicants who hold a good Master's degree from a prestigious university will be considered for admission to PhD study.</li>
<li>Applicants will be required to meet the English language requirements for the programme. The profile level requirements can be found on the&nbsp;<a href="http://www.bristol.ac.uk/study/language-requirements/" target="_blank">English language requirements for study</a>&nbsp;page.</li>
</ul>"""
            item["require_chinese_en"] = remove_class(require_chinese_en)
            # print("item['require_chinese_en']: ", item['require_chinese_en'])

            # http://www.bristol.ac.uk/study/postgraduate/apply/
            item['apply_proces_en'] = remove_class(clear_lianxu_space(["""<p>We offer an online application system for all of our programmes, except the Postgraduate Certificate in Education for which you should <a href="https://www.ucas.com/ucas/teacher-training/ucas-teacher-training-apply-and-track">apply through UCAS</a>.</p>
<p>You can use our online admissions system to:</p>
<ul>
<li>submit all your application details securely online and view your completed application form;</li>
<li>upload supporting documents;</li>
<li>request references electronically;</li>
<li>track the progress of your application;</li>
<li>receive a decision on your application online;</li>
<li>update your contact details (it is important you tell us if you change your home address or email);</li>
<li>receive useful information about the University and your application.</li>
</ul>
<p>If you are unable to make an online application, please contact the Enquiries team on <a href="mailto:[email protected]">[email protected]</a>.</p>"""]))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            apply_documents_en = response.xpath("//h3[contains(text(),'English language requirements')]/preceding-sibling::*[position()<last()]").extract()
            item["apply_documents_en"] = remove_class(clear_lianxu_space(apply_documents_en))
            print("item['apply_documents_en']: ", item['apply_documents_en'])
            yield item
        except Exception as e:
            print("异常：", str(e))
            print("报错链接：", response.url)
            with open("scrapySchool_England/error/" + item['university'] + str(item['degree_type']) + ".txt", 'a+', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")

Пример #21

Показать файл

Файл: UniversityOfStrathclyde_R.py Проект: histudent/python_spider

    def parse_data(self, response):
        """Build and yield one item for a University of Strathclyde research page.

        Fields are read from ``response`` with the XPath queries below and
        stored on an item created by ``get_item1(ScrapyschoolEnglandItem1)``.
        If extraction raises, the error text and page URL are appended to
        ``<university><degree_type>.txt`` and printed instead of propagating.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        item["university"] = "University of Strathclyde"
        item['url'] = response.url
        # Teaching mode
        item['teach_type'] = 'phd'
        # Degree type
        item['degree_type'] = 3
        item['location'] = "16 Richmond Street, Glasgow, G1 1XQ"
        print("===========================")
        print(response.url)
        try:
            # Degree name
            degree_type = response.xpath("//main[@id='content']/section[@class='PGtPage']/header[@class='page-summary has-img']/div[@class='wrap']/h1/span/text()").extract()
            item['degree_name'] = ''.join(degree_type).strip()
            print("item['degree_name'] = ", item['degree_name'])

            if "PhD" in item['degree_name']:
                item['teach_type'] = 'phd'

            # Programme name
            programme = response.xpath(
                "//main[@id='content']/section[@class='PGtPage']/header[@class='page-summary has-img']/div[@class='wrap']/h1/text()").extract()
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en'] = ", item['programme_en'])

            # Department is guessed from keywords in the programme name.
            if "Engineering" in item['programme_en']:
                item['department'] = "Faculty of Engineering"
            elif "Science" in item['programme_en']:
                item['department'] = "Faculty of Science"
            elif "Business" in item['programme_en'] or "Finance" in item['programme_en'] or "Marketing" in item['programme_en']:
                item['department'] = "Strathclyde Business School"
            print("item['department'] = ", item['department'])

            # Study mode and duration
            durationTeachtime = response.xpath("//b[contains(text(),'Study mode and duration')]/../text()").extract()
            clear_space(durationTeachtime)
            durationTeachtimeStr = ''.join(durationTeachtime)

            item['teach_time'] = getTeachTime(durationTeachtimeStr)
            duration_list = getIntDuration(durationTeachtimeStr)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]

            # Start date
            start_date = response.xpath("//b[contains(text(),'Start date')]/../text()").extract()
            start_date_str = ''.join(start_date).replace(":", "")
            print("start_date_str = ", start_date_str)
            item['start_date'] = getStartDate(start_date_str)
            # When the parsed value carries no year, prefix one: values that
            # compare greater than "06" as strings get 2018, the rest 2019.
            parsed_start = item['start_date']
            if parsed_start != "" and "2018" not in parsed_start and "2019" not in parsed_start:
                year_prefix = "2018-" if parsed_start > "06" else "2019-"
                item['start_date'] = year_prefix + parsed_start

            # Application deadline: join the deadline nodes, not the start-date ones.
            deadline = response.xpath("//b[contains(text(),'Application deadline')]/../text()").extract()
            deadline = ''.join(deadline).replace(":", "").strip()
            print("deadline = ", deadline)
            item['deadline'] = getStartDate(deadline)
            print("item['deadline'] = ", item['deadline'])

            # Overview
            overview = response.xpath("//article[@id='research-opportunities']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            print("item['overview_en'] = ", item['overview_en'])

            # Modules: everything before the "Learning & teaching" heading.
            modules = response.xpath("//h3[contains(text(),'Learning & teaching')]/preceding-sibling::*").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            print("item['modules_en'] = ", item['modules_en'])

            # Assessment: siblings following the element just before that heading.
            assessment_en = response.xpath("//h3[contains(text(),'Learning & teaching')]/preceding-sibling::*[1]/following-sibling::*").extract()
            item["assessment_en"] = remove_class(clear_lianxu_space(assessment_en))

            # Academic entry requirements
            rntry_requirements = response.xpath("//article[@id='entry-requirements']//text()").extract()
            item["rntry_requirements"] = clear_lianxu_space(rntry_requirements)

            # Application process
            apply_proces_en = response.xpath("//article[@id='how-can-i-apply']").extract()
            item['apply_proces_en'] = remove_class(clear_lianxu_space(apply_proces_en))
            print("item['apply_proces_en'] = ", item['apply_proces_en'])

            # IELTS description is taken from the application section text.
            apply = response.xpath("//article[@id='how-can-i-apply']//text()").extract()
            clear_space(apply)
            ielts_re = re.findall(r"IELTS.{1,80}", ''.join(apply))
            item["ielts_desc"] = ''.join(ielts_re)
            print("item['ielts_desc'] = ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            # Copy each band from the parsed dict (upper-case keys) and trim
            # stray dots/whitespace from values that are present.
            for band_key in ('ielts', 'ielts_l', 'ielts_s', 'ielts_r', 'ielts_w'):
                band_value = ielts_dict.get(band_key.upper())
                if band_value is not None:
                    band_value = band_value.strip('.').strip()
                item[band_key] = band_value
            print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                  %(item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            # Tuition fee: first "£..." amount after the "International" heading.
            tuition_fee = response.xpath("//html//article[@id='fees-and-funding']/*[contains(text(),'International')]/following-sibling::*[1]//text()").extract()
            tuition_fee_re = re.findall(r"£[\d,]+", ''.join(tuition_fee))
            if len(tuition_fee_re) > 0:
                item['tuition_fee'] = ''.join(tuition_fee_re[0]).replace("£", "").replace(",", "")
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee'] = ", item['tuition_fee'])

            # Careers (taken from the support-and-development article)
            career = response.xpath("//article[@id='support-and-development']").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            print("item['career_en'] = ", item['career_en'])

            yield item
        except Exception as e:
            # Append the failure to a per-university/degree-type log file.
            with open(item['university'] + str(item['degree_type']) + ".txt", 'a+', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #22

Показать файл

Файл: UniversityOfSalford_R.py Проект: histudent/python_spider

    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "https://www.salford.ac.uk/"
        item['university'] = "University of Salford"
        item['url'] = response.url
        item['teach_type'] = 'phd'
        # 学位类型
        item['degree_type'] = 3
        item['location'] = 'The Crescent, Salford, M5 4WT, UK'
        print("===========================")
        print(response.url)
        try:
            # 专业、学位类型
            programme = response.xpath(
                "//div[@id='content']/div[@class='col-md-12']/div[@class='course-title']/div[@class='row']/div[@class='col-sm-8 col-md-8']/h1//text()"
            ).extract()
            item['programme_en'] = ''.join(programme)
            print("item['programme_en']: ", item['programme_en'])

            degree_type = response.xpath(
                "//div[@id='content']/div[@class='col-md-12']/div[@class='course-title']/div[@class='row']/div[@class='col-sm-8 col-md-8']/h2//text()"
            ).extract()
            item['degree_name'] = ''.join(degree_type)
            print("item['degree_name']: ", item['degree_name'])

            # //div[@id='content']/div[@class='col-md-12']/div[@class='course-title']/div[@class='row']/div[@class='col-sm-8 col-md-8']/p
            department = response.xpath(
                "//strong[contains(text(), 'School -')]/../text()").extract()
            item['department'] = ''.join(department).strip()
            print("item['department']: ", item['department'])

            start_date = response.xpath(
                "//strong[contains(text(), 'Start Date(s):')]/../text()"
            ).extract()
            clear_space(start_date)
            # print("start_date: ", start_date)
            item['start_date'] = getStartDate(''.join(start_date))
            # print("item['start_date']: ", item['start_date'])

            duration = response.xpath(
                "//strong[contains(text(), 'Duration')]/../following-sibling::*[position()<3]//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration_str = ''.join(duration)

            item['teach_time'] = getTeachTime(duration_str)
            duration_list = getIntDuration(duration_str)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            # print("item['teach_time'] = ", item['teach_time'])
            # print("item['duration'] = ", item['duration'])
            # print("item['duration_per'] = ", item['duration_per'])

            if item['department'] == "School of Environment & Life Sciences" or item[
                    'department'] == "School of Computing, Science & Engineering" or item[
                        'department'] == "School of the Built Environment" or item[
                            'department'] == "School of Health Sciences":
                item['tuition_fee'] = 13680
                item['tuition_fee_pre'] = "£"
            elif item['department'] == "School of Arts & Media":
                item['tuition_fee'] = 12490
                item['tuition_fee_pre'] = "£"
            elif item['department'] == "Salford Business School":
                item['tuition_fee'] = 12990
                item['tuition_fee_pre'] = "£"
            # print("item['tuition_fee']: ", item['tuition_fee'])
            # print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            # //div[@id='content']/div[@class='col-md-12']/div[@class='row']/div[1]
            overview = response.xpath(
                "//div[@id='content']/div[@class='col-md-12']/div[@class='row']/div[1] | //div[@id='content']/div[@class='row']/div[1]"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en']: ", item['overview_en'])

            # //section[@id='about']/div[@id='content']
            # modules_en = response.xpath("//div[@id='courseaccordion']").extract()
            # if len(modules_en) == 0:
            #     # print("********")
            #     modules_en = response.xpath("//h2[contains(text(),'Course Details')]/following-sibling::*").extract()
            # item['modules_en'] = remove_class(clear_lianxu_space(modules_en)) # .replace("&nbsp;", "")
            # item['modules_en'] = item['modules_en'].encode('utf-8').decode("unicode-escape").replace("Â ", "")
            # print("item['modules_en']: ", item['modules_en'])

            # //section[@id='requirements']/div
            entry_requirements = response.xpath(
                "//section[@id='requirements']/div//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            # 申请材料
            apply_documents_en = response.xpath(
                "//h3[contains(text(),'Applicant Profile')]/preceding-sibling::*[1]/following-sibling::*[position()<5]"
            ).extract()
            item['apply_documents_en'] = remove_class(
                clear_lianxu_space(apply_documents_en)
            ).replace(
                "<h3>International Students - Academic Technology Approval Scheme (ATAS)</h3>",
                "").strip()
            # print("item['apply_documents_en']: ", item['apply_documents_en'])

            # //h3[contains(text(),'English Language Requirements')]/following-sibling::*[1]
            ielts_desc = response.xpath(
                "//*[contains(text(),'IELTS')]//text()").extract()
            clear_space(ielts_desc)
            item['ielts_desc'] = ''.join(ielts_desc).replace(
                "Suitable For", "").strip()
            # print("item['ielts_desc']: ",item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #     item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            # //section[@id='teaching']/div[@class='container main']/div[@class='col-md-12']/div[@id='teaching_0a19']
            assessment_en = response.xpath(
                "//h3[contains(text(),'Assessment Links')]/preceding-sibling::*[1]/following-sibling::*"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            # print("item['assessment_en']: ", item['assessment_en'])

            # //section[@id='employability']/div[@class='container main']/div[@class='col-md-12']/div[@id='employ_0a19']
            career = response.xpath(
                "//section[@id='employability']/div").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<div id="content_div_43747">
<h1>Applying for a research degree</h1><p>To apply for your postgraduate research place, you will need to complete our online application form. You will need to have at hand your supporting documents ready to upload when you start the online application. We have four entry points: October, January, April and July. From September 2018, this will change to three entry points, in January, May and September.&nbsp;For the Salford DBA, there are two entry points: April and September.</p><p>Please submit your application with a minimum of six weeks before the date you are aiming to register.</p><ul><li>Degree certificates</li><li>Transcripts</li><li><a href="http://www.salford.ac.uk/__data/assets/pdf_file/0018/104841/18-02-23-Vouch-List-Equivalent-qualifications-to-English-GCSE-Grade-C.pdf">English language qualifications</a></li><li><a href="http://www.advice.salford.ac.uk/page/visa">Passport details (required for International applicants)</a></li><li><a href="http://www.salford.ac.uk/__data/assets/pdf_file/0003/631686/Writing-a-Research-Proposal-Guidance.pdf" title="How to write a research proposal" target="_blank">Research proposal</a></li></ul><p>If you are applying for a PhD by published works, please go <a href="https://shop.salford.ac.uk/product-catalogue/university-goods-and-services/phd-by-published-works/phd-by-published-works-application-fee">to the online shop to make your payment</a> before completing your application.</p><p>For help preparing a research proposal for the PhD in Business, Management and Law, download our <a href="http://www.salford.ac.uk/__data/assets/pdf_file/0009/1572147/HowtoWriteaResearchProposal2018.pdf" title="PhD Research proposal guidance" target="_blank">Research Proposal Guidance</a>. 
For the Salford DBA, download our <a href="http://www.salford.ac.uk/__data/assets/pdf_file/0008/1559996/Guidance-on-Writing-a-DBA-Research-Proposal-PDF.pdf" title="Guidance on Writing a DBA Research Proposal" target="_blank">Guidance on Writing a DBA Research Proposal</a>.</p><h2>English Language Requirements</h2><p>If you have not yet taken an English Language test please note that availability of these and the time taken to receive certificates of results can vary depending on the time of year. For further information and to check timescales and availability please visit:</p><p><strong>IELTS</strong> - <a href="http://www.ielts.org/">http://www.ielts.org/</a><br /><strong>Pearson Test of English Academic</strong> - <a href="http://www.pearsonpte.com/testme">www.pearsonpte.com/testme</a></p><p>For details of other English Language tests accepted for the UKVI, please visit:<br /><a href="http://www.ukba.homeoffice.gov.uk/sitecontent/applicationforms/new-approved-english-tests.pdf">http://www.ukba.homeoffice.gov.uk/sitecontent/applicationforms/new-approved-english-tests.pdf</a></p><h2>Guide to submitting your application</h2><ol><li>When you first enter the online application you will be asked to create an account</li><li>You will then receive an email with your login PIN and password</li><li>You can re-enter and complete your application at times convenient to you</li><li>Fill in application details &ndash; using the guidance within the form</li><li>Upload your supporting documents</li><li>Once you have submitted your application you can print a copy of your application. 
However you cannot re-enter and make any changes at this stage</li></ol><h3>What happens next?</h3><ul><li>When you submit your online application you will receive and acknowledgement by email</li><li>You&rsquo;ll be notified of the outcome of your application in writing.</li><li>If you have any questions about the progress of your application please <a href="mailto:[email protected]">email admissions</a></li></ul><h3>Relevant work experience</h3><p><strong>We try to make applying to Salford as flexible and straightforward as possible.</strong></p><p>We&rsquo;re not just interested in exams you&rsquo;ve passed and certificates you&rsquo;ve collected. If you&rsquo;ve gained enough relevant work experience &ndash; paid or voluntary &ndash; we&rsquo;ll take that into account through our Accreditation of Prior Learning (APL) and Accreditation of PriorExperiential Learning (APEL) schemes.</p><h2>How to prepare a research proposal</h2><p>The research proposal is a crucial part of your application.</p><p>You should discuss your proposal with the <strong>Postgraduate Research Admissions Contact</strong> of the School to which you are applying, to make sure you understand what is expected in your subject area.</p><p>For help preparing a research proposal for the PhD in Business, Management and Law, download our <a href="http://www.salford.ac.uk/__data/assets/pdf_file/0009/1572147/HowtoWriteaResearchProposal2018.pdf" title="PhD Research proposal guidance" target="_blank">Research Proposal Guidance</a>. 
For the Salford DBA, download our <a href="http://www.salford.ac.uk/__data/assets/pdf_file/0008/1559996/Guidance-on-Writing-a-DBA-Research-Proposal-PDF.pdf" title="Guidance on Writing a DBA Research Proposal" target="_blank">Guidance on Writing a DBA Research Proposal</a>.&nbsp;</p><p>When submitting an application, make sure that the specialist area you wish to study is covered by a member of staff at the University:</p><ul><li>Check individual staff entries on the <a href="http://www.salford.ac.uk/research/research-centres">Research Centre sites</a> that relate to your area</li><li>Explore <a href="http://www.seek.salford.ac.uk/">staff profiles</a> and check current research interests</li><li>Take note of the relevant Research Administrator listed below you will need it when completing your online application</li></ul>
</div>
"""
                ]))
            print("item['apply_proces_en']: ", item['apply_proces_en'])

            item['require_chinese_en'] = remove_class(
                clear_lianxu_space([
                    """<p><strong>Postgraduate</strong></p><p>(4 year) Bachelor degrees with a GPA 2.7/4.0 or 70% from a National University; or from a Project 211 University with a GPA 2.6/4.0 or 65%; or from a Private University with GPA 2.75/4.0 or 75%.</p>"""
                ]))
            print("item['require_chinese_en']: ", item['require_chinese_en'])
            yield item
        except Exception as e:
            with open(item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #23

Показать файл

    def parse_data(self, response):
        """Build and yield the item for one UCLan taught-postgraduate page.

        The constant fields (university, URL, ``teach_type`` 'taught',
        ``degree_type`` 2) are filled in first.  Everything read from
        ``response`` is extracted inside a single ``try`` block: if any step
        raises, the error text and the page URL are appended to
        ``<university><degree_type>.txt``, both are printed, and no item is
        yielded for that page.

        The study-mode check tolerates pages without a "Full-time:" label (or
        with nothing after it), so such pages no longer fail with
        ValueError/IndexError and lose the whole item.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "https://www.uclan.ac.uk/"
        item['university'] = "University of Central Lancashire"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # Degree type
        item['degree_type'] = 2
        # item['location'] = 'Hope Park, Liverpool, L16 9JD'
        print("===========================")
        print(response.url)

        def clean_band(raw):
            # Strip the whitespace, trailing dot and comma that the band
            # regex below may capture along with the number.
            return raw.strip().strip('.').replace(',', '').strip()

        try:
            # Programme name; fall back to the "marketing-version" layout
            # when the TopGraphic heading yields nothing.
            programme = response.xpath(
                "//div[@id='TopGraphic']/div[@class='twelvecol last']/h2/text()"
            ).extract()
            if not programme:
                programme = response.xpath(
                    "//div[@class='marketing-version']/div[@class='course-title']/h1/text()"
                ).extract()
            clear_space(programme)
            item['programme_en'] = ''.join(programme)
            print("item['programme_en']: ", item['programme_en'])

            # Degree name: the <span> inside the same heading, with the same
            # layout fallback.
            degree_type = response.xpath(
                "//div[@id='TopGraphic']/div[@class='twelvecol last']/h2/span/text()"
            ).extract()
            if not degree_type:
                degree_type = response.xpath(
                    "//div[@class='marketing-version']/div[@class='course-title']/h1/span/text()"
                ).extract()
            clear_space(degree_type)
            item['degree_name'] = ''.join(degree_type)
            print("item['degree_name']: ", item['degree_name'])

            department = response.xpath(
                "//div[@id='TopGraphic']/div[@class='twelvecol last']/h4//text()"
            ).extract()
            item['department'] = ''.join(department)

            # Duration text feeds both the study mode and the numeric
            # duration helpers.
            duration = response.xpath(
                "//h4[contains(text(), 'Duration:')]/..//text()").extract()
            clear_space(duration)
            print("duration: ", duration)
            duration_str = ''.join(duration)

            item['teach_time'] = getTeachTime(duration_str)
            duration_list = getIntDuration(duration_str)
            # Only a two-element result is stored as (duration, duration_per).
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            print("item['teach_time'] = ", item['teach_time'])

            teach_time = response.xpath(
                "//strong[contains(text(),'Full-time:')]/..//text()").extract(
                )
            clear_space(teach_time)
            print("teach_time: ", teach_time)
            item['other'] = ','.join(teach_time)

            # Text node that directly follows the "Full-time:" label:
            #   None -> the label is not on the page at all,
            #   ""   -> the label is the last node (nothing listed after it).
            # list.index() raises ValueError for a missing value, so the
            # lookup is guarded instead of being called unconditionally.
            full_time_value = None
            if "Full-time:" in teach_time:
                label_pos = teach_time.index("Full-time:")
                full_time_value = ''.join(
                    teach_time[label_pos + 1:label_pos + 2])

            # Part-time when no usable full-time option is listed or the
            # programme title says so; otherwise default an empty
            # getTeachTime() result to full-time.
            if (''.join(teach_time).strip() == "Full-time:"
                    or full_time_value in ("N/A", "")
                    or "part-time" in item['programme_en']):
                item['teach_time'] = "parttime"
            elif item['teach_time'] == "":
                item['teach_time'] = "fulltime"
            print("item['teach_time'] = ", item['teach_time'])

            location = response.xpath(
                "//h4[contains(text(), 'Campus')]/following-sibling::p[1]//text()"
            ).extract()
            item['location'] = ''.join(location)

            start_date = response.xpath(
                "//h4[contains(text(), 'Start Date:')]/following-sibling::p[1]//text()"
            ).extract()
            item['start_date'] = getStartDate(''.join(start_date))

            overview = response.xpath(
                "//div[@id='FullCourse']/div[@class='eightcol']/div[@class='sixcol last']"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))

            # Entry requirements: //div[@id='EntryReq']; the joined text is
            # reused further down to locate the IELTS sentence.
            entry_requirements = response.xpath(
                "//div[@id='EntryReq']//text()").extract()
            entry_requirements_str = ''.join(entry_requirements).strip()
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)

            # Modules: //div[@id='caag']
            modules = response.xpath("//div[@id='caag']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))

            # Assessment: parent of the "Learning Environment and
            # Assessment" heading.
            assessment_en = response.xpath(
                "//h3[contains(text(),'Learning Environment and Assessment')]/.."
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))

            # Careers: parent of the "Graduate Careers" or "Opportunities"
            # heading.
            career_en = response.xpath(
                "//h3[contains(text(),'Graduate Careers')]/..|//h3[contains(text(),'Opportunities')]/.."
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))

            # Application process: parent of //h3[@id='applynow']
            apply_proces_en = response.xpath(
                "//h3[@id='applynow']/..").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(apply_proces_en))

            # Hard-coded fees; the four schools listed below use the higher
            # figure.  Source page:
            # https://www.uclan.ac.uk/study_here/fees_and_finance/international_tuition_fees.php#international
            higher_fee_schools = (
                "School of Forensic and Applied Sciences",
                "School of Physical Sciences and Computing",
                "School of Pharmacy and Biomedical Sciences",
                "School of Engineering",
            )
            item['tuition_fee'] = '12950'
            if item['department'] in higher_fee_schools:
                item['tuition_fee'] = '13950'
            item['tuition_fee_pre'] = "£"

            # IELTS description: up to 50 characters before and 80 after
            # each "IELTS" mention in the entry requirements.
            ieltsList = re.findall(r'.{1,50}IELTS.{1,80}',
                                   entry_requirements_str)
            print("ieltslist: ", ieltsList)
            item['ielts_desc'] = ''.join(ieltsList)

            ielts_list = re.findall(
                r"[5-9]\.\d\s|[5-9]\.\d,|[5-9]\.\d\.|[5-9]\.\d$|[5-9]\s|[5-9]\.",
                item['ielts_desc'])
            # Which captured number feeds each field, keyed by how many
            # numbers were found; any other count leaves the fields untouched.
            # NOTE(review): with three numbers, listening and speaking reuse
            # the first (overall) value - kept as-is, confirm this is intended.
            band_slots = {
                1: (0, 0, 0, 0, 0),
                2: (0, 1, 1, 1, 1),
                3: (0, 0, 0, 1, 2),
            }
            slots = band_slots.get(len(ielts_list))
            if slots is not None:
                ielts_fields = ('ielts', 'ielts_l', 'ielts_s', 'ielts_r',
                                'ielts_w')
                for field, slot in zip(ielts_fields, slots):
                    item[field] = clean_band(ielts_list[slot])

            item[
                'require_chinese_en'] = "<p>4-year Bachelors degree with grades of 70% or above</p>"
            yield item
        except Exception as e:
            # Append the failure and the page URL to the per-university log.
            with open(item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #24

Показать файл

Файл: SwanseaUniversity_R.py Проект: histudent/python_spider

    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        # item['country'] = "England"
        # item["website"] = "http://www.swansea.ac.uk/"
        item['university'] = "Swansea University"
        item['url'] = response.url
        item['teach_type'] = 'phd'
        item['degree_type'] = 3
        item['location'] = "Singleton Park, Swansea, SA2 8PP, Wales, UK"
        print("===============================")
        print(response.url)
        try:
            # 专业、学位类型
            courseDegreeaward = response.xpath(
                "//h1[@class='content-header-heading']//text()").extract()
            courseDegreeawardStr = ''.join(courseDegreeaward)
            if len(courseDegreeawardStr) != 0:
                d = re.findall(
                    r"^(\w+\s/\w+\s/\w+)|^(\w+/\w+/\w+)|^(\w+/\s\w+)|^(\w+)",
                    courseDegreeawardStr)
                if len(d) != 0:
                    degree_type = ''.join(list(d)[0])
                    # print(degree_type)
                    item['degree_name'] = degree_type
                    programme = courseDegreeawardStr.split(degree_type)
                    item['programme_en'] = ''.join(programme).strip()
            print("item['degree_name'] = ", item['degree_name'])
            print("item['programme_en'] = ", item['programme_en'])

            # print("courseDegreeawardStr = ", courseDegreeawardStr)
            departmentDict = {
                "Ancient Egyptian Culture":
                "College of Arts and Humanities",
                "Ancient History and Classical Culture":
                "College of Arts and Humanities",
                "Ancient Narrative Literature":
                "College of Arts and Humanities",
                "Classics":
                "College of Arts and Humanities",
                "Chinese-English Translation & Language Teaching":
                "College of Arts and Humanities",
                "Teaching English to Speakers of Other Languages  (TESOL)":
                "College of Arts and Humanities",
                "Creative Writing":
                "College of Arts and Humanities",
                "English Literature":
                "College of Arts and Humanities",
                "Gender and Culture":
                "College of Arts and Humanities",
                "Welsh Writing in English":
                "College of Arts and Humanities",
                "Early Modern History":
                "College of Arts and Humanities",
                "History":
                "College of Arts and Humanities",
                "Medieval Studies":
                "College of Arts and Humanities",
                "Modern History":
                "College of Arts and Humanities",
                "Public History and Heritage":
                "College of Arts and Humanities",
                "Public History and Heritage (extended)":
                "College of Arts and Humanities",
                "Professional Translation":
                "College of Arts and Humanities",
                "Professional Translation (Extended)":
                "College of Arts and Humanities",
                "Translation and Interpreting":
                "College of Arts and Humanities",
                "Translation and Interpreting (Extended)":
                "College of Arts and Humanities",
                "Postgraduate Certificate in Translation Technology":
                "College of Arts and Humanities",
                "Communication, Media Practice and PR":
                "College of Arts and Humanities",
                "International Journalism":
                "College of Arts and Humanities",
                "Digital Media":
                "College of Arts and Humanities",
                "Erasmus Mundus Journalism, Media and Globalisation":
                "College of Arts and Humanities",
                "Development and Human Rights":
                "College of Arts and Humanities",
                "Gender and Culture":
                "College of Arts and Humanities",
                "International Relations":
                "College of Arts and Humanities",
                "International Security & Development":
                "College of Arts and Humanities",
                "Politics":
                "College of Arts and Humanities",
                "Public Policy":
                "College of Arts and Humanities",
                "War and Society":
                "College of Arts and Humanities",
                "BEng Aerospace Engineering":
                "College of Engineering",
                "MEng Aerospace Engineering":
                "College of Engineering",
                "BEng Aerospace Engineering (with a Year in Industry)":
                "College of Engineering",
                "MEng Aerospace Engineering (with a Year in Industry)":
                "College of Engineering",
                "BEng Aerospace Engineering (with a Year Abroad)":
                "College of Engineering",
                "MEng Aerospace Engineering (with a Year Abroad)":
                "College of Engineering",
                "Aerospace Engineering Foundation Year":
                "College of Engineering",
                "MSc Aerospace Engineering":
                "College of Engineering",
                "MSc by Research in Aerospace Engineering":
                "College of Engineering",
                "MSc Engineering Leadership & Management":
                "College of Engineering",
                "Engineering Doctorate (EngD)":
                "College of Engineering",
                "PhD or MPhil Aerospace Engineering":
                "College of Engineering",
                "BEng Chemical Engineering":
                "College of Engineering",
                "MEng Chemical Engineering":
                "College of Engineering",
                "BEng Chemical Engineering (with a Year in Industry)":
                "College of Engineering",
                "MEng Chemical Engineering (with a Year in Industry)":
                "College of Engineering",
                "BEng Chemical Engineering (with a Year Abroad)":
                "College of Engineering",
                "MEng Chemical Engineering (with a Year Abroad)":
                "College of Engineering",
                "Chemical Engineering Foundation Year":
                "College of Engineering",
                "MSc Chemical Engineering":
                "College of Engineering",
                "MSc Engineering Leadership & Management":
                "College of Engineering",
                "MSc by Research in Chemical Engineering":
                "College of Engineering",
                "MSc by Research in Bio-process Engineering":
                "College of Engineering",
                "MSc by Research in Desalination and Water Re-use":
                "College of Engineering",
                "MSc by Research in Fuel Technology":
                "College of Engineering",
                "MSc by Research in Membrane Technology":
                "College of Engineering",
                "PhD or MPhil Chemical Engineering":
                "College of Engineering",
                "Engineering Doctorate (EngD)":
                "College of Engineering",
                "BEng Civil Engineering":
                "College of Engineering",
                "MEng Civil Engineering":
                "College of Engineering",
                "BEng Civil Engineering (with a Year in Industry)":
                "College of Engineering",
                "MEng Civil Engineering (with a Year in Industry)":
                "College of Engineering",
                "BEng Civil Engineering (with a Year Abroad)":
                "College of Engineering",
                "MEng Civil Engineering (with a Year Abroad)":
                "College of Engineering",
                "MSc Civil Engineering":
                "College of Engineering",
                "Erasmus Mundus MSc in Computational Mechanics":
                "College of Engineering",
                "MSc Computer Modelling and Finite Elements in Engineering Mechanics":
                "College of Engineering",
                "MSc Engineering Leadership & Management":
                "College of Engineering",
                "MRes Computer Modelling in Engineering":
                "College of Engineering",
                "MSc by Research in Civil Engineering":
                "College of Engineering",
                "PhD Computational Mechanics":
                "College of Engineering",
                "PhD or MPhil Civil Engineering":
                "College of Engineering",
                "Engineering Doctorate (EngD)":
                "College of Engineering",
                "BEng Electronic and Electrical Engineering":
                "College of Engineering",
                "MEng Electronic and Electrical Engineering":
                "College of Engineering",
                "BEng Electronic and Electrical Engineering (with a year in Europe, N. America, Australia or industry)":
                "College of Engineering",
                "MEng Electronic and Electrical Engineering (with a year in Europe, N. America, Australia or industry)":
                "College of Engineering",
                "Electronic and Electrical Engineering Foundation Year":
                "College of Engineering",
                "MSc Communications Engineering":
                "College of Engineering",
                "MSc Electronic and Electrical Engineering":
                "College of Engineering",
                "MSc Power Engineering and Sustainable Energy":
                "College of Engineering",
                "MSc Nanoscience to Nanotechnology":
                "College of Engineering",
                "MSc by Research in Electronic and Electrical Engineering":
                "College of Engineering",
                "MSc Engineering Leadership & Management":
                "College of Engineering",
                "PhD or MPhil Electronic and Electrical Engineering":
                "College of Engineering",
                "Erasmus Mundus MSc in Computational Mechanics":
                "College of Engineering",
                "MSc Computer Modelling and Finite Elements in Engineering Mechanics":
                "College of Engineering",
                "MSc Engineering Leadership & Management":
                "College of Engineering",
                "MRes Computer Modelling in Engineering":
                "College of Engineering",
                "PhD Computational Mechanics":
                "College of Engineering",
                "PhD or MPhil Civil Engineering":
                "College of Engineering",
                "Engineering Doctorate (EngD)":
                "College of Engineering",
                "BEng Materials Science and Engineering":
                "College of Engineering",
                "MEng Materials Science and Engineering":
                "College of Engineering",
                "BEng Materials Science and Engineering (with a Year in Industry)":
                "College of Engineering",
                "MEng Materials Science and Engineering (with a Year in Industry)":
                "College of Engineering",
                "BEng Materials Science and Engineering (with a Year Abroad)":
                "College of Engineering",
                "MEng Materials Science and Engineering (with a Year Abroad)":
                "College of Engineering",
                "Materials Science and Engineering Foundation Year":
                "College of Engineering",
                "MSc Materials Engineering":
                "College of Engineering",
                "MSc Engineering Leadership & Management":
                "College of Engineering",
                "MSc by Research in Materials Engineering":
                "College of Engineering",
                "Engineering Doctorate (EngD)":
                "College of Engineering",
                "PhD or MPhil Materials Engineering":
                "College of Engineering",
                "BEng Mechanical Engineering":
                "College of Engineering",
                "MEng Mechanical Engineering":
                "College of Engineering",
                "BEng Mechanical Engineering (with a Year in Industry)":
                "College of Engineering",
                "MEng Mechanical Engineering (with a Year in Industry)":
                "College of Engineering",
                "BEng Mechanical Engineering (with a Year Abroad)":
                "College of Engineering",
                "MEng Mechanical Engineering (with a Year Abroad)":
                "College of Engineering",
                "Mechanical Engineering Foundation Year":
                "College of Engineering",
                "MSc Mechanical Engineering":
                "College of Engineering",
                "MSc Engineering Leadership & Management":
                "College of Engineering",
                "MSc by Research in Mechanical Engineering":
                "College of Engineering",
                "PhD or MPhil Mechanical Engineering":
                "College of Engineering",
                "Engineering Doctorate (EngD)":
                "College of Engineering",
                "MSc Nanoscience to Nanotechnology":
                "College of Engineering",
                "MSc Engineering Leadership & Management":
                "College of Engineering",
                "MSc by Research Nanotechnology":
                "College of Engineering",
                "PhD or MPhil Nanotechnology":
                "College of Engineering",
                "Engineering Doctorate (EngD)":
                "College of Engineering",
                "Zienkiewicz Centre for Computational Engineering (ZCCE)":
                "College of Engineering",
                "Materials Research Centre (MRC)":
                "College of Engineering",
                "Systems and Process Engineering Centre (SPEC)":
                "College of Engineering",
                "Applied Sports, Technology, Exercise and Medicine (A-STEM)":
                "College of Engineering",
                "MSc/PGCert/PGDip Gerontology and Ageing Studies":
                "College of Human and Health Sciences",
                "MSc International Gerontology and Ageing Studies":
                "College of Human and Health Sciences",
                "MA/PGDip/PGCert Childhood Studies":
                "College of Human and Health Sciences",
                "MA/PGDip/PGCert Developmental and Therapeutic Play":
                "College of Human and Health Sciences",
                "PGCert Enhanced Neonatal Care":
                "College of Human and Health Sciences",
                "MSc/PGDip/PGCert Child Public Health":
                "College of Human and Health Sciences",
                "MA/PGDip/PGCert Education for Health Professions":
                "College of Human and Health Sciences",
                "MSc/PGDip Advanced Critical Care Practice":
                "College of Human and Health Sciences",
                "MSc/PGDip Advanced Practice in Health Care":
                "College of Human and Health Sciences",
                "MSc/PGDip/PGCert Advanced Specialist Blood Transfusion Practice":
                "College of Human and Health Sciences",
                "PGCert Approved Mental Health Professional":
                "College of Human and Health Sciences",
                "PGCert Blood Component Transfusion":
                "College of Human and Health Sciences",
                "MSc/PgD/PgC Community and Primary Health Care Practice":
                "College of Human and Health Sciences",
                "MSc/PGDip/PGCert Enhanced Professional Practice":
                "College of Human and Health Sciences",
                "MSc/PGDip Enhanced Professional Midwifery Practice":
                "College of Human and Health Sciences",
                "MSc Long Term and Chronic Conditions Management":
                "College of Human and Health Sciences",
                "MA Medical Law and Ethics":
                "College of Human and Health Sciences",
                "PGCert Non-Medical Prescribing for Nurses and Midwives":
                "College of Human and Health Sciences",
                "PGCert Non-Medical Prescribing for Allied Health Professionals":
                "College of Human and Health Sciences",
                "PGCert Non-Medical Prescribing for Pharmacists":
                "College of Human and Health Sciences",
                "MSc Nursing Pre-Registration (Adult)":
                "College of Human and Health Sciences",
                "MSc Nursing Pre-Registration (Child)":
                "College of Human and Health Sciences",
                "MSc Nursing Pre-Registration (Mental Health)":
                "College of Human and Health Sciences",
                "MSc/PgD Public Health & Health Promotion":
                "College of Human and Health Sciences",
                "MSc Social Work":
                "College of Human and Health Sciences",
                "MSc Health Care Management":
                "College of Human and Health Sciences",
                "MSc Leadership, Management and Innovation in Health Care":
                "College of Human and Health Sciences",
                "MSc Abnormal and Clinical Psychology":
                "College of Human and Health Sciences",
                "MSc Cognitive Neuroscience":
                "College of Human and Health Sciences",
                "LLM in LegalTech":
                "Hillary Rodham Clinton School of Law",
                "LLM in Human Rights":
                "Hillary Rodham Clinton School of Law",
                "LLM Intellectual Property & Commercial Practice":
                "Hillary Rodham Clinton School of Law",
                "LLM in International Commercial Law":
                "Hillary Rodham Clinton School of Law",
                "LLM in International Commercial and Maritime Law":
                "Hillary Rodham Clinton School of Law",
                "LLM in International Maritime Law":
                "Hillary Rodham Clinton School of Law",
                "LLM in International Trade Law":
                "Hillary Rodham Clinton School of Law",
                "LLM in Legal Practice and Advanced Drafting":
                "Hillary Rodham Clinton School of Law",
                "LLM in Oil, Gas and Renewable Energy Law":
                "Hillary Rodham Clinton School of Law",
                "Law PhD/MPhil":
                "Hillary Rodham Clinton School of Law",
                "Graduate Diploma in Law":
                "Hillary Rodham Clinton School of Law",
                "Legal Practice Course":
                "Hillary Rodham Clinton School of Law",
                "LLM in Legal Practice and Advanced Drafting":
                "Hillary Rodham Clinton School of Law",
                "MSc Environmental Dynamics and Climate Change":
                "College of Science",
                "MSc Geographic Information and Climate Change":
                "College of Science",
                "MSc High Performance and Scientific Computing":
                "College of Science",
                "MSc by Research in Earth Observation":
                "College of Science",
                "MSc by Research in Environmental Dynamics":
                "College of Science",
                "MSc by Research in Glaciology":
                "College of Science",
                "MSc by Research in Global Environmental Modelling":
                "College of Science",
                "MSc by Research in Global Migration":
                "College of Science",
                "MSc by Research in Media Geographies":
                "College of Science",
                "MSc by Research in Social Theory and Space":
                "College of Science",
                "MSc by Research in Urban Studies":
                "College of Science",
                "PhD/MPhil Human Geography":
                "College of Science",
                "PhD/MPhil Physical Geography":
                "College of Science",
                "MSc Maths & Computing for Finance":
                "College of Science",
                "MSc Mathematics":
                "College of Science",
                "MRes Stochastic Processes: Theory and Application":
                "College of Science",
                "MSc by Research in Mathematics":
                "College of Science",
                "PhD/MPhil Mathematics":
                "College of Science",
                "MSc High Performance and Scientific Computing":
                "College of Science",
                "Antimatter Physics":
                "College of Science",
                "Cold Atoms and Quantum Optics":
                "College of Science",
                "Laser Physics":
                "College of Science",
                "Lattice Gauge Theory":
                "College of Science",
                "Nanotechnology":
                "College of Science",
                "Quantum Fields & Strings":
                "College of Science",
                "Theoretical Particle Physics":
                "College of Science",
                "PhD/MPhil Physics":
                "College of Science",
                "PhD / MSc by Research Chemistry":
                "College of Science",
                "MSc Computer Science":
                "College of Science",
                "MSc Advanced Computer Science":
                "College of Science",
                "MSc Advanced Software Technology":
                "College of Science",
                "MSc High Performance and Scientific Computing":
                "College of Science",
                "MSc Data Science":
                "College of Science",
                "MSc Computer Science: Informatique (Swansea route)":
                "College of Science",
                "MSc Computer Science: Informatique (Grenoble route)":
                "College of Science",
                "MSc by Research in Human Computer Interaction":
                "College of Science",
                "MSc by Research in Theoretical Computer Science":
                "College of Science",
                "MSc by Research in Visual and Interactive Computing":
                "College of Science",
                "MRes Computing and Future Interaction Technologies":
                "College of Science",
                "MRes Visual Computing":
                "College of Science",
                "MRes Logic and Computation":
                "College of Science",
                "PhD/MPhil/MSc by Research in Computer Science":
                "College of Science",
                "MSc Environmental Biology: Conservation and Resource Management":
                "College of Science",
                "MSc High Performance and Scientific Computing":
                "College of Science",
                "MRes Biosciences":
                "College of Science",
                "PhD/MPhil Biological Sciences":
                "College of Science",
                "MSc Accounting & Finance":
                "School of Management",
                "MSc Financial Management":
                "School of Management",
                "MSc Finance and Business Analytics":
                "School of Management",
                "MSc Finance":
                "School of Management",
                "MSc International Banking & Finance":
                "School of Management",
                "MSc Investment Management":
                "School of Management",
                "MSc Strategic Accounting":
                "School of Management",
                "Generalist MSc Management":
                "School of Management",
                "Marketing":
                "School of Management",
                "Finance ":
                "School of Management",
                "Human Resource Management":
                "School of Management",
                "Entrepreneurship ":
                "School of Management",
                "Operations & Supply Management":
                "School of Management",
                "International Management":
                "School of Management",
                "International Standards":
                "School of Management",
                "Business Analytics":
                "School of Management",
                "E-Business":
                "School of Management",
                "Tourism ":
                "School of Management",
                "MSc Economics":
                "School of Management",
                "MSc Economics & Finance":
                "School of Management",
                "Strategic Marketing":
                "School of Management",
                "MSc Management (Marketing)":
                "School of Management",
                "MSc Clinical Medicine":
                "Swansea University Medical School",
                "MSc Clinical Science (Medical Physics)":
                "Swansea University Medical School",
                "MSc Diabetes Practice (Distance Learning)":
                "Swansea University Medical School",
                "MSc Genomic Medicine":
                "Swansea University Medical School",
                "MSc Medical Radiation Physics":
                "Swansea University Medical School",
                "MSc Nanomedicine":
                "Swansea University Medical School",
                "PG Dip Physician Associate Studies":
                "Swansea University Medical School",
                "MSc Applied Analytical Science (LCMS)":
                "Swansea University Medical School",
                "MSc Autism and Related Conditions":
                "Swansea University Medical School",
                "MSc Health Data Science":
                "Swansea University Medical School",
                "MSc Health Informatics":
                "Swansea University Medical School",
                "MSc Leadership for the Health Professions (Distance Learning)":
                "Swansea University Medical School",
                "MRes Applied Analytical Science (LCMS)":
                "Swansea University Medical School",
                "MRes Health Informatics":
                "Swansea University Medical School",
                "MRes Research in Health Professions Education":
                "Swansea University Medical School",
                "MSc Research Methods in Psychology":
                "College of Human and Health Sciences",
                "MSc Social Research Methods":
                "College of Human and Health Sciences",
            }
            item['department'] = departmentDict.get(courseDegreeawardStr)
            if item['department'] == None:
                item['department'] = departmentDict.get(
                    courseDegreeawardStr.replace(" ", ""))
                if item['department'] == None:
                    item['department'] = departmentDict.get(
                        item['programme_en'])
            print("item['department'] = ", item['department'])

            # //ul[@style='width: 5000px;']/li[4]
            department = response.xpath(
                "//div[@class='breadCrumb module']//ul/li[4]//text()").extract(
                )
            clear_space(department)
            item['department'] = ''.join(department).strip()
            print("item['department'] = ", item['department'])

            # 课程长度
            duration = response.xpath(
                "//table[@class='top-button-course-variants-table']//tr[1]/td[2]//text()|//div[@class='top-button-duration']/div[@class='top-button-duration-value']/text()"
            ).extract()
            clear_space(duration)
            duration = ''.join(duration).strip()
            item['teach_time'] = getTeachTime(duration)

            p_l = ['Yr', 'yrs', 'yr', 'YR']
            for p in p_l:
                if p in duration:
                    item['duration'] = int(duration.replace(p, ""))
                    item['duration_per'] = 1
                    break

            print("item['duration'] = ", item['duration'])
            print("item['duration_per'] = ", item['duration_per'])

            # 专业描述
            overview1 = response.xpath(
                "//div[@id='content-items']/div[@class='layout-article-items']/div[@class='title-and-body-text']"
            ).extract()
            # print(overview1)
            overview2 = response.xpath("//div[@id='key-features']").extract()
            overview3 = response.xpath("//div[@id='description']").extract()
            clear_lianxu_space(overview1)
            clear_lianxu_space(overview2)
            clear_lianxu_space(overview3)
            overview = '\n'.join(overview1).strip() + '\n'.join(
                overview2).strip() + '\n'.join(overview3).strip()
            item['overview_en'] = remove_class(overview)
            print("item['overview_en'] = ", item['overview_en'])

            # 课程设置
            modules_1 = response.xpath(
                "//div[@class='ppsm-ms']//div[@class='variant']")
            # print("modules_1: ", modules_1)
            modules = []
            for m in modules_1:
                modules_year = m.xpath("./h3").extract()
                # print("modules_year: ", modules_year)
                modules.append(''.join(modules_year))
                modules_term = m.xpath("./h4").extract()
                # print("modules_term: ", modules_term)
                if len(modules_term) > 0:
                    for t in range(1, len(modules_term) + 1):
                        # print("modules_term: ", modules_term[t-1])
                        modules.append(modules_term[t - 1])
                        modules_name = m.xpath(
                            "./h4[" + str(t) +
                            "]/following-sibling::div[1]//table//tr/td[4]"
                        ).extract()
                        # print("modules_name: ", modules_name)
                        modules.append(''.join(modules_name))
            # print(modules)
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            print("item['modules_en'] = ", item['modules_en'])

            # IELTS
            entryRequirements = response.xpath(
                "//div[@id='entry-requirements']//text()").extract()
            # clear_space(entryRequirements)
            item['rntry_requirements'] = clear_lianxu_space(entryRequirements)
            # print("item['rntry_requirements'] = ", item['rntry_requirements'])
            entryRequirementsStr = ''.join(entryRequirements)
            # .{0,100}(IELTS).{0,100}
            # ielts = re.findall(r"\.[a-zA-Z0-9\s.]{0,80}(IELTS)[a-zA-Z0-9\s.\(\))]{0,80}", entryRequirementsStr)
            pat = r"\..{0,100}IELTS.{0,100}"
            re_ielts = re.compile(pat)
            ielts = re_ielts.findall(entryRequirementsStr)
            item['ielts_desc'] = ''.join(ielts).lstrip('.').strip()
            print("item['ielts_desc'] = ", item['ielts_desc'])
            ielts = item['ielts_desc']
            ieltlsrw = re.findall(r"\d\.\d", ielts)
            # print(ieltlsrw)
            if len(ieltlsrw) >= 2:
                item['ielts'] = ieltlsrw[0]
                item['ielts_l'] = ieltlsrw[1]
                item['ielts_s'] = ieltlsrw[1]
                item['ielts_r'] = ieltlsrw[1]
                item['ielts_w'] = ieltlsrw[1]
            elif len(ieltlsrw) == 1:
                item['ielts'] = ieltlsrw[0]
                item['ielts_l'] = ieltlsrw[0]
                item['ielts_s'] = ieltlsrw[0]
                item['ielts_r'] = ieltlsrw[0]
                item['ielts_w'] = ieltlsrw[0]
            else:
                item["ielts"] = None  # float
                item["ielts_l"] = None  # float
                item["ielts_s"] = None  # float
                item["ielts_r"] = None  # float
                item["ielts_w"] = None
            print(
                "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                % (item['ielts'], item['ielts_l'], item['ielts_s'],
                   item['ielts_r'], item['ielts_w']))

            # 学费
            # fee = html.xpath("//div[@id='tuition-fees-contents']/div[@class='table-wrapper']/table[@class='expander-item-fees-table']/tbody/tr[@class='expander-item-fees-table-row odd']/td[@class='expander-item-fees-table-data odd'][2]//text()")
            tuition_fee = response.xpath(
                "//div[@id='tuition-fees-contents']//table[@class='expander-item-fees-table']/tbody/tr[1]/td[4]//text()"
            ).extract()
            clear_space(tuition_fee)
            tuition_fee = ''.join(tuition_fee)
            # print(tuition_fee)
            if "£" in tuition_fee:
                item['tuition_fee'] = int(
                    tuition_fee.replace('£', '').replace(',', ''))
                item['tuition_fee_pre'] = "£"
            print("item['tuition_fee_pre'] = ", item['tuition_fee_pre'])
            print("item['tuition_fee'] = ", item['tuition_fee'])

            # //div[@id='how-to-apply']
            how_to_apply = response.xpath(
                "//div[@id='how-to-apply']").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(how_to_apply))
            print("item['apply_proces_en'] = ", item['apply_proces_en'])

            assessment_en = response.xpath("//div[@id='assessment']").extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))
            print("item['assessment_en'] = ", item['assessment_en'])

            career = response.xpath(
                "//div[@id='careers-and-employability']|//div[@id='careers-employability']|//div[@id='employabilitycareers']|//div[@id='employability-and-careers-']|//div[@id='careers-in-child-nursing-']|//div[@id='careers']|//div[@id='graduate-employability-and-careers']|//div[@id='careers-in-radiotherapy-physics']|//div[@id='careers-in-midwifery']|//div[@id='careers-in-neurophysiology-']|//div[@id='careers-in-psychology-']|//div[@id='careers-in-adult-nursing-']|//div[@id='careers-in-nursing']|//div[@id='career-prospects-']"
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            print("item['career_en'] = ", item['career_en'])

            yield item
        except Exception as e:
            with open(item['university'] + str(item['degree_type']) + ".txt",
                      'a+',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #25

Показать файл

    def parse_data(self, response):
        """Scrape one University of Hertfordshire research-degree page.

        Fills a fresh item with the department, degree name, programme
        title, teaching time, duration, location, overview, assessment,
        career, modules, international tuition fee, entry requirements and
        IELTS fields found on the page, then yields it.

        Args:
            response: the downloaded course page; only ``response.url`` and
                ``response.xpath(...)`` are used here.

        Yields:
            The populated item, once, when every extraction step succeeds.

        Any exception raised while extracting is not propagated: its message
        and the page URL are appended to ``<university><degree_type>.txt``
        and printed, and nothing is yielded for that page.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "University of Hertfordshire"
        item['url'] = response.url
        # Default teaching type; switched to 'research' below for
        # "Masters by Research" titles.
        item['teach_type'] = 'phd'
        # Degree type code
        item['degree_type'] = 3
        print("===========================")
        print(response.url)
        try:
            department = response.xpath("//div[@id='content']/main/div[@class='course-sub-head']/a//text()").extract()
            department = ''.join(department).strip()
            item['department'] = department
            print("department: ", department)

            # Programme and degree name both come from the page <h1>.
            # Left-strip first: the `^` anchor in the pattern below would
            # never match a title that begins with whitespace. Trailing
            # whitespace is kept so a one-word title followed by whitespace
            # still satisfies `^\w+\s`.
            title = ''.join(response.xpath("//div[@id='content']/main/h1//text()").extract()).lstrip()
            degree_matches = re.findall(r"Masters\sby\sResearch|^\w+\s", title)
            # The pattern has no capture groups, so each match is a plain str.
            degree_match = degree_matches[0] if degree_matches else ''
            if degree_match:
                # `^\w+\s` includes the separator after the leading word;
                # store the degree name without it.
                item['degree_name'] = degree_match.strip()
            print("item['degree_name']: ", item['degree_name'])
            # What is left of the title once the degree name is removed is
            # the programme name; trim the whitespace left around it.
            item['programme_en'] = title.replace(degree_match, '').strip()
            print("item['programme_en']: ", item['programme_en'])

            if item['degree_name'] == "Masters by Research":
                item['teach_type'] = 'research'

            duration = response.xpath(
                "//h3[contains(text(),'Key course information')]/following-sibling::ul//*[contains(text(), 'Full')]//text()").extract()
            # Return value unused — presumably cleans the list in place
            # (helper defined elsewhere; TODO confirm).
            clear_space(duration)
            print("duration: ", duration)
            duration_str = ''.join(duration)

            item['teach_time'] = getTeachTime(duration_str)
            # A two-element result is treated as (duration, duration unit);
            # any other length leaves both fields untouched.
            duration_list = getIntDuration(duration_str)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]
            print("item['teach_time'] = ", item['teach_time'])
            print("item['duration'] = ", item['duration'])
            print("item['duration_per'] = ", item['duration_per'])

            location = response.xpath(
                "//h3[contains(text(),'Key course information')]/following-sibling::ul//*[contains(text(), 'Locations')]/../../following-sibling::*//text()").extract()
            clear_space(location)
            item['location'] = ''.join(location).strip()
            print("item['location'] = ", item['location'])

            overview = response.xpath("//div[@id='overview']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            print("item['overview_en']: ", item['overview_en'])

            # Everything after the "Teaching methods" heading is stored as
            # the assessment text, with the heading itself re-added in front.
            assessment_en = response.xpath("//h3[contains(text(),'Teaching methods')]/following-sibling::*").extract()
            if assessment_en:
                item['assessment_en'] = "<h3>Teaching methods</h3>" + remove_class(clear_lianxu_space(assessment_en))
            print("item['assessment_en']: ", item['assessment_en'])

            # Everything before the "Teaching methods" heading is stored as
            # the career text.
            career_en = response.xpath("//h3[contains(text(),'Teaching methods')]/preceding-sibling::*").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))
            print("item['career_en']: ", item['career_en'])

            modules = response.xpath("//div[@id='modules']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            print("item['modules_en']: ", item['modules_en'])

            # Tuition fee: first pound amount in the list that follows the
            # "Full..." heading under "International Students".
            fee_content = response.xpath("//h4[contains(text(),'International Students')]/following-sibling::h5[contains(text(), 'Full')]/following-sibling::ul[1]//text()").extract()
            clear_space(fee_content)
            fee_list = re.findall(r"£[\d,]+", ''.join(fee_content))
            if fee_list:
                item['tuition_fee'] = int(fee_list[0].replace('£', '').replace(',', '').strip())
                item['tuition_fee_pre'] = '£'
            print("item['tuition_fee']: ", item['tuition_fee'])
            print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])

            # Entry requirements: all text preceding the "How to apply" heading.
            entry_requirements = response.xpath("//h2[contains(text(),'How to apply')]/preceding-sibling::*//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)
            print("item['rntry_requirements']: ", item['rntry_requirements'])

            # Pull the IELTS sentence(s) out of the entry requirements and
            # let get_ielts split them into overall and per-skill scores.
            ielts = re.findall(r"IELTS[\sa-zA-Z]*\d\.?\d?[\sa-z\(\)]*\d\.?\d?[\sa-z\(\)]{1,100}", item['rntry_requirements'])
            item['ielts_desc'] = ''.join(ielts).strip()
            print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')
            print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                    item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

            yield item
        except Exception as e:
            # Deliberate best-effort: record the failure for this page in a
            # per-university log file and keep the crawl going.
            with open(item['university']+str(item['degree_type'])+".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #26

Показать файл

    def parse_data(self, response):
        """Scrape one University of Bolton course page into an item.

        Fills a fresh ``ScrapyschoolEnglandItem1`` from XPath queries against
        ``response`` (award, programme title, teaching mode, start dates,
        location, duration, page sections, IELTS, tuition fee, department).

        The item is yielded only when its URL contains
        ``https://courses.bolton.ac.uk/course`` and either the
        "Click here for more information on" link text contains
        "postgraduate", or neither that link nor any UCAS code/points text
        is present on the page.

        ``response.meta['subjectArea']`` must be provided by the request; it
        is read before the ``try`` block, so a missing key propagates.

        Any exception raised while parsing is appended, with the URL, to
        "<university><degree_type>.txt" and printed; it is not re-raised.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "University of Bolton"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # Degree type
        item['degree_type'] = 2
        print("===========================")
        print(response.url)
        print("subjectArea===: ", response.meta['subjectArea'])
        try:
            title_parts = response.xpath(
                "//div[@class='wpb_text_column wpb_content_element  vc_custom_1506499626241']/div[@class='wpb_wrapper']/h2//text()"
            ).extract()
            title = ''.join(title_parts).strip()

            award_parts = response.xpath(
                "//li[@class='iconim award']//b[contains(text(),'Award:')]/..//text()"
            ).extract()
            item['degree_name'] = ''.join(award_parts).replace(
                "Award:", "").strip()
            print("item['degree_name']: ", item['degree_name'])

            # The heading text holds the programme name together with the
            # award; remove the award and any empty "()" left behind.
            item['programme_en'] = title.replace(
                item['degree_name'], '').replace("()", "").strip()

            # clear_space is called for its effect on the list (its return
            # value is unused) -- presumably drops blank entries in place;
            # TODO confirm against the helper.
            mode = response.xpath(
                "//b[contains(text(),'Course type:')]/..//text()").extract()
            clear_space(mode)
            item['teach_time'] = getTeachTime(''.join(mode))

            start_date = response.xpath(
                "//li[@class='iconim date']//b[contains(text(),'Start date:')]/..//text()"
            ).extract()
            clear_space(start_date)
            start_date_str = ''.join(start_date).replace(
                "Start date:", "").strip()
            # Reorder every a/b/c date to c-b-a (presumably dd/mm/yyyy ->
            # yyyy-mm-dd). The parts are collected first and joined once, so
            # a start_date field that is still None is never concatenated to
            # (that raised TypeError and discarded the whole item).
            reordered_dates = []
            for raw_date in re.findall(r"\d+/\d+/\d+", start_date_str):
                first, second, third = raw_date.split('/')
                reordered_dates.append("-".join((third, second, first)))
            if reordered_dates:
                item['start_date'] = (item['start_date']
                                      or "") + ", ".join(reordered_dates)
            if item['start_date'] is not None:
                item['start_date'] = item['start_date'].strip().rstrip(
                    ',').strip()

            location = response.xpath(
                "//li[@class='iconim location']//b[contains(text(),'Location:')]/..//text()"
            ).extract()
            item['location'] = ''.join(location).replace("Location:",
                                                         "").strip()

            duration = response.xpath(
                "//li[@class='iconim duration']//b[contains(text(),'Duration:')]/..//text()"
            ).extract()
            clear_space(duration)
            # Only a two-element result is used: first element -> duration,
            # last element -> duration_per.
            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]

            overview_en = response.xpath(
                "//div[@id='course-details']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))

            entry_requirements = response.xpath(
                "//div[@id='entry-requirements']//text()").extract()
            # NOTE: 'rntry_requirements' is the key used throughout; it is
            # kept as-is because it is the item's field name.
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)

            ielts_desc = response.xpath(
                "//div[@id='entry-requirements']//*[contains(text(),'IELTS')]/text()"
            ).extract()
            clear_space(ielts_desc)
            item['ielts_desc'] = ''.join(ielts_desc).strip()

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')

            career_en = response.xpath(
                "//div[@id='careers-employment']").extract()
            item['career_en'] = remove_class(
                clear_lianxu_space(career_en)).replace("<div></div>",
                                                       "").strip()

            how_to_apply = response.xpath(
                "//div[@id='how-to-apply']").extract()
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(how_to_apply))

            modules = response.xpath(
                "//div[@class='tab_content modules_tab_content tab__teaching-assessment__modules']"
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))

            assessment_en = response.xpath(
                "//div[@class='tab_content modules_tab_content tab__teaching-assessment__teaching-methods']"
                "|//div[@class='tab_content modules_tab_content tab__teaching-assessment__assessment-methods']"
            ).extract()
            item['assessment_en'] = remove_class(
                clear_lianxu_space(assessment_en))

            # Fee cell of the first 'International fees' table row whose
            # header mentions '2018/'.
            tuition_fee = response.xpath(
                "//h3[@class='table_header'][contains(text(),'International fees')]/following-sibling::div[1]/table//tr/th[contains(text(),'2018/')][1]/following-sibling::td[1]//text()"
            ).extract()
            if len(tuition_fee) > 0:
                item['tuition_fee'] = getTuition_fee(''.join(tuition_fee))
                item['tuition_fee_pre'] = "£"

            # Subject area (from the request meta) -> owning school/faculty.
            department_dict = {
                "Art & Design and Fine Art": "Bolton School of the Arts",
                "Textiles & Fashion": "Bolton School of the Arts",
                "Media & Photography": "Bolton School of the Arts",
                "Theatre & Performance": "Bolton School of the Arts",
                "English & Creative Writing": "Bolton School of the Arts",
                "Graphic Design": "Bolton School of the Arts",
                "Animation & Illustration": "Bolton School of the Arts",
                "Accountancy": "Institute of Management Greater Manchester",
                "Business, Retail, Logistics & Supply Chain Management":
                "Institute of Management Greater Manchester",
                "Nursing": "Faculty of Health & Wellbeing",
                "Health & Social Care": "Faculty of Health & Wellbeing",
                "Dental Sciences": "Faculty of Health & Wellbeing",
                "Early Years & Childhood Studies":
                "Faculty of Health & Wellbeing",
                "Community Work & Youth": "Faculty of Health & Wellbeing",
                "School of Sport & Biological Sciences":
                "Faculty of Health & Wellbeing",
                "Automotive Design":
                "National Centre for Motorsport Engineering",
                "Chassis Dynamics & Aerodynamics":
                "National Centre for Motorsport Engineering",
                "General Engineering":
                "National Centre for Motorsport Engineering",
                "Motorsport & Trackside Technology":
                "National Centre for Motorsport Engineering",
                "Engines & Performance Modelling":
                "National Centre for Motorsport Engineering",
                "Our Partners": "National Centre for Motorsport Engineering",
                "Computing": "School of Creative Technologies",
                "Games": "School of Creative Technologies",
                "Special & Visual Effects": "School of Creative Technologies",
                "Education & Teacher Training":
                "School of Education & Psychology",
                "Psychology": "School of Education & Psychology",
                "Access courses": "School of Education & Psychology",
                "International Foundation programmes & English Pre-Sessional courses":
                "School of Education & Psychology",
                "Construction": "School of Engineering",
                "Civil Engineering": "School of Engineering",
                "Mechanical Engineering": "School of Engineering",
                "Motorsport & Automotive Performance Engineering":
                "School of Engineering",
                "Biomedical & Medical Engineering": "School of Engineering",
                "Electrical & Electronic Engineering": "School of Engineering",
                "Mathematics": "School of Engineering",
                "Law": "School of Law",
                "Centre for Contemporary Coronial Law": "School of Law",
                "Medical Biology": "School of Sport & Biological Sciences",
                "Sports & Sport Rehabilitation":
                "School of Sport & Biological Sciences",
            }
            # Unknown subject areas leave the department as None.
            item['department'] = department_dict.get(
                response.meta['subjectArea'])
            print("item['department']: ", item['department'])

            item[
                'require_chinese_en'] = "<p><strong>Postgraduate</strong></p><p><em>Taught Postgraduate Programmes:</em></p><p>Bachelor degree from a recognised Chinese university.</p>"

            isup = response.xpath(
                "//a[contains(text(),'Click here for more information on')]//text()"
            ).extract()
            # isup_str is taken from the link text only; the UCAS fallback
            # below replaces isup but deliberately leaves isup_str untouched.
            isup_str = ''.join(isup)
            if not isup:
                isup = response.xpath(
                    "//li[@class='iconim code']//b[contains(text(),'UCAS code:')]/..//text()"
                    "|//li[@class='iconim points']//b[contains(text(),'UCAS points:')]/..//text()"
                ).extract()
            print("isup_str: ", isup_str)
            print("isup: ", isup)
            on_course_site = "https://courses.bolton.ac.uk/course" in item[
                'url']
            if on_course_site and ("postgraduate" in isup_str or not isup):
                print("******存到数据库*****")
                yield item

        except Exception as e:
            # Best-effort boundary: record the failure and the URL, then
            # carry on with the next page.
            with open(item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #27

Показать файл

Файл: UniversityOfHertfordshire_P.py Проект: histudent/python_spider

    def parse_data(self, response):
        """Scrape one University of Hertfordshire course page into an item.

        Fills a fresh ``ScrapyschoolEnglandItem1`` from XPath queries against
        ``response`` (department, degree name and programme title, teaching
        mode and duration, location, page sections, international tuition
        fee, entry requirements, IELTS, start dates) and yields it.

        Any exception raised while parsing is appended, with the URL, to
        "<university><degree_type>.txt" and printed; it is not re-raised.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "University of Hertfordshire"
        item['url'] = response.url
        item['teach_type'] = 'taught'
        # Degree type
        item['degree_type'] = 2
        print("===========================")
        print(response.url)
        try:
            department = response.xpath(
                "//div[@id='content']/main/div[@class='course-sub-head']/a//text()|//div[@class='banner__caption banner__caption--below']//text()"
            ).extract()
            department = ''.join(department).replace("in the ", "").strip()
            item['department'] = department
            print("department: ", department)

            # Programme name and degree name share one heading.
            title_parts = response.xpath(
                "//div[@id='content']/main/h1//text()|//span[@class='color--red']/..//text()"
            ).extract()
            title = ''.join(title_parts).strip()
            # The leading word (up to the first whitespace) is taken as the
            # degree name; with no match the prefix stays empty and the
            # title is left unchanged by the replace below.
            degree_prefix = ''
            leading_word = re.findall(r"^\w+\s", title)
            if leading_word:
                degree_prefix = leading_word[0].strip()
                item['degree_name'] = degree_prefix
            print("item['degree_name']: ", item['degree_name'])
            item['programme_en'] = title.replace(degree_prefix, '').strip()
            print("item['programme_en']: ", item['programme_en'])

            # clear_space is called for its effect on the list (its return
            # value is unused) -- presumably drops blank entries in place;
            # TODO confirm against the helper.
            duration = response.xpath(
                "//h3[contains(text(),'Key course information')]/following-sibling::ul//*[contains(text(), 'Full')]//text()|"
                "//h4[contains(text(),'Course length')]/following-sibling::div[1]//text()"
            ).extract()
            clear_space(duration)
            duration_str = ''.join(duration)

            item['teach_time'] = getTeachTime(duration_str)
            # Only a two-element result is used: first element -> duration,
            # last element -> duration_per.
            duration_list = getIntDuration(duration_str)
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[-1]

            location = response.xpath(
                "//h3[contains(text(),'Key course information')]/following-sibling::ul//*[contains(text(), 'Locations')]/../../following-sibling::*//text()|"
                "//h4[contains(text(),'Locations')]/following-sibling::div[1]//text()"
            ).extract()
            clear_space(location)
            item['location'] = ''.join(location).strip()

            overview = response.xpath(
                "//div[@id='overview']|//section[@data-section='section-overview']"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))

            # Siblings after the 'Teaching methods' heading, or -- when
            # there are none -- the heading's parent element.
            assessment_en = response.xpath(
                "//h3[contains(text(),'Teaching methods')]/following-sibling::*"
            ).extract()
            if not assessment_en:
                assessment_en = response.xpath(
                    "//h3[contains(text(),'Teaching methods')]/..").extract()
            if assessment_en:
                item['assessment_en'] = (
                    "<h3>Teaching methods</h3>" +
                    remove_class(clear_lianxu_space(assessment_en)))

            # Siblings before the 'Teaching methods' heading, or -- when
            # there are none -- the 'Careers' heading and what follows it.
            career_en = response.xpath(
                "//h3[contains(text(),'Teaching methods')]/preceding-sibling::*"
            ).extract()
            if not career_en:
                career_en = response.xpath(
                    "//h3[contains(text(),'Careers')]|//h3[contains(text(),'Careers')]/following-sibling::*"
                ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))

            modules = response.xpath(
                "//div[@id='modules']|//div[@id='module-structure']"
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))

            # First pound amount in the list following the 'Full' heading
            # under 'International Students'.
            feeContent = response.xpath(
                "//h4[contains(text(),'International Students')]/following-sibling::h5[contains(text(), 'Full')]/following-sibling::ul[1]//text()"
            ).extract()
            clear_space(feeContent)
            feelist = re.findall(r"£[\d,]+", ''.join(feeContent))
            if feelist:
                fee_digits = feelist[0].replace('£', '').replace(',', '')
                item['tuition_fee'] = int(fee_digits.strip())
                item['tuition_fee_pre'] = '£'

            entry_requirements = response.xpath(
                "//h2[contains(text(),'How to apply')]/preceding-sibling::*//text()"
            ).extract()
            # NOTE: 'rntry_requirements' is the key used throughout; it is
            # kept as-is because it is the item's field name.
            item['rntry_requirements'] = clear_lianxu_space(entry_requirements)

            ielts = re.findall(
                r"IELT[\sa-zA-Z]*\d\.?\d?[\sa-z\(\)]*\d\.?\d?[\sa-z\(\)]{1,100}",
                item['rntry_requirements'])
            item['ielts_desc'] = ''.join(ielts).strip()

            ielts_dict = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts_dict.get('IELTS')
            item['ielts_l'] = ielts_dict.get('IELTS_L')
            item['ielts_s'] = ielts_dict.get('IELTS_S')
            item['ielts_r'] = ielts_dict.get('IELTS_R')
            item['ielts_w'] = ielts_dict.get('IELTS_W')

            start_date = response.xpath(
                "//div[@class='how-to-apply-table']//table//td[contains(text(),'Full')]/preceding-sibling::*[2]//text()"
            ).extract()
            clear_space(start_date)
            print("start_date: ", start_date)
            # De-duplicate while keeping page order. Going through a bare
            # set made the order of the joined string vary between runs
            # (string hashing is randomised per process).
            seen_dates = set()
            unique_dates = []
            for date_text in start_date:
                if date_text not in seen_dates:
                    seen_dates.add(date_text)
                    unique_dates.append(date_text)
            item['start_date'] = ','.join(unique_dates).strip()
            print("item['start_date'] = ", item['start_date'])

            item[
                "require_chinese_en"] = "<p>Chinese 4-year Bachelor degree with 70% or above</p>"
            item[
                "apply_proces_en"] = "https://www.herts.ac.uk/study/how-to-apply"
            yield item
        except Exception as e:
            # Best-effort boundary: record the failure and the URL, then
            # carry on with the next page.
            with open(item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)