Python remove_class примеры использования

Язык программирования: Python

Пространство имен/Пакет: scrapySchool_Australian_ben.remove_tags

Метод/Функция: remove_class

Примеров на hotexamples.com: 30

Python remove_class — найдено 30 примеров. Это лучшие примеры кода на Python для scrapySchool_Australian_ben.remove_tags.remove_class, полученные из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Пример #1

Показать файл

Файл: LaTrobeUniversity_U.py Проект: histudent/python_spider

    def parse_major_major(self, majorUrl):
        """Download a major page and collect its title, overview and career HTML.

        The page at ``majorUrl`` is fetched with ``self.headers_base`` and
        parsed with lxml. Returns a dict with the keys ``programme_en``
        (joined ``<h1>`` text), ``overview_en`` and ``career_en`` (cleaned
        HTML of the ``intro_txt`` / ``intro_txt_2`` containers).
        """
        print("major_majorUrl: ", majorUrl)
        page = requests.get(majorUrl, headers=self.headers_base)
        tree = etree.HTML(page.text)

        def cleaned_html(xpath_expr):
            # Serialize every matched node, concatenate, then clean the markup.
            raw = ''.join(
                etree.tostring(node,
                               encoding='unicode',
                               pretty_print=False,
                               method='html')
                for node in tree.xpath(xpath_expr))
            return remove_class(clear_lianxu_space([raw]))

        title = ''.join(tree.xpath("//h1//text()")).strip()
        return {
            'programme_en': title,
            'overview_en': cleaned_html("//div[@id='intro_txt']"),
            'career_en': cleaned_html("//div[@id='intro_txt_2']"),
        }

Пример #2

Показать файл

Файл: RMITUniversity_U.py Проект: histudent/python_spider

    def parse_career1(self, url, item):
        """Append career HTML scraped from ``url`` to ``item['career_en']``.

        The parent of the "Career outlook" heading is tried first. If
        ``item['career_en']`` is still an empty string afterwards, the
        ``m-super-detail-page`` container is appended instead.
        """
        request_headers = {
            'User-Agent':
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        }
        page = requests.get(url, headers=request_headers)
        tree = etree.HTML(page.text)

        def cleaned_html(xpath_expr):
            # Serialize every matched node, concatenate, then clean the markup.
            raw = ''.join(
                etree.tostring(node, encoding='unicode', method='html')
                for node in tree.xpath(xpath_expr))
            return remove_class(clear_lianxu_space([raw]))

        item['career_en'] += cleaned_html(
            "//h2[contains(text(),'Career outlook')]/..")

        # Fallback: nothing collected so far, take the whole detail container.
        if item['career_en'] == "":
            item['career_en'] += cleaned_html(
                "//div[@class='m-super-detail-page']")

Пример #3

Показать файл

Файл: TheUniversityOfNewSouthWales_degree_overview.py Проект: histudent/python_spider

    def parse_feedata(self, response):
        """Store cleaned overview and career HTML for the degree at ``response.url``.

        The degree name is looked up in ``self.resultDict`` by the response
        URL. The cleaned overview and career markup are written to
        ``self.degree_overview_resultDict1`` and ``self.career_resultDict1``
        under that name. Nothing is returned.
        """
        print("==============================")
        print(response.url)
        degree_name = self.resultDict.get(response.url)
        print("degree_name==: ", degree_name)

        # Only the long-text field container is used for the overview. An
        # anchor-based "Overview" query used to run first, but its result was
        # overwritten before being read, so it has been dropped.
        degree_overview = response.xpath(
            "//div[@class='field field-type-text-long']").extract()
        clear_space(degree_overview)
        degree_overview = remove_class(''.join(degree_overview).strip())
        print("degree_overview: ", degree_overview)

        career_en = response.xpath(
            "//*/a[contains(text(), 'Career')]/../preceding-sibling::*[1]/following-sibling::*[position()<6]"
        ).extract()
        clear_space(career_en)
        career_en = remove_class(''.join(career_en).strip())
        print("career_en: ", career_en)

        self.degree_overview_resultDict1[degree_name] = degree_overview
        self.career_resultDict1[degree_name] = career_en
        print("**degree_overview_resultDict1: ",
              self.degree_overview_resultDict1)
        print("*****career_resultDict1: ", self.career_resultDict1)

Пример #4

Показать файл

Файл: RMITUniversity_U.py Проект: histudent/python_spider

    def parse_overviewModules1(self, url, item):
        """Fill department, degree overview and module HTML on ``item``.

        Fetches ``url`` with a fixed browser User-Agent and writes
        ``item['department']``, ``item['degree_overview_en']`` and
        ``item['modules_en']``. Nothing is returned.
        """
        request_headers = {
            'User-Agent':
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        }
        page = requests.get(url, headers=request_headers)
        tree = etree.HTML(page.text)

        def cleaned_html(xpath_expr):
            # Serialize every matched node, concatenate, then clean the markup.
            raw = ''.join(
                etree.tostring(node, encoding='unicode', method='html')
                for node in tree.xpath(xpath_expr))
            return remove_class(clear_lianxu_space([raw]))

        # Department: text of whatever follows the "School:" label.
        school_parts = tree.xpath(
            "//span[contains(text(),'School:')]/following-sibling::*//text()")
        clear_space(school_parts)
        item['department'] = ''.join(school_parts).strip()
        print("item['department']: ", item['department'])

        # Overview: first div after the three-column summary block.
        item['degree_overview_en'] = cleaned_html(
            "//div[@class='c-summary c-summary-3-col clearfix']/following-sibling::div[1]"
        )

        # Modules: union of the heading variants listed in the selector.
        item['modules_en'] = cleaned_html(
            "//h3[contains(text(),'Program Structure')]|//h3[contains(text(),'Program Structure')]/following-sibling::*|"
            "//*[contains(text(),'Program structure')]|//*[contains(text(),'Program structure')]/following-sibling::*|"
            "//h2[contains(text(),'Structure')]/..|//h2[contains(text(),'Stucture')]/..|"
            "//h2[contains(text(),'Specialisations and electives')]/..")

Пример #5

Показать файл

Файл: RMITUniversity_U.py Проект: histudent/python_spider

    def parse_entryrequirements1(self, url, item):
        """Add entry-requirement HTML and IELTS/TOEFL text from ``url`` to ``item``.

        Appends cleaned HTML to ``item['rntry_requirements_en']`` and
        overwrites ``item['ielts_desc']`` and ``item['toefl_desc']``.
        Nothing is returned.
        """
        request_headers = {
            'User-Agent':
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        }
        page = requests.get(url, headers=request_headers)
        tree = etree.HTML(page.text)

        # Entry requirements: third and later divs of the overview page section.
        requirement_nodes = tree.xpath(
            "//div[@data-page-id='overview']/div[position()>=3]")
        requirements_html = ''.join(
            etree.tostring(node, encoding='unicode', method='html')
            for node in requirement_nodes)
        item['rntry_requirements_en'] += remove_class(
            clear_lianxu_space([requirements_html]))

        # English-test descriptions are taken as plain text around each label.
        ielts_parts = tree.xpath(
            "//*[contains(text(),'IELTS (Academic): ')]//text()")
        item['ielts_desc'] = ''.join(ielts_parts).strip()

        toefl_parts = tree.xpath(
            "//*[contains(text(),'TOEFL (Internet Based Test - IBT): ')]//text()"
        )
        item['toefl_desc'] = ''.join(toefl_parts).strip()

Пример #6

Показать файл

    def parse_modules(self, modulesUrl):
        """Fetch a course page and return its module HTML plus the majors lists.

        Returns a 3-tuple ``(modules_en, major_list, major_overview_list)``:
        the cleaned HTML of the overview section's main column, the text
        nodes naming the majors, and the lxml nodes matched by the second
        "Majors" selector.
        """
        print("modulesUrl: ", modulesUrl)
        page = requests.get(modulesUrl, headers=self.headers_base)
        tree = etree.HTML(page.text)

        # NOTE(review): the selector starts with three slashes, which is not
        # standard XPath 1.0 syntax — confirm the parser in use accepts it.
        section_nodes = tree.xpath(
            "///section[@id='overview']//div[@class='course-section__main']")
        raw_html = ''.join(
            etree.tostring(node,
                           encoding='unicode',
                           pretty_print=False,
                           method='html')
            for node in section_nodes)
        modules_en = remove_class(clear_lianxu_space([raw_html]))

        major_list = tree.xpath(
            "//span[contains(text(),'Majors')]/../../following-sibling::ul/li[1]/div/span[1]//text()"
        )
        major_overview_list = tree.xpath(
            "//span[contains(text(),'Majors')]/../../following-sibling::ul/li[2]"
        )
        return modules_en, major_list, major_overview_list

Пример #7

Показать файл

Файл: TheUniversityOfNewSouthWales_U_handbook2019.py Проект: histudent/python_spider

    def parse_major_detile(self, major_url, item):
        """Populate ``item`` from a handbook major page and return its course links.

        Sets ``item['url']``, ``item['programme_en']``, ``item['overview_en']``
        and ``item['modules_en']``. Returns the ``href`` values found inside
        the ``tabluatedInfo`` table.
        """
        item['url'] = major_url
        print("item['url']_major: ", item['url'])
        page = requests.get(major_url, headers=self.headers_base)
        # The XML declaration is removed from the text before parsing.
        tree = etree.HTML(
            page.text.replace('<?xml version="1.0" encoding="utf-8"?>', ""))

        def cleaned_html(xpath_expr):
            # Serialize every matched node, concatenate, then clean the markup.
            raw = ''.join(
                etree.tostring(node, encoding='unicode', method='html')
                for node in tree.xpath(xpath_expr))
            return remove_class(clear_lianxu_space([raw]))

        heading_parts = tree.xpath(
            "//div[@class='internalContentWrapper']/h1[1]//text()")
        print("prog ", heading_parts)
        heading = ''.join(heading_parts)
        if "-" not in heading:
            item['programme_en'] = heading
        else:
            # Drop the segment after the last hyphen; the rest is joined
            # back together without separators.
            item['programme_en'] = ''.join(heading.split("-")[:-1]).strip()
        print("item['programme_en']_major: ", item['programme_en'])

        item['overview_en'] = cleaned_html(
            "//h2[contains(text(),'Stream Outline')]/../preceding-sibling::*[1]/following-sibling::*[position()<3]|"
            "//td[@class='mainInformation']//div[1]")
        print("item['overview_en']_major: ", item['overview_en'])

        item['modules_en'] = cleaned_html(
            "//a[@name='planstructure']/preceding-sibling::*[1]/following-sibling::*[position()<last()-1]|"
            "//table[@class='tabluatedInfo']")
        print("item['modules_en']_major: ", item['modules_en'])

        return tree.xpath("//table[@class='tabluatedInfo']//tr/td/a/@href")

Пример #8

Показать файл

Файл: TheUniversityOfMelbourne_U.py Проект: histudent/python_spider

    def parse_modules(self, modulesUrl):
        """Return the cleaned HTML of the course-structure page at ``modulesUrl``.

        Every direct child of the ``with-jumpnav`` container is serialized,
        concatenated and passed through the cleaning helpers.
        """
        print("课程结构链接============" + modulesUrl + "===============")
        page = requests.get(modulesUrl, headers=self.headers)
        tree = etree.HTML(page.text)

        child_nodes = tree.xpath("//div[@class='with-jumpnav']/*")
        raw_html = ''.join(
            etree.tostring(node, encoding='unicode', method='html')
            for node in child_nodes)
        return remove_class(clear_lianxu_space([raw_html]))

Пример #9

Показать файл

    def parse_apply(self, applyUrl, item):
        """Fill application details on ``item`` from the page at ``applyUrl``.

        Writes ``item['apply_desc_en']``, ``item['deadline']`` and
        ``item['apply_documents_en']``. Nothing is returned.
        """
        print("applyUrl: ", applyUrl)
        page = requests.get(applyUrl, headers=self.headers_base)
        tree = etree.HTML(page.text)

        def cleaned_html(xpath_expr):
            # Serialize every matched node, concatenate, then clean the markup.
            raw = ''.join(
                etree.tostring(node,
                               encoding='unicode',
                               pretty_print=False,
                               method='html')
                for node in tree.xpath(xpath_expr))
            return remove_class(clear_lianxu_space([raw]))

        item['apply_desc_en'] = cleaned_html(
            "//div[@id='apply-now']|//div[@id='how-to-apply']")
        print("跳转获得：item['apply_desc_en']: ", item['apply_desc_en'])

        # The deadline is taken as text nodes, not serialized markup.
        deadline_parts = tree.xpath(
            "//*[contains(text(),'Application closing dates')]/../following-sibling::*[1]//text()|"
            "//strong[contains(text(),'Application Deadlines')]/../following-sibling::*[position()<3]//text()"
        )
        item['deadline'] = clear_lianxu_space(deadline_parts)
        print("跳转获得：item['deadline']: ", item['deadline'])

        item['apply_documents_en'] = cleaned_html(
            "//h2[contains(text(),'Application Checklist')]/..")
        print("跳转获得：item['apply_documents_en']: ", item['apply_documents_en'])

Пример #10

Показать файл

    def parse_modules(self, modules_a_url):
        """Return the cleaned HTML of the core-structure block at ``modules_a_url``.

        The last ``div`` inside ``structure_CoreCore`` is serialized and
        passed through the cleaning helpers. Cleaning runs once, after all
        matched nodes are serialized, so the return value is always the
        helpers' output, including when nothing matches. Previously an
        empty match returned the raw empty XPath result list.
        """
        headers_base = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"}
        data = requests.get(modules_a_url, headers=headers_base)
        response = etree.HTML(data.text)

        module_nodes = response.xpath(
            "//div[@id='structure_CoreCore']/div[last()]")
        modules_en_str = ''.join(
            etree.tostring(node, encoding='unicode', method='html')
            for node in module_nodes)
        return remove_class(clear_lianxu_space([modules_en_str]))

Пример #11

Показать файл

 def parse_career(self, career_url):
     """Return the cleaned career HTML found on the page at ``career_url``.

     Every ``course-content`` div inside the ``with-jumpnav`` container is
     serialized, concatenated and passed through the cleaning helpers.
     """
     print("career_url: ", career_url)
     page = requests.get(career_url, headers=self.headers_base)
     tree = etree.HTML(page.text)
     content_nodes = tree.xpath(
         "//div[@class='with-jumpnav']//div[@class='course-content']")
     raw_html = ''.join(
         etree.tostring(node,
                        encoding='unicode',
                        pretty_print=False,
                        method='html')
         for node in content_nodes)
     return remove_class(clear_lianxu_space([raw_html]))

Пример #12

Показать файл

Файл: LaTrobeUniversity_U.py Проект: histudent/python_spider

    def parse_modules(self, modulesUrl):
        """Return the cleaned course-structure HTML from the page at ``modulesUrl``.

        Matches the "Course structure" / "Course Structure" ``h3`` heading
        and its following siblings except the last one.
        """
        page = requests.get(modulesUrl, headers=self.headers_base)
        tree = etree.HTML(page.text)

        structure_nodes = tree.xpath(
            "//h3[contains(text(),'Course structure')]|//h3[contains(text(),'Course structure')]/following-sibling::*[position()<last()]|"
            "//h3[contains(text(),'Course Structure')]|//h3[contains(text(),'Course Structure')]/following-sibling::*[position()<last()]"
        )
        raw_html = ''.join(
            etree.tostring(node,
                           encoding='unicode',
                           pretty_print=False,
                           method='html')
            for node in structure_nodes)
        return remove_class(clear_lianxu_space([raw_html]))

Пример #13

Показать файл

Файл: RMITUniversity_U.py Проект: histudent/python_spider

    def parse_modules1(self, url, item):
        """Append program-structure HTML scraped from ``url`` to ``item['modules_en']``.

        Fetches the page with a fixed browser User-Agent. Nothing is returned.
        """
        request_headers = {
            'User-Agent':
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        }
        page = requests.get(url, headers=request_headers)
        tree = etree.HTML(page.text)

        structure_nodes = tree.xpath(
            "//h3[contains(text(),'Program Structure')]|//h3[contains(text(),'Program Structure')]/following-sibling::*|"
            "//h2[contains(text(),'Structure')]/..|"
            "//h2[contains(text(),'Specialisations and electives')]/..")
        raw_html = ''.join(
            etree.tostring(node, encoding='unicode', method='html')
            for node in structure_nodes)
        item['modules_en'] += remove_class(clear_lianxu_space([raw_html]))

Пример #14

Показать файл

Файл: TheUniversityOfMelbourne_U.py Проект: histudent/python_spider

    def parse_rntry_tuition_fee(self, rntry_tuition_fee_url):
        """Return ``[entry_requirements_html, tuition_fee_text]`` for the given page.

        Each matched requirements node is cleaned on its own and the cleaned
        pieces are concatenated. The fee is the stripped text around the
        "Typical course fee for 2019:" label.
        """
        page = requests.get(rntry_tuition_fee_url, headers=self.headers)
        tree = etree.HTML(page.text)

        requirement_nodes = tree.xpath(
            "//div[@class='entry-requirements']|//div[@class='prerequisites']")
        requirements_html = ''.join(
            remove_class(
                clear_lianxu_space([
                    etree.tostring(node, encoding='unicode', method='html')
                ])) for node in requirement_nodes)

        fee_parts = tree.xpath(
            "//*[contains(text(),'Typical course fee for 2019:')]//text()")
        clear_space(fee_parts)
        fee_text = ''.join(fee_parts).strip()
        return [requirements_html, fee_text]

Пример #15

Показать файл

Файл: LaTrobeUniversity_U.py Проект: histudent/python_spider

    def parse_major(self, majorUrl, maj):
        """Return a one-element list with the programme dict for major ``maj``.

        The page at ``majorUrl`` is searched for a "why study" list link whose
        text contains the tag-stripped ``maj``. If no link matches, the
        programme title and overview are read from this page; otherwise the
        first matching link is delegated to ``self.parse_major_major``.

        The major name is passed to XPath as a variable rather than spliced
        into the expression, so a name containing an apostrophe no longer
        produces an invalid expression.
        """
        print("majorUrl: ", majorUrl)
        data = requests.get(majorUrl, headers=self.headers_base)
        response = etree.HTML(data.text)

        ul_major = response.xpath(
            "//div[@id='why_study']//ul[@type='disc']/li//a[contains(text(),"
            "$major_name)]/@href",
            major_name=remove_tags(maj))
        clear_space(ul_major)
        print("ul_major: ", ul_major)

        if len(ul_major) == 0:
            # No dedicated major page: describe the major from this page.
            programme_dict = {}
            programme = response.xpath("//h1//text()")
            programme_dict['programme_en'] = ''.join(programme).strip()

            overview_nodes = response.xpath(
                "//div[@id='overview']|//div[@id='why_study']")
            overview_en_str = ''.join(
                etree.tostring(node,
                               encoding='unicode',
                               pretty_print=False,
                               method='html')
                for node in overview_nodes)
            overview_en = remove_class(clear_lianxu_space([overview_en_str]))
            programme_dict['overview_en'] = overview_en
            print("overview_en: ", overview_en)
        else:
            # Only the first matching link is followed.
            programme_dict = self.parse_major_major(ul_major[0])
        return [programme_dict]

Пример #16

Показать файл

    def parse_data(self, response):
        """Parse a University of South Australia degree page.

        If the page lists specialisations, one request per link is yielded
        with this method as the callback. Otherwise an item is filled from
        the page and yielded, provided the degree name contains "Bachelor"
        fewer than two times and does not contain "research".

        Any exception raised while filling the item is appended to a
        per-university file under ``scrapySchool_Australian_ben/error/`` and
        printed; it is not re-raised.
        """

        def drop_in_word(text):
            # Remove a standalone leading or trailing word "in".
            # str.strip("in") strips the *characters* "i" and "n" instead,
            # which turned e.g. "Education" into "Educatio".
            return re.sub(r"^in(?:\s+|$)|\s+in$", "", text.strip()).strip()

        # Check whether the degree has specialisation pages underneath it.
        specialisations = response.xpath(
            "//h2[contains(text(),'Specialisations')]/following-sibling::*//a/@href"
        ).extract()
        if len(specialisations) > 0:
            for link in specialisations:
                if "http" in link:
                    url = link
                else:
                    url = "http://study.unisa.edu.au" + link
                yield scrapy.Request(url, callback=self.parse_data)
        else:
            item = get_item(ScrapyschoolAustralianBenItem)
            item['university'] = "University of South Australia"
            item['url'] = response.url
            print("===========================")
            print(response.url)
            item['degree_type'] = 1
            try:
                programme = response.xpath(
                    "//div[@class='title-row']/h1/text()").extract()
                clear_space(programme)
                item['degree_name'] = ''.join(programme).replace(
                    "(International)", "").strip()
                print("item['degree_name']: ", item['degree_name'])

                pro_re = re.findall(r"Bachelor", item['degree_name'])
                print("pre_re: ", pro_re)
                if len(pro_re) < 2:
                    # Programme name: the bracketed part of the degree name,
                    # unless that part is just "(Honours)" or is missing.
                    programme_re = re.findall(r"\(.+\)", item['degree_name'])
                    print("programme_re: ", programme_re)
                    if len(programme_re) > 0:
                        if ''.join(programme_re).strip() != "(Honours)":
                            item['programme_en'] = ''.join(
                                programme_re).replace("(", "").replace(
                                    ")", "").strip()
                        else:
                            item['programme_en'] = drop_in_word(
                                item['degree_name'].replace(
                                    "Bachelor of",
                                    "").replace("(Honours)", ""))
                    else:
                        item['programme_en'] = drop_in_word(
                            item['degree_name'].replace("Bachelor of", ""))
                    print("item['programme_en']: ", item['programme_en'])

                    start_date = response.xpath(
                        "//span[contains(text(), 'Start')]/../text()").extract(
                        )
                    clear_space(start_date)
                    item['start_date'] = getStartDateMonth(
                        ', '.join(start_date))
                    print("item['start_date']: ", item['start_date'])

                    location = response.xpath(
                        "//span[contains(text(),'Campus')]/../a//text()"
                    ).extract()
                    clear_space(location)
                    item['location'] = ''.join(location).strip()
                    print("item['location']: ", item['location'])

                    duration = response.xpath(
                        "//span[contains(text(),'Duration')]/../text()"
                    ).extract()
                    clear_space(duration)
                    item['duration'] = ''.join(duration).strip()
                    print("item['duration']: ", item['duration'])

                    tuition_fee = response.xpath(
                        "//span[contains(text(),'2019: AUD$')]//text()|"
                        "//span[contains(text(),'Fees')]/../text()").extract()
                    print("tuition_fee: ", tuition_fee)
                    clear_space(tuition_fee)
                    tuition_fee = getTuition_fee(''.join(tuition_fee))
                    item['tuition_fee'] = str(tuition_fee)
                    # A fee that stringifies to "0" is stored as None.
                    if item['tuition_fee'] == '0':
                        item['tuition_fee'] = None
                    print("item['tuition_fee']: ", item['tuition_fee'])

                    ielts = response.xpath(
                        "//span[contains(text(),'English Language Requirements')]/../ul//text()"
                    ).extract()
                    clear_space(ielts)
                    item['ielts_desc'] = ' '.join(ielts).strip()
                    print("item['ielts_desc']: ", item['ielts_desc'])

                    # Overall score: first number found in the description.
                    ieltlsrw = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
                    if len(ieltlsrw) > 0:
                        item["ielts"] = ieltlsrw[0]

                    # Band scores appear as "<skill> [<score>]" in the text.
                    for key, skill in (("ielts_l", "listening"),
                                       ("ielts_s", "speaking"),
                                       ("ielts_r", "reading"),
                                       ("ielts_w", "writing")):
                        found = re.findall(skill + r"\s\[.*?\]",
                                           item['ielts_desc'])
                        item[key] = ''.join(found).replace(skill, "").replace(
                            "[", "").replace("]", "").strip()
                    print(
                        "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                        % (item['ielts'], item['ielts_l'], item['ielts_s'],
                           item['ielts_r'], item['ielts_w']))

                    # HTML sections: each selector's matches are cleaned and
                    # stored under the paired item key.
                    html_sections = (
                        ('rntry_requirements_en',
                         "//div[@class='page-info-block-inner']//ul[@id='entry-requirements']"
                         ),
                        ('degree_overview_en',
                         "//h2[contains(text(),'Degree overview')]/../../.."),
                        ('overview_en',
                         "//h2[contains(text(),'Snapshot')]/..|"
                         "//h3[contains(text(),'Snapshot')]/.."),
                        ('modules_en',
                         "//h2[@class='theme-white'][contains(text(), 'Degree structure')]/../..|"
                         "//h3[contains(text(),'Degree structure')]/../.."),
                        ('career_en',
                         "//h2[contains(text(),'Your career')]/../../..|"
                         "//h3[contains(text(),'Your career')]/.."),
                        ('apply_desc_en',
                         "//h2[contains(text(),'How to apply')]/../../.."),
                    )
                    for key, selector in html_sections:
                        fragments = response.xpath(selector).extract()
                        item[key] = remove_class(
                            clear_lianxu_space(fragments))
                        print("item['%s']: " % key, item[key])

                    if "research" not in item['degree_name']:
                        yield item
            except Exception as e:
                # Best effort: log the failure with its URL and move on.
                with open("scrapySchool_Australian_ben/error/" +
                          item['university'] + str(item['degree_type']) +
                          ".txt",
                          'a',
                          encoding="utf-8") as f:
                    f.write(
                        str(e) + "\n" + response.url +
                        "\n========================\n")
                print("异常：", str(e))
                print("报错url：", response.url)

Пример #17

Показать файл

Файл: LaTrobeUniversity_U.py Проект: histudent/python_spider

    def parses(self, response):
        """Parse one La Trobe bachelor course page and yield course items.

        Fills a ``ScrapyschoolAustralianBenItem`` with the degree name,
        programme name, start date, duration, tuition fee, overview, entry
        requirements, career outcomes, application notes, location, IELTS
        requirements and module description found on ``response``.

        Nothing is yielded when the degree name mentions "Bachelor" two or
        more times (after dropping "(Honours)"). Otherwise:

        * if the page lists no majors, the item is yielded once;
        * if it lists majors, the item is yielded once per major, or once
          per entry returned by ``self.parse_major`` when the major links to
          its own page.

        Any exception raised while parsing is appended to a per-university
        file under ``scrapySchool_Australian_ben/error/`` and printed; it is
        not re-raised.

        :param response: page response exposing ``.url`` and ``.xpath()``.
        """
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = 'La Trobe University'
        item['url'] = response.url
        item['degree_type'] = 1
        print("================================================")
        print(response.url)
        try:
            # Degree name
            degree_name = response.xpath(
                '//h1[contains(text(),"Bachelor of")]/text()').extract()
            clear_space(degree_name)
            item['degree_name'] = ''.join(degree_name).strip()
            print("item['degree_name']: ", item['degree_name'])

            # Two or more "Bachelor" occurrences are not handled by this
            # callback, so no item is produced for them.
            pro_re = re.findall(r"Bachelor",
                                item['degree_name'].replace("(Honours)", ""))
            if len(pro_re) >= 2:
                return

            # Programme name: prefer the parenthesised part of the degree
            # name, otherwise the degree name without its "Bachelor of" prefix.
            programme_re = re.findall(
                r"\(.+\)",
                item['degree_name'].replace("(Advanced)", "").replace(
                    "(Honours)", "").strip())
            if programme_re:
                item['programme_en'] = ''.join(programme_re).replace(
                    "(", "").replace(")", "").strip()
            else:
                item['programme_en'] = item['degree_name'].replace(
                    "Bachelor of", "").replace("(Honours)", "").replace(
                        "Master of ", "").strip()
            print("item['programme_en']: ", item['programme_en'])

            start_date = response.xpath(
                '//div[contains(text(),"tart")]/following-sibling::div//text()'
            ).extract()
            item['start_date'] = getStartDateMonth(''.join(start_date))
            if item['start_date'] == "":
                # Fall back to the raw text when no month could be extracted.
                item['start_date'] = ''.join(start_date).strip()

            duration = response.xpath(
                '//div[contains(text(),"uration")]/following-sibling::div//text()'
            ).extract()
            item['duration'] = ''.join(duration).strip()

            fee = response.xpath(
                '//h3[contains(text(),"tuition fee")]/following-sibling::p[1]/text()'
            ).extract()
            tuition = ''.join(fee).strip().replace(' ', '')
            # Keep at most the first 99 characters of the fee text.
            item['tuition_fee'] = tuition[0:99]

            overview = response.xpath(
                '//section[@id="overview"]/div[@class="block"]').extract()
            item['degree_overview_en'] = remove_class(
                clear_lianxu_space(overview))

            rntry = response.xpath(
                '//section[@id="entry-requirements"]').extract()
            item['rntry_requirements_en'] = remove_class(
                clear_lianxu_space(rntry))

            career = response.xpath(
                '//section[@id="career-outcomes"]').extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))

            htp = response.xpath('//section[@id="how-to-apply"]').extract()
            item['apply_desc_en'] = remove_class(clear_lianxu_space(htp))

            # Campus codes taken from the first URL path segment, used only
            # when the page itself does not list a location.
            location_dict = {
                'BU': 'Melbourne',
                'BE': 'Bendigo',
                'CI': 'City',
                'MI': 'Mildura',
                'OT': 'Other',
                'FS': 'Franklin Street',
                'SH': 'Shepparton',
                'SY': 'Sydney',
                'ON': 'Online',
                'WO': 'Albury-Wodonga',
            }
            location = response.xpath(
                "//ul[@class='list-arrows']//li//text()").extract()
            item['location'] = ''.join(location).replace("(Bundoora)",
                                                         "").strip()
            if item['location'] == "":
                location_key = response.url.replace(
                    "https://www.latrobe.edu.au/courses/data/2019/international/",
                    "").strip()
                location_key = location_key.split("/")[0]
                item['location'] = location_dict.get(location_key.upper())

            ielts = response.xpath(
                '//p[contains(text(),"IELTS")]/text()').extract()
            item['ielts_desc'] = ''.join(ielts).strip()

            ielts = get_ielts(item['ielts_desc'])
            item['ielts'] = ielts.get('IELTS')
            item['ielts_l'] = ielts.get('IELTS_L')
            item['ielts_s'] = ielts.get('IELTS_S')
            item['ielts_r'] = ielts.get('IELTS_R')
            item['ielts_w'] = ielts.get('IELTS_W')

            modules_url = response.xpath(
                '//ul[@class="list-arrows"]/li[1]/a/@href').extract()
            clear_space(modules_url)
            if modules_url:
                # Best effort: a failure while fetching the modules page must
                # not abort the whole course, but only ordinary errors are
                # swallowed (not KeyboardInterrupt/SystemExit/GeneratorExit).
                try:
                    item['modules_en'] = self.parse_modules(modules_url[0])
                except Exception as modules_err:
                    print("modules_en error: ", str(modules_err))
                    item['modules_en'] = ""

            item[
                'apply_proces_en'] = "https://www.latrobe.edu.au/international/how-to-apply/undergraduate-and-postgraduate"

            item['overview_en'] = item['degree_overview_en']

            # XPath locations under which a course page may list its majors.
            major_prefixes = (
                '//p[contains(text(),"Melbourne majors")]/following-sibling::ul[1]/li',
                '//p[contains(text(),"Choose from five majors")]/following-sibling::ul[1]/li',
                '//th[contains(text(),"Minors")]/../preceding-sibling::*//td[contains(text(),"Yes")][1]/preceding-sibling::td',
                '//p[contains(text(),"disciplines:")]/following-sibling::ul[1]/li',
                '//p[contains(text(),"subjects and electives including")]/following-sibling::ul[1]/li',
            )
            programme_major = response.xpath(
                '|'.join(major_prefixes)).extract()
            print(len(programme_major))
            if not programme_major:
                yield item
                return

            # NOTE(review): the same ``item`` object is mutated and yielded
            # repeatedly below; if a downstream consumer keeps a reference
            # instead of processing each item immediately, later mutations
            # will show through - confirm this is safe for the pipelines used.
            for position, maj in enumerate(programme_major, start=1):
                print("***************************" + str(position) +
                      "****************************")
                major_name = remove_tags(maj)
                # Look for a link whose text contains the major name under
                # any of the known locations.
                # NOTE(review): a major name containing a double quote would
                # produce an invalid XPath expression here - verify inputs.
                link_xpath = '|'.join(
                    prefix + '//a[contains(text(),"' + major_name +
                    '")]/@href' for prefix in major_prefixes)
                programme_major1 = response.xpath(link_xpath).extract()
                if not programme_major1:
                    # No dedicated page: use the listed text as the name.
                    item['programme_en'] = major_name.replace(
                        "Yes", "").replace("*", "").strip()
                    print("不用跳转的item['programme_en']_major: ",
                          item['programme_en'])
                    yield item
                else:
                    programme_dict_list = self.parse_major(
                        programme_major1[0], major_name)
                    print("programme_dict_list: ", programme_dict_list)
                    for programme_dict in programme_dict_list:
                        item['programme_en'] = programme_dict.get(
                            'programme_en')
                        item['overview_en'] = programme_dict.get(
                            'overview_en')
                        print("跳转之后的链接item['programme_en']_major: ",
                              item['programme_en'])
                        print("跳转之后的链接item['overview_en']_major: ",
                              item['overview_en'])
                        yield item
        except Exception as e:
            # Record the failure next to the other per-university error logs
            # and carry on with the crawl.
            with open("scrapySchool_Australian_ben/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #18

Показать файл

Файл: EdithCowanUniversity_U.py Проект: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "Edith Cowan University"
        # item['country'] = 'Australia'
        # item['website'] = 'https://www.uts.edu.au'
        item['url'] = response.url
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        # 组合字典
        links = [
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-south-west",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-psychology-and-addiction-studies",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-psychology-and-counselling",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-psychology",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-psychology-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-psychology-criminology-and-justice",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-bachelor-of-commerce",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-bachelor-of-media-and-communication",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-bachelor-of-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-commerce-bachelor-of-arts-psychology",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-contemporary-arts",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-counselling",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-criminology-and-justice",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-criminology-and-justice-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-design",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-laws-bachelor-of-arts",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-laws-bachelor-of-criminology-and-justice",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-laws-bachelor-of-psychological-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-media-and-communication",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-psychological-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-psychology",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-psychology-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-social-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-social-science-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-social-work",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-social-work-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-youth-work",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-bachelor-of-commerce",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-commerce",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-commerce-professional",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-commerce-bachelor-of-arts-psychology",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-honours-bachelor-of-commerce",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-honours-bachelor-of-laws",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-hospitality-and-tourism-management",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-international-hotel-and-resort-management",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-laws-graduate-entry",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-laws",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-laws-bachelor-of-arts",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-laws-bachelor-of-commerce",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-laws-bachelor-of-criminology-and-justice",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-laws-bachelor-of-psychological-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-marketing-advertising-and-public-relations",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-exercise-and-sports-science-bachelor-of-commerce-sport-business",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-bachelor-of-commerce",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-sport-recreation-and-event-management",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-aviation",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-chemical-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-civil-and-environmental-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-civil-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-computer-systems-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-computer-systems-honours-bachelor-of-computer-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-electrical-and-renewable-energy-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-electrical-power-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-electronics-and-communications-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-instrumentation-control-and-automation-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-marine-and-offshore-engineering-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-mechanical-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-mechatronics-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-mechatronics-honours-bachelor-of-technology-motorsports",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-naval-architecture-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-ocean-engineering-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-petroleum-engineering-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-honours-bachelor-of-commerce",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-honours-bachelor-of-laws",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-honours-bachelor-of-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-bachelor-of-commerce",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-technology-aeronautical",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-technology-electronic-and-computer-systems",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-technology-engineering",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-technology-motorsports",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-health-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-health-science-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-medical-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-biomedical-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-exercise-and-sports-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-exercise-and-sports-science-bachelor-of-commerce-sport-business",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-exercise-science-and-rehabilitation",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-medical-science-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-occupational-therapy",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-occupational-therapy-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-paramedical-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-sports-science-and-football",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-sports-science-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-bachelor-of-commerce",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-speech-pathology",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-speech-pathology-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-nursing-studies",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-nursing",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-nursing-bachelor-of-science-midwifery",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-computer-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-computer-science-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-counter-terrorism-security-and-intelligence",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-computer-systems-honours-bachelor-of-computer-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-honours-bachelor-of-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-information-technology",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-information-technology-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-biological-sciences",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-conservation-and-wildlife-biology",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-cyber-security",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-environmental-management",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-marine-and-freshwater-biology",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-mathematics-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-physics-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-security",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-security-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-bachelor-of-commerce",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-sustainability",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-education-early-childhood-studies",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-education-primary",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-education-secondary",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-acting",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-arts-management",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-arts-management-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-dance",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-dance-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-music-theatre",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-education-secondary",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-music",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-music-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-performing-arts",
        ]
        programme_dict = {}
        programme_list = [
            "Bachelor of Arts",
            "Bachelor of Arts (South West)",
            "Bachelor of Arts (Psychology and Addiction Studies)",
            "Bachelor of Arts (Psychology and Counselling)",
            "Bachelor of Arts (Psychology)",
            "Bachelor of Arts (Psychology) Honours",
            "Bachelor of Arts (Psychology, Criminology and Justice)",
            "Bachelor of Arts Honours",
            "Bachelor of Arts/Bachelor of Commerce",
            "Bachelor of Arts/Bachelor of Media and Communication",
            "Bachelor of Arts/Bachelor of Science",
            "Bachelor of Commerce/Bachelor of Arts (Psychology)",
            "Bachelor of Contemporary Arts",
            "Bachelor of Counselling",
            "Bachelor of Criminology and Justice",
            "Bachelor of Criminology and Justice Honours",
            "Bachelor of Design",
            "Bachelor of Laws/Bachelor of Arts",
            "Bachelor of Laws/Bachelor of Criminology and Justice",
            "Bachelor of Laws/Bachelor of Psychological Science",
            "Bachelor of Media and Communication",
            "Bachelor of Psychological Science",
            "Bachelor of Science (Psychology)",
            "Bachelor of Science (Psychology) Honours",
            "Bachelor of Social Science",
            "Bachelor of Social Science Honours",
            "Bachelor of Social Work",
            "Bachelor of Social Work Honours",
            "Bachelor of Youth Work",
            "Bachelor of Arts/Bachelor of Commerce",
            "Bachelor of Commerce",
            "Bachelor of Commerce Professional",
            "Bachelor of Commerce/Bachelor of Arts (Psychology)",
            "Bachelor of Engineering Honours/Bachelor of Commerce",
            "Bachelor of Engineering Honours/Bachelor of Laws",
            "Bachelor of Hospitality and Tourism Management",
            "Bachelor of International Hotel and Resort Management",
            "Bachelor of Laws (Graduate Entry)",
            "Bachelor of Laws",
            "Bachelor of Laws/Bachelor of Arts",
            "Bachelor of Laws/Bachelor of Commerce",
            "Bachelor of Laws/Bachelor of Criminology and Justice",
            "Bachelor of Laws/Bachelor of Psychological Science",
            "Bachelor of Marketing, Advertising and Public Relations",
            "Bachelor of Science (Exercise and Sports Science)/Bachelor of Commerce (Sport Business)",
            "Bachelor of Science/Bachelor of Commerce",
            "Bachelor of Sport, Recreation and Event Management",
            "Bachelor of Aviation",
            "Bachelor of Engineering (Chemical) Honours",
            "Bachelor of Engineering (Civil and Environmental) Honours",
            "Bachelor of Engineering (Civil) Honours",
            "Bachelor of Engineering (Computer Systems) Honours",
            "Bachelor of Engineering (Computer Systems) Honours/Bachelor of Computer Science",
            "Bachelor of Engineering (Electrical and Renewable Energy) Honours",
            "Bachelor of Engineering (Electrical Power) Honours",
            "Bachelor of Engineering (Electronics and Communications) Honours",
            "Bachelor of Engineering (Instrumentation Control and Automation) Honours",
            "Bachelor of Engineering (Marine and Offshore Engineering) Honours",
            "Bachelor of Engineering (Mechanical) Honours",
            "Bachelor of Engineering (Mechatronics) Honours",
            "Bachelor of Engineering (Mechatronics) Honours/Bachelor of Technology (Motorsports)",
            "Bachelor of Engineering (Naval Architecture) Honours",
            "Bachelor of Engineering (Ocean Engineering) Honours",
            "Bachelor of Engineering (Petroleum Engineering) Honours",
            "Bachelor of Engineering Honours/Bachelor of Commerce",
            "Bachelor of Engineering Honours/Bachelor of Laws",
            "Bachelor of Engineering Honours/Bachelor of Science",
            "Bachelor of Engineering Science",
            "Bachelor of Science/Bachelor of Commerce",
            "Bachelor of Technology (Aeronautical)",
            "Bachelor of Technology (Electronic and Computer Systems)",
            "Bachelor of Technology (Engineering)",
            "Bachelor of Technology (Motorsports)",
            "Bachelor of Health Science",
            "Bachelor of Health Science Honours",
            "Bachelor of Medical Science",
            "Bachelor of Science (Biomedical Science)",
            "Bachelor of Science (Exercise and Sports Science)",
            "Bachelor of Science (Exercise and Sports Science)/Bachelor of Commerce (Sport Business)",
            "Bachelor of Science (Exercise Science and Rehabilitation)",
            "Bachelor of Science (Medical Science) Honours",
            "Bachelor of Science (Occupational Therapy)",
            "Bachelor of Science (Occupational Therapy) Honours",
            "Bachelor of Science (Paramedical Science)",
            "Bachelor of Science (Sports Science and Football)",
            "Bachelor of Science (Sports Science) Honours",
            "Bachelor of Science/Bachelor of Commerce",
            "Bachelor of Speech Pathology",
            "Bachelor of Speech Pathology Honours",
            "Bachelor of Science (Nursing Studies)",
            "Bachelor of Science (Nursing)",
            "Bachelor of Science (Nursing)/Bachelor of Science (Midwifery)",
            "Bachelor of Computer Science",
            "Bachelor of Computer Science Honours",
            "Bachelor of Counter Terrorism Security and Intelligence",
            "Bachelor of Engineering (Computer Systems) Honours/Bachelor of Computer Science",
            "Bachelor of Engineering Honours/Bachelor of Science",
            "Bachelor of Information Technology",
            "Bachelor of Information Technology Honours",
            "Bachelor of Science",
            "Bachelor of Science (Biological Sciences)",
            "Bachelor of Science (Conservation and Wildlife Biology)",
            "Bachelor of Science (Cyber Security)",
            "Bachelor of Science (Environmental Management)",
            "Bachelor of Science (Marine and Freshwater Biology)",
            "Bachelor of Science (Mathematics) Honours",
            "Bachelor of Science (Physics) Honours",
            "Bachelor of Science (Security)",
            "Bachelor of Science (Security) Honours",
            "Bachelor of Science Honours",
            "Bachelor of Science/Bachelor of Commerce",
            "Bachelor of Sustainability",
            "Bachelor of Education (Early Childhood Studies)",
            "Bachelor of Education (Primary)",
            "Bachelor of Education (Secondary)",
            "Bachelor of Arts (Acting)",
            "Bachelor of Arts (Arts Management)",
            "Bachelor of Arts (Arts Management) Honours",
            "Bachelor of Arts (Dance)",
            "Bachelor of Arts (Dance) Honours",
            "Bachelor of Arts (Music Theatre)",
            "Bachelor of Education (Secondary)",
            "Bachelor of Music",
            "Bachelor of Music Honours",
            "Bachelor of Performing Arts",
        ]
        for link in range(len(links)):
            url = links[link]
            programme_dict[url] = programme_list[link]
        item['major_type1'] = programme_dict.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            programme = response.xpath(
                "//h2[contains(text(), 'Bachelor of')]//text()").extract()
            clear_space(programme)
            programme = ''.join(programme).strip()
            item['degree_name'] = programme
            print("item['degree_name']: ", item['degree_name'])

            pro_re = re.findall(r"Bachelor", item['degree_name'])
            # print("pre_re: ", pro_re)
            if len(pro_re) == 1:
                programme_re = re.findall(
                    r"\(.+\)",
                    item['degree_name'].replace("Honours",
                                                "").replace("(Advanced)", ""))
                if len(programme_re) > 0:
                    item['programme_en'] = ''.join(programme_re).replace(
                        "(", "").replace(")", "").strip()
                else:
                    item['programme_en'] = item['degree_name'].replace(
                        "Bachelor of", "").strip()
                print("item['programme_en']: ", item['programme_en'])

                overview = response.xpath(
                    "//span[@id='overview']/..").extract()
                item['degree_overview_en'] = remove_class(
                    clear_lianxu_space(overview))
                # print("item['degree_overview_en']: ", item['degree_overview_en'])

                entry_requirements = response.xpath(
                    "//div[@id='before-you-start']").extract()
                entry_requirements_str = ''.join(entry_requirements).strip()
                item['rntry_requirements_en'] = remove_class(
                    clear_lianxu_space(entry_requirements))
                # print("item['rntry_requirements_en']: ", item['rntry_requirements_en'])

                modules = response.xpath(
                    "//h4[contains(text(),'Course structure')]|//div[@class='structure-heading']"
                ).extract()
                item['modules_en'] = remove_class(clear_lianxu_space(modules))
                # print("item['modules_en']: ", item['modules_en'])

                career = response.xpath(
                    "//h4[contains(text(),'Employment opportunities')]|//h4[contains(text(),'Employment opportunities')]/following-sibling::*[1]|"
                    "//h4[contains(text(),'Possible future job titles')]|//h4[contains(text(),'Possible future job titles')]/following-sibling::*[1]"
                ).extract()
                item['career_en'] = remove_class(clear_lianxu_space(career))
                if item['career_en'] == "":
                    print("***career_en 为空")
                print("item['career_en']: ", item['career_en'])

                location = response.xpath(
                    "//div[@class='courseOverview__info courseOverview__info--international courseOverview__info--noOnline']//div[@class='studyCampus__location studyCampus__location--joondalup studyCampus__location--active']/h4//text()|"
                    "//div[@class='courseOverview__info courseOverview__info--international courseOverview__info--noOnline']//div[@class='studyCampus__location studyCampus__location--mtLawley studyCampus__location--active']/h4//text()|"
                    "//div[@class='courseOverview__info courseOverview__info--international courseOverview__info--noOnline']//div[@class='studyCampus__location studyCampus__location--bunbury studyCampus__location--active']/h4//text()"
                ).extract()
                clear_space(location)
                location = ','.join(location).strip().strip().strip(
                    ',').strip()
                item['location'] = location
                location_tmp = item['location']
                print("item['location']: ", item['location'])

                duration = response.xpath(
                    "//div[@class='courseOverview__info courseOverview__info--international courseOverview__info--noOnline']//p[contains(text(),'year')]//text()|"
                    "//div[@class='courseOverview__info courseOverview__info--international']//p[contains(text(),'year')]//text()"
                ).extract()
                clear_space(duration)
                print("duration: ", duration)
                duration_re = re.findall(r"Start\sSemester.*",
                                         ''.join(duration).strip())
                print(duration_re, "===")
                item['start_date'] = ','.join(duration_re)
                item['duration'] = ''.join(duration).replace(
                    ''.join(duration_re), "").strip()
                print("item['duration']: ", item['duration'])

                other = response.xpath(
                    "//span[@class='courseOverview__subHeader alert-warning alert']//text()"
                ).extract()
                item['other'] = ''.join(other)
                print("item['other']: ", item['other'])

                # https://www.ecu.edu.au/future-students/course-entry/english-competency
                item[
                    'ielts_desc'] = "An overall band score of 6.0, with no individual band less than 6.0"
                item['ielts'] = "6.0"
                item['ielts_l'] = "6.0"
                item['ielts_s'] = "6.0"
                item['ielts_r'] = "6.0"
                item['ielts_w'] = "6.0"
                item[
                    'toefl_desc'] = "Minimum score of 70, with no individual score less than 17"
                item['toefl'] = "70"
                item['toefl_l'] = "17"
                item['toefl_s'] = "17"
                item['toefl_r'] = "17"
                item['toefl_w'] = "17"

                if "This course is not offered for study on-campus to international students with a student visa" not in item[
                        'other']:
                    major_list_url = response.xpath(
                        "//div[@class='section']//ul[@class='core-units']//a/@href"
                    ).extract()
                    clear_space(major_list_url)
                    print("major_list_url: ", major_list_url)
                    print(len(major_list_url))

                    if len(major_list_url) == 0:
                        item['url'] = response.url
                        print("item['url']2: ", item['url'])
                        yield item
                    else:
                        for major_url in major_list_url:
                            headers_base = {
                                'User-Agent':
                                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
                            }
                            data = requests.get(major_url,
                                                headers=headers_base)
                            response_major = etree.HTML(data.text)
                            item['url'] = major_url
                            print("item['url']_major: ", item['url'])

                            programme_major = response_major.xpath(
                                "//span[@id='overview']/following-sibling::h2//text()"
                            )
                            item['programme_en'] = ''.join(
                                programme_major).strip()
                            print("item['programme_en']_major: ",
                                  item['programme_en'])

                            location_major = response_major.xpath(
                                "//div[@class='studyCampus__location studyCampus__location--active']/h4//text()"
                            )
                            item['location'] = ','.join(
                                location_major).strip().strip(',').strip()
                            if item['location'] == "":
                                item['location'] = location_tmp
                            print("item['location']_major: ", item['location'])

                            overview_en = response_major.xpath(
                                "//span[@id='overview']/..")
                            overview_en_str = ""
                            if len(overview_en) > 0:
                                for o in overview_en:
                                    overview_en_str += etree.tostring(
                                        o, encoding='unicode', method='html')
                            item['overview_en'] = remove_class(
                                clear_lianxu_space([overview_en_str]))
                            print("item['overview_en']_major: ",
                                  item['overview_en'])

                            modules_en = response_major.xpath(
                                "//h4[contains(text(),'Structure')]|//h4[contains(text(),'Course structure')]|//div[@class='structure-heading']"
                            )
                            modules_en_str = ""
                            if len(modules_en) > 0:
                                for o in modules_en:
                                    modules_en_str += etree.tostring(
                                        o, encoding='unicode', method='html')
                            item['modules_en'] = remove_class(
                                clear_lianxu_space([modules_en_str]))
                            print("item['modules_en']_major: ",
                                  item['modules_en'])
                            yield item
        except Exception as e:
            with open("scrapySchool_Australian_yan/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #19

Показать файл

    def parse(self, response):
        """Build one item from a Federation University programme API response.

        ``response.body`` is decoded as JSON. Programmes whose title mentions
        "Bachelor" two or more times are skipped; for every other programme
        the item is filled from the JSON fields and yielded once.

        Optional text fields (outline, careers, career opportunities, entry
        requirements, English requirement, year levels) fall back to empty
        values when the API omits them, so a single missing field does not
        discard the whole record. A missing title or a missing
        ``international_details`` object still raises and is handled by the
        ``except`` branch.

        Any exception is appended, together with the response URL, to a
        per-university error file and echoed to stdout; nothing is yielded
        for that response.

        Args:
            response: response object exposing ``url`` and ``body``.

        Yields:
            The populated item, at most once per response.
        """

        def strip_entities(text, replacement):
            # Replace named character references such as "&nbsp;" in one pass.
            return re.sub(r"&\w+;", replacement, text)

        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "Federation University Australia"
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        # The API URL is rewritten to the address of the public course page.
        informationUrl = response.url.replace(
            "https://study.federation.edu.au/api/programs_plan_code",
            "https://study.federation.edu.au/#/course")
        print("------------", informationUrl)
        item['url'] = informationUrl
        try:
            informationDict = json.loads(response.body)
            print(informationDict)

            international_details = informationDict.get(
                "international_details")

            item['degree_name'] = informationDict.get("title")
            print("item['degree_name']: ", item['degree_name'])

            # Titles naming "Bachelor" twice or more are not emitted.
            if len(re.findall(r"Bachelor", item['degree_name'])) >= 2:
                return

            # Prefer the bracketed part of the title as the programme name,
            # unless the only bracketed part is the "(Honours)" marker.
            degree_name = item['degree_name']
            bracketed = ''.join(re.findall(r"\(.+\)", degree_name))
            if bracketed and bracketed != "(Honours)":
                item['programme_en'] = bracketed.replace("(", "").replace(
                    ")", "").strip()
            else:
                item['programme_en'] = degree_name.replace(
                    "Bachelor of", "").replace("(Honours)", "").strip()
            print("item['programme_en']: ", item['programme_en'])

            item['location'] = international_details.get("teaching_location")
            item['department'] = informationDict.get("school_dept")

            # Entities in the outline are removed outright; the other
            # fields below replace them with a space instead.
            overviewHtml = strip_entities(
                informationDict.get("outline") or "", "")
            item['degree_overview_en'] = remove_class(
                clear_lianxu_space([overviewHtml]))

            item['duration'] = international_details.get("duration")
            item['start_date'] = informationDict.get("commences")

            # Career names become one <p> each; entries without a name are
            # skipped instead of raising on "<p>" + None.
            career_names = "".join(
                "<p>" + entry.get("name") + "</p>"
                for entry in informationDict.get("careers") or []
                if entry.get("name") is not None)
            career_text = informationDict.get("career_opportunities") or ""
            if "<p>" in career_text:
                career_text = strip_entities(career_text, " ").replace(
                    "<br>", " ")
                career_text = remove_class(clear_lianxu_space([career_text]))
            item['career_en'] = career_names + career_text

            item['tuition_fee'] = international_details.get("annual_fee_int")

            entry_requirements = strip_entities(
                international_details.get("academic_entry_requirements")
                or "", " ")
            extra_requirements = international_details.get(
                "extra_requirements") or ""
            item['rntry_requirements_en'] = remove_class(
                clear_lianxu_space([entry_requirements])) + remove_class(
                    clear_lianxu_space([extra_requirements]))

            # The English requirement is an HTML fragment, so it is parsed
            # with the lenient HTML parser: the XML parser rejects common
            # HTML such as <br> or a bare "&".
            ielts_fragment = strip_entities(
                international_details.get("english_language_requirement")
                or "", " ")
            ielts_doc = etree.HTML('<!DOCTYPE html><html><body>' +
                                   ielts_fragment + '</body></html>')
            item['ielts_desc'] = ''.join(ielts_doc.xpath("//p//text()"))

            # 1 number: applies to every band; 2 numbers: overall + all
            # bands; 3 numbers: overall, listening/speaking, reading/writing.
            # Any other count leaves the IELTS fields untouched.
            scores = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
            if 1 <= len(scores) <= 3:
                overall = scores[0]
                listen_speak = scores[1] if len(scores) > 1 else overall
                read_write = scores[2] if len(scores) > 2 else listen_speak
                item["ielts"] = overall
                item["ielts_l"] = listen_speak
                item["ielts_s"] = listen_speak
                item["ielts_r"] = read_write
                item["ielts_w"] = read_write

            # Only the first major's year levels are used; their repr is
            # flattened by dropping list/dict brackets.
            domestic_details = informationDict.get("domestic_details") or {}
            program_structures = domestic_details.get(
                "program_structures") or {}
            majors = program_structures.get("majors") or []
            year_levels = (majors[0].get("year_levels") or []) if majors else []
            modulesStr = "".join(str(level) for level in year_levels)
            item['modules_en'] = "<div>" + modulesStr.translate(
                str.maketrans("", "", "[]{}")) + "</div>"
            print("item['modules_en']: ", item['modules_en'])

            item['apply_proces_en'] = informationDict.get("apply_link")
            yield item
        except Exception as e:
            # Best-effort boundary: record the failure and move on.
            with open("scrapySchool_Australian_ben/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #20

Показать файл

    def parse_data(self, informationDict, item):
        print("********执行了********")
        programme = informationDict.get("name")
        if programme is None:
            programme = ""
        degree_name = programme.strip()
        print("degree_name: ", degree_name)

        pro_re = re.findall(r"Bachelor", degree_name)
        # print("pre_re: ", pro_re)
        if len(pro_re) == 1:
            degree_name_re_str = degree_name.replace("(Honours)", "").replace(
                "(Birth to Five Years)", "")
            programme_major_re = re.findall(r"in\s.*", degree_name_re_str)
            if len(programme_major_re) > 0:
                programme_en = ''.join(programme_major_re).strip().strip(
                    "in").strip()
            else:
                programme_en = degree_name_re_str.replace("Bachelor of",
                                                          "").strip()
            print("programme_en: ", programme_en)

            department = informationDict.get("faculty")
            if department is None:
                department = ""
            department = department
            # print("department: ", department)

            duration = informationDict.get("handbook_detail_data").get(
                "Program")[0].get("CandidatureLength")
            # print("duration: ", duration)
            duration = ''.join(duration).replace("depending on RPL granted",
                                                 "").strip()
            # print("duration: ", duration)

            start_dateList = informationDict.get("handbook_detail_data").get(
                "Program")[0].get("LocationCommencements")
            # print("start_dateList: ", start_dateList)
            start_date = ""
            location_list = []
            if start_date is not None:
                for st in start_dateList:
                    scope = st.get("Scope")
                    if scope is not None:
                        if 'International' in scope:
                            location = st.get("Location")
                            commencing = st.get("Commencing")
                            if location is None:
                                location = ""
                            if commencing is None:
                                commencing = ""
                            start_date += location + ": " + commencing + ",  "
                            location_list.append(location)
            location_list = list(set(location_list))
            location = ', '.join(location_list).strip().strip(",").strip()
            # print("location: ", location)

            start_date = start_date.strip().strip(',').strip()
            # print("start_date: ", start_date)

            tuition_fee = informationDict.get("handbook_detail_data").get(
                "Program")[0].get("InternationalFees")
            # print("tuition_fee: ", tuition_fee)
            if tuition_fee is None:
                tuition_fee = ""
            else:
                tuition_fee = tuition_fee[0].get("Estimated annual fee")
            tuition_fee = tuition_fee.replace("AUD $", "").replace(",", "")
            # print("item['tuition_fee']: ", item['tuition_fee'])

            # 专业描述
            overview1 = informationDict.get("course_finder_data").get(
                "course_description").get("content").get("override")
            # print("overview1: ", overview1)
            if overview1 is not None:
                if "<p" in overview1:
                    delFu = re.findall(r"&\w+;", overview1)
                    # print(delFu)
                    if len(delFu) != 0:
                        for d in delFu:
                            overview1 = overview1.replace(d, " ")
                    # pageHtml = '<!DOCTYPE html><html><body>' + overview1 + '</body></html>'
                    # html = etree.fromstring(pageHtml)
                    # overview1 = html.xpath("//p//text()")
            elif overview1 is None:
                overview1 = ""
            # print("===overview1: ", overview1)
            overview2 = informationDict.get("course_finder_data").get(
                "key_features").get("content").get("override")
            # print("overview2: ", overview2)
            if overview2 is not None:
                delFu = re.findall(r"&\w+;", overview2)
                # print(delFu)
                if len(delFu) != 0:
                    for d in delFu:
                        overview2 = overview2.replace(d, " ")
                # pageHtml = '<!DOCTYPE html><html><body>' + overview2 + '</body></html>'
                # html = etree.fromstring(pageHtml)
                # overview2 = html.xpath("//li//text()")
                overview2 = "<h2>KEY FEATURES</h2>" + ''.join(overview2)
            elif overview2 is None:
                overview2 = ""
            # print("===overview2: ", overview2)
            overview3 = informationDict.get("course_finder_data").get(
                "accreditation_intro").get("content").get("override")
            if overview3 is not None:
                if "<p" in overview3:
                    delFu = re.findall(r"&\w+;", overview3)
                    # print(delFu)
                    if len(delFu) != 0:
                        for d in delFu:
                            overview3 = overview3.replace(d, " ")
                    # pageHtml = '<!DOCTYPE html><html><body>' + overview3 + '</body></html>'
                    # html = etree.fromstring(pageHtml)
                    # overview3 = html.xpath("//p//text()")
                    overview3 = "<h2>ACCREDITATION</h2>" + ''.join(overview3)
            elif overview3 is None:
                overview3 = ""
            # print("===overview3: ", overview3)
            overview4 = informationDict.get("course_finder_data").get(
                "suitable_for").get("content").get("override")
            if overview4 is not None:
                if "<p" in overview4:
                    delFu = re.findall(r"&\w+;", overview4)
                    # print(delFu)
                    if len(delFu) != 0:
                        for d in delFu:
                            overview4 = overview4.replace(d, " ")
                    # pageHtml = '<!DOCTYPE html><html><body>' + overview4 + '</body></html>'
                    # html = etree.fromstring(pageHtml)
                    # overview4 = html.xpath("//p//text()")
                    overview4 = "<h2>SUITABLE FOR</h2><div>" + ''.join(
                        overview4) + "</div>"
            elif overview4 is None:
                overview4 = ""

            # print("===overview4: ", overview4)
            overview_en = "<h2>Overview</h2>\n" + remove_class(clear_lianxu_space([overview1])) + remove_class(clear_lianxu_space([overview2])) \
                                  + remove_class(clear_lianxu_space([overview3])) + remove_class(clear_lianxu_space([overview4]))
            # print("item['overview_en']: ", item['overview_en'])

            # 就业方向
            career1 = informationDict.get("course_finder_data").get(
                "career_opportunities").get("content").get("default")
            # print("career1: ", career1)
            if career1 is None:
                career1 = ""
            # print("===career1: ", career1)
            career2 = informationDict.get("course_finder_data").get(
                "careers").get("content").get("override")
            # print("career2: ", career2)
            if career2 is not None:
                delFu = re.findall(r"&\w+;", career2)
                # print(delFu)
                if len(delFu) != 0:
                    for d in delFu:
                        career2 = career2.replace(d, " ")
                career2 = career2.replace("&", " ")
                # pageHtml = '<!DOCTYPE html><html><body>' + career2 + '</body></html>'
                # html = etree.fromstring(pageHtml)
                # c2 = html.xpath("//p//text()")
                # career2 = html.xpath("//li//text()")
                career2 = "<h2>PROFESSIONS</h2>" + career2
            elif career2 is None:
                career2 = ""
            # print("===career2: ", career2)
            career3 = informationDict.get("course_finder_data").get(
                "employer_types").get("content").get("override")
            if career3 is not None:
                delFu = re.findall(r"&\w+;", career3)
                # print(delFu)
                if len(delFu) != 0:
                    for d in delFu:
                        career3 = career3.replace(d, " ")
                career3 = career3.replace("&", " ")
                # pageHtml = '<!DOCTYPE html><html><body>' + career3 + '</body></html>'
                # html = etree.fromstring(pageHtml)
                # c3 = html.xpath("//p//text()")
                # career3 = html.xpath("//li//text()")
                career3 = "<h2>EMPLOYERS</h2>" + career3
            elif career3 is None:
                career3 = ""
            # print("===career3: ", career3)
            career_en = remove_class(clear_lianxu_space(
                [career1])) + remove_class(clear_lianxu_space(
                    [career2])) + remove_class(clear_lianxu_space([career3]))
            # print("item['career_en']: ", item['career_en'])

            modules1 = informationDict.get("course_finder_data").get(
                "course_structure_intro").get("content").get("default")
            if modules1 is not None:
                modules1 = "<h2>What You'll Study</h2>" + modules1
            else:
                modules1 = ""
            # print("===modules1: ", modules1)
            modules2 = informationDict.get("course_finder_data").get(
                "units_intro").get("content").get("default")
            if modules2 is not None:
                modules2 = "<h2>Units</h2>" + modules2
            else:
                modules2 = ""

            # print("===modules2: ", modules2)
            modules3 = informationDict.get("course_finder_data").get(
                "general_requirements").get("content").get("default")
            if modules3 is not None:
                modules3 = "<h2>GENERAL REQUIREMENTS</h2>" + modules3
            else:
                modules3 = ""

            # print("===modules3: ", modules3)
            modu4 = informationDict.get("handbook_detail_data").get("GenReqs")
            modules4 = ""
            if modu4 is not None:
                for m4 in modu4:
                    DegreeReq = m4.get("DegreeReq")
                    DegreeReqCP = m4.get("DegreeReqCP")
                    if DegreeReq is None:
                        DegreeReq = ""
                    if DegreeReqCP is None:
                        DegreeReqCP = ""
                    modules4 += "<li>" + DegreeReq + " - " + DegreeReqCP + "</li>\n"
                modules4 = "<ul>" + modules4 + "</ul>"

            # print("===modules4: ", modules4)
            modu5 = informationDict.get("course_finder_data").get("units").get(
                "content").get("override")
            modules5 = ""
            print('modu5=', modu5)
            if len(modu5) > 0:
                level1 = modu5.get("100 level")
                # print('level1=', level1)
                m5_1 = ""
                if level1 is not None:
                    for l1 in level1:
                        u1 = l1.get("units")
                        for u in u1:
                            name = u.get("name")
                            cp = u.get("cp")
                            if name is None:
                                name = u.get('code')
                                if name is None:
                                    name = ""
                            if cp is None:
                                cp = ""
                            m5_1 += "<li>" + name + " - " + cp + "</li>\n"
                        # print('m5_1=', m5_1)
                level2 = modu5.get("200 level")
                m5_2 = ""
                if level2 is not None:
                    for l1 in level2:
                        u1 = l1.get("units")
                        for u in u1:
                            name = u.get("name")
                            cp = u.get("cp")
                            if name is None:
                                name = u.get('code')
                                if name is None:
                                    name = ""
                            if cp is None:
                                cp = ""
                            m5_2 += "<li>" + name + " - " + cp + "</li>\n"
                level3 = modu5.get("300 level")
                m5_3 = ""
                if level3 is not None:
                    for l1 in level3:
                        u1 = l1.get("units")
                        for u in u1:
                            name = u.get("name")
                            cp = u.get("cp")
                            if name is None:
                                name = u.get('code')
                                if name is None:
                                    name = ""
                            if cp is None:
                                cp = ""
                            m5_3 += "<li>" + name + " - " + cp + "</li>\n"
                level4 = modu5.get("800 level")
                m5_4 = ""
                if level4 is not None:
                    for l1 in level4:
                        u1 = l1.get("units")
                        for u in u1:
                            name = u.get("name")
                            cp = u.get("cp")
                            if name is None:
                                name = u.get('code')
                                if name is None:
                                    name = ""
                            if cp is None:
                                cp = ""
                            m5_4 += "<li>" + name + " - " + cp + "</li>\n"
                level5 = modu5.get("600 level")
                m5_5 = ""
                if level5 is not None:
                    for l1 in level5:
                        u1 = l1.get("units")
                        for u in u1:
                            name = u.get("name")
                            cp = u.get("cp")
                            if name is None:
                                name = u.get('code')
                                if name is None:
                                    name = ""
                            if cp is None:
                                cp = ""
                            m5_5 += "<li>" + name + " - " + cp + "</li>\n"
                level6 = modu5.get("level700")
                m5_6 = ""
                if level6 is not None:
                    for l1 in level6:
                        u1 = l1.get("units")
                        for u in u1:
                            name = u.get("name")
                            cp = u.get("cp")
                            if name is None:
                                name = u.get('code')
                                if name is None:
                                    name = ""
                            if cp is None:
                                cp = ""
                            m5_6 += "<li>" + name + " - " + cp + "</li>\n"
                level7 = modu5.get("500 level")
                m5_7 = ""
                if level7 is not None:
                    for l1 in level7:
                        u1 = l1.get("units")
                        for u in u1:
                            name = u.get("name")
                            cp = u.get("cp")
                            if name is None:
                                name = u.get('code')
                                if name is None:
                                    name = ""
                            if cp is None:
                                cp = ""
                            m5_7 += "<li>" + name + " - " + cp + "</li>\n"
                modules5 = "<h2>SPECIFIC REQUIREMENTS<h2>\n<ul>" + m5_1 + m5_2 + m5_3 + m5_4 + m5_5 + m5_6 + m5_7 + "</ul>"
            print("===modules5: ", modules5)
            modules_en = remove_class(
                clear_lianxu_space([modules1])) + remove_class(
                    clear_lianxu_space([modules2])) + remove_class(
                        clear_lianxu_space([modules3])) + remove_class(
                            clear_lianxu_space([modules4])) + remove_class(
                                clear_lianxu_space([modules5]))
            item['modules_en'] = modules_en
            print("item['modules_en']: ", item['modules_en'])

            entry0 = informationDict.get("handbook_detail_data").get(
                "AdditionalMetaData").get("DegAtarInternational")
            if entry0 is None:
                entry0 = ""
            entry0 = "<h2>Entry Requirements</h2>\n" + entry0
            # print("===entry0: ", entry0)

            entry1 = informationDict.get("course_finder_data").get(
                "entry_req_desc").get("content").get("default")
            # print("overview1: ", overview1)
            if entry1 is not None:
                if "<p" in entry1:
                    delFu = re.findall(r"&\w+;", entry1)
                    # print(delFu)
                    if len(delFu) != 0:
                        for d in delFu:
                            entry1 = entry1.replace(d, " ")
                    # pageHtml = '<!DOCTYPE html><html><body>' + entry1 + '</body></html>'
                    # html = etree.fromstring(pageHtml)
                    # entry1 = html.xpath("//p//text()")
            elif entry1 is None:
                entry1 = ""
            # print("===entry1: ", entry1)
            entry11 = informationDict.get("handbook_detail_data").get(
                "AdditionalMetaData").get("AdmissionRequirement")
            if entry11 is None:
                entry11 = ""
            entry11 = "<h2>MINIMUM ADMISSION REQUIREMENT</h2>\n<p>" + entry11 + "</p>"
            # print("===entry11: ", entry11)
            entry2 = informationDict.get("course_finder_data").get(
                "assumed_knowledge").get("content").get("override")
            if entry2 is not None:
                if "<p" in entry2:
                    delFu = re.findall(r"&\w+;", entry2)
                    # print(delFu)
                    if len(delFu) != 0:
                        for d in delFu:
                            entry2 = entry2.replace(d, " ")
                    # pageHtml = '<!DOCTYPE html><html><body>' + entry2 + '</body></html>'
                    # html = etree.fromstring(pageHtml)
                    # entry2 = html.xpath("//p//text()")
                entry2 = "<h2>ASSUMED KNOWLEDGE</h2>\n<p>" + entry2 + "</p>"
            elif entry2 is None:
                entry2 = ""
            # print("===entry2: ", entry2)
            entry3 = informationDict.get("course_finder_data").get(
                "alt_entry").get("content").get("default")
            # print("overview1: ", overview1)
            if entry3 is not None:
                if "<p" in entry3:
                    delFu = re.findall(r"&\w+;", entry3)
                    # print(delFu)
                    if len(delFu) != 0:
                        for d in delFu:
                            entry3 = entry3.replace(d, " ")
                    # pageHtml = '<!DOCTYPE html><html><body>' + entry3 + '</body></html>'
                    # html = etree.fromstring(pageHtml)
                    # entry3 = html.xpath("//p//text()")
            elif entry3 is None:
                entry3 = ""
            # print("===entry3: ", entry3)
            rntry_requirements_en = remove_class(clear_lianxu_space([entry0])) + remove_class(clear_lianxu_space([entry1])) + remove_class(clear_lianxu_space([entry11]))+ \
                                            remove_class(clear_lianxu_space([entry2])) + remove_class(clear_lianxu_space([entry3]))
            # print("item['rntry_requirements_en']: ", item['rntry_requirements_en'])

            ielts = informationDict.get("handbook_detail_data").get(
                "AdditionalMetaData").get("EnglishProficiency")
            if ielts is None:
                ielts = ""
            ielts_desc = ielts.strip()
            # print("===item['ielts_desc']: ", item['ielts_desc'])

            how_to_apply1 = informationDict.get("course_finder_data").get(
                "how_to_apply").get("content").get("override")
            # print("overview1: ", overview1)
            if how_to_apply1 is not None:
                if "<p" in how_to_apply1:
                    delFu = re.findall(r"&\w+;", how_to_apply1)
                    # print(delFu)
                    if len(delFu) != 0:
                        for d in delFu:
                            how_to_apply1 = how_to_apply1.replace(d, " ")
                    # pageHtml = '<!DOCTYPE html><html><body>' + how_to_apply1 + '</body></html>'
                    # html = etree.fromstring(pageHtml)
                    # how_to_apply1 = html.xpath("//p//text()")
                    how_to_apply1 = "<h2>What you'll need to apply</h2>\n" + ''.join(
                        how_to_apply1)
            elif how_to_apply1 is None:
                how_to_apply1 = ""

            apply_desc_en = remove_class(clear_lianxu_space([how_to_apply1]))
            # print("item['apply_desc_en']: ", item['apply_desc_en'])

            programme_dict_all = {
                "degree_name": degree_name,
                "programme_en": programme_en,
                "department": department,
                "duration": duration,
                "location": location,
                "start_date": start_date,
                "tuition_fee": tuition_fee,
                "overview_en": overview_en,
                "career_en": career_en,
                "modules_en": modules_en,
                "rntry_requirements_en": rntry_requirements_en,
                "ielts_desc": ielts_desc,
                "apply_desc_en": apply_desc_en
            }
            return programme_dict_all

Пример #21

Показать файл

Файл: UniversityofTechnologySydney_U.py Проект: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "University of Technology Sydney"
        # item['country'] = 'Australia'
        # item['website'] = 'https://www.uts.edu.au'
        item['url'] = response.url
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            # 2019/03/21: the div's class value gained an extra space, so match it with contains()
            programme = response.xpath(
                '//div[@class="field-item"]/div[contains(@class,"page-title")]/h1//text()'
            ).extract()
            clear_space(programme)
            programme = ''.join(programme).strip()
            item['degree_name'] = programme
            print("item['degree_name']: ", item['degree_name'])

            de_p = re.findall(r"\(.+\)", item['degree_name'])
            if len(de_p) > 0:
                de_s = ''.join(de_p).strip()
                if de_s != "(Honours)":
                    item['programme_en'] = de_s.replace("(", "").replace(
                        ")", "").strip()
                else:
                    item['programme_en'] = item['degree_name'].replace(
                        "Bachelor of", "").replace("(Honours)", "").strip()
            else:
                item['programme_en'] = item['degree_name'].replace(
                    "Bachelor of", "").replace("(Honours)", "").strip()
            pro_re = re.findall(r"in\s.*", item['degree_name'])
            if len(pro_re) > 0:
                de_s = ''.join(pro_re).replace("in", "").strip()
                item['programme_en'] = de_s
            print("item['programme_en']: ", item['programme_en'])

            start_date = response.xpath(
                "//dt[contains(text(),'UAC')]/following-sibling::dd/span//text()"
            ).extract()
            clear_space(start_date)
            print(start_date)
            if len(start_date) > 0:
                start_date_re = re.findall(r"\w+\ssession",
                                           ' '.join(start_date))
                start_date_re = list(set(start_date_re))
                print("start_date_re: ", start_date_re)
                item['start_date'] = ','.join(start_date_re).replace(
                    "(", "").replace(")", "").replace(" session", "").strip()
            print("item['start_date']: ", item['start_date'])

            overview = response.xpath(
                '//div[@class="field field-dddd-view-modeluts-course-course__overview field-type-ds field-label-hidden"]'
            ).extract()
            item['degree_overview_en'] = remove_class(
                clear_lianxu_space(overview))
            # print("item['degree_overview_en']: ", item['degree_overview_en'])

            career = response.xpath(
                '//div[@class="field field-dddd-view-modeluts-course-course__careers field-type-ds field-label-hidden"]'
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            modules = response.xpath(
                "//div[@class='course__structure']").extract()
            if len(modules) == 0:
                print("*********")
                # modules = response.xpath("//div[@class='course__structure']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            print("item['modules_en']: ", item['modules_en'])

            location = response.xpath(
                '//div[@class="field field-dddd-view-modeluts-course-course__location field-type-ds field-label-hidden"]//p//text()'
            ).extract()
            clear_space(location)
            location = ''.join(location).strip()
            item['location'] = location
            # print("item['location']: ", item['location'])

            duration = response.xpath(
                '//div[@class="field field-dddd-view-modeluts-course-course__duration field-type-ds field-label-hidden"]//p//text()'
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            if len(duration) > 0:
                duration = duration[0]
            # print(duration)
            if "or" in duration:
                duration = duration.strip("or").strip()
            mode = re.findall("\w+\stime$", duration)
            # print(mode)
            mode = ''.join(mode)
            # item['mode'] = mode
            # print("item['mode']: ", item['mode'])
            item['duration'] = ''.join(duration.replace(mode, "").strip())
            # print("item['duration']: ", item['duration'])

            # feeDict = {'C04006v6': '15000', 'C04007v7': '15000', 'C04008v6': '15000', 'C04018v6': '19015', 'C04037v6': '17570', 'C04038v8': '18650', 'C04048v7': '18650', 'C04052v4': '19770', 'C04055v4': '15585', 'C04067v7': '18650', 'C04090v5': '17930', 'C04094v5': '17570', 'C04097v2': '17930', 'C04098v3': '17570', 'C04106v5': '16005', 'C04108v3': '14790', 'C04109v7': '14790', 'C04140v11': '16005', 'C04143v8': '20575', 'C04145v4': '20575', 'C04147v5': '22280', 'C04149v4': '21415', 'C04157v8': '19770', 'C04158v4': '19015', 'C04160v7': '20985', 'C04203v4': '14790', 'C04210v1': '16895', 'C04218v5': '19770', 'C04222v1': '19770', 'C04224v4': '20985', 'C04226v4': '17570', 'C04227v3': '17570', 'C04228v2': '16005', 'C04229v3': '17570', 'C04231v2': '15145', 'C04232v3': '15145', 'C04234v1': '19770', 'C04235v2': '17570', 'C04236v3': '22280', 'C04237v3': '18650', 'C04238v3': '18650', 'C04239v2': '14150', 'C04241v2': '18280', 'C04242v1': '20575', 'C04243v3': '17270', 'C04244v1': '13520', 'C04245v1': '14790', 'C04246v2': '16005', 'C04248v1': '16280', 'C04250v2': '22280', 'C04251v1': '20575', 'C04252v2': '19015', 'C04253v2': '19015', 'C04254v1': '14790', 'C04255v1': '12300', 'C04257v1': '11145', 'C04258v3': '18650', 'C04259v2': '18650', 'C04260v2': '18650', 'C04261v2': '18650', 'C04262v1': '14790', 'C04264v1': '22280', 'C04265v2': '18280', 'C04266v1': '17270', 'C04267v1': '18280', 'C04268v1': '13340', 'C04269v2': '13340', 'C04270v1': '17570', 'C04271v2': '17930', 'C04272v2': '17570', 'C04273v2': '17930', 'C04274v1': '17570', 'C04275v1': '17570', 'C04277v2': '17930', 'C04278v2': '17570', 'C04279v2': '16005', 'C04281v2': '18650', 'C04284v2': '14150', 'C04285v1': '15000', 'C04286v1': '18650', 'C04287v1': '18650', 'C04288v1': '15000', 'C04289v1': '18650', 'C04290v1': '15000', 'C04291v1': '14150', 'C04292v1': '16005', 'C04293v2': '17930', 'C04294v1': '15585', 'C04295v2': '19770', 'C04296v2': '19015', 'C04297v2': '19770', 'C04298v1': '14790', 'C04299v1': '18280', 'C04300v1': '18650', 
'C04301v1': '15000', 'C04302v1': '16005', 'C04303v1': '16005', 'C04304v3': '19015', 'C04305v1': '14415', 'C04306v1': '25070', 'C04307v1': '14415', 'C04309v2': '17930', 'C04314v1': '18650', 'C04315v1': '15585', 'C04316v2': '15000', 'C04317v1': '15000', 'C04319v1': '15000', 'C04320v1': '22280', 'C04321v1': '16565', 'C04322v1': '16005', 'C04323v1': '15585', 'C04324v2': '18650', 'C04325v2': '18650', 'C04368v1': '16895', 'C04369v1': '16895', 'C04371v1': '16895', 'C04372v1': '17930', 'C04373v1': '18650', 'C06006v5': '15000', 'C06009v8': '19015', 'C06017v7': '15000', 'C06033v4': '11145', 'C06037v4': '16005', 'C06041v6': '14790', 'C06058v7': '19770', 'C06096v3': '14415', 'C06097v1': '16895', 'C06099v1': '20575', 'C06100v2': '19015', 'C06101v1': '14790', 'C06102v1': '14790', 'C06103v1': '14790', 'C06104v1': '16565', 'C06105v1': '14790', 'C06106v1': '14790', 'C06107v1': '13340', 'C06108v1': '17930', 'C06109v1': '17570', 'C06110v1': '17570', 'C06113v1': '19770', 'C06114v2': '17930', 'C06115v2': '15000', 'C06116v1': '14415', 'C06118v2': '25070', 'C06119v1': '15585', 'C06121v1': '15585', 'C06122v1': '19015', 'C06123v1': '19770', 'C06124v1': '17930', 'C07002v7': '15000', 'C07004v4': '15000', 'C07012v7': '18650', 'C07018v5': '18650', 'C07019v6': '15000', 'C07021v8': '18650', 'C07027v8': '14150', 'C07028v9': '14150', 'C07029v7': '15000', 'C07044v4': '16005', 'C07048v7': '16005', 'C07073v5': '22280', 'C07074v5': '22280', 'C07075v4': '18280', 'C07078v3': '19015', 'C07080v7': '20985', 'C07107v3': '13520', 'C07112v4': '18650', 'C07113v3': '18650', 'C07118v1': '14790', 'C07119v1': '17270', 'C07120v2': '16895', 'C07122v1': '22280', 'C07124v1': '16005', 'C07125v1': '14790', 'C07126v1': '16005', 'C07128v1': '18650', 'C07129v1': '18650', 'C07132v1': '18650', 'C11001v5': '15000', 'C11005v5': '15000', 'C11008v7': '19015', 'C11015v8': '18650', 'C11017v5': '17570', 'C11021v5': '18650', 'C11027v5': '18650', 'C11039v4': '18650', 'C11048v3': '17930', 'C11051v3': '17570', 'C11054v2': '17570', 
'C11125v4': '20575', 'C11128v3': '16005', 'C11130v4': '20575', 'C11142v7': '19770', 'C11145v7': '20985', 'C11198v3': '18650', 'C11199v4': '18650', 'C11206v3': '18650', 'C11210v2': '17270', 'C11211v2': '22280', 'C11215v4': '11145', 'C11216v1': '18280', 'C11217v1': '20575', 'C11223v1': '14790', 'C11225v1': '17270', 'C11227v1': '16895', 'C11229v1': '20575', 'C11230v2': '19015', 'C11232v1': '18280', 'C11234v1': '17270', 'C11235v1': '13340', 'C11236v1': '17930', 'C11237v1': '17570', 'C11238v1': '17930', 'C11239v1': '17570', 'C11242v1': '16005', 'C11245v1': '15000', 'C11247v1': '19770', 'C11248v1': '17930', 'C11249v2': '15000', 'C11254v1': '14415', 'C11257v1': '15000', 'C11260v2': '25070', 'C11262v1': '16005', 'C11264v1': '22280', 'C11265v1': '20575', 'C11270v1': '15000', 'C11271v1': '15000', 'C11274v1': '17930', 'C01001v2': '12810', 'C01002v2': '12810', 'C01003v2': '12810', 'C01004v2': '12810', 'C01005v2': '12810', 'C02001v2': '13850', 'C02018v5': '17570', 'C02019v3': '12810', 'C02020v2': '12810', 'C02024v4': '16005', 'C02025v5': '13520', 'C02026v4': '13520', 'C02028v6': '15000', 'C02029v4': '16280', 'C02030v3': '18280', 'C02031v3': '18280', 'C02037v4': '12810', 'C02039v3': '13340', 'C02041v4': '12810', 'C02047v1': '16280', 'C02048v4': '16005', 'C02050v1': '12810', 'C02056v1': '15000', 'C02057v1': '16005', 'C02058v2': '16005', 'C02059v1': '15000', 'C02060v1': '15000', 'C02061v1': '16005', 'C02062v1': '16005', 'C02063v1': '15000', 'C03001v4': '13850', 'C03002v5': '13850', 'C03012v4': '13850', 'C03017v5': '17570', 'C03018v3': '12810', 'C03024v7': '15000', 'C03025v4': '16280', 'C03026v6': '18280', 'C03029v4': '18280', 'C03032v4': '12810', 'C03034v3': '13340', 'C03044v2': '12810', 'C03046v3': '16005', 'C03047v2': '12810', 'C03048v3': '16005', 'C03049v3': '16005', 'C03050v3': '16005', 'C03051v1': '16280', 'C03053v1': '15000', 'C03054v1': '15000', 'C03055v1': '16005', 'C03056v1': '15000', 'C03057v1': '15000', 'C03058v1': '16005', 'C03059v1': '15000'}
            feeDict = {}
            cod = [
                "C09004v6",
                "C09018v6",
                "C09019v4",
                "C09020v7",
                "C09022v3",
                "C09023v3",
                "C09026v3",
                "C09029v3",
                "C09031v3",
                "C09035v4",
                "C09046v2",
                "C09047v1",
                "C09048v2",
                "C09049v1",
                "C09050v1",
                "C09052v2",
                "C09055v2",
                "C09056v1",
                "C09057v2",
                "C09059v2",
                "C09060v1",
                "C09061v1",
                "C09063v2",
                "C09064v1",
                "C09066v2",
                "C09067v2",
                "C09068v2",
                "C09069v2",
                "C09070v2",
                "C09071v2",
                "C09072v2",
                "C09073v2",
                "C09074v2",
                "C09075v2",
                "C09076v2",
                "C09077v1",
                "C09078v1",
                "C09079v3",
                "C09081v1",
                "C09082v1",
                "C09083v2",
                "C09084v1",
                "C09085v2",
                "C09086v1",
                "C09087v2",
                "C09088v1",
                "C09089v2",
                "C09091v2",
                "C09093v2",
                "C09094v2",
                "C09095v2",
                "C09096v2",
                "C09097v1",
                "C09098v2",
                "C09099v1",
                "C09101v1",
                "C09119v1",
                "C09120v1",
                "C09121v1",
                "C10004v6",
                "C10007v8",
                "C10011v5",
                "C10020v4",
                "C10021v4",
                "C10026v4",
                "C10027v4",
                "C10039v10",
                "C10040v8",
                "C10044v7",
                "C10045v9",
                "C10054v5",
                "C10055v8",
                "C10056v5",
                "C10059v8",
                "C10061v6",
                "C10062v5",
                "C10063v6",
                "C10065v10",
                "C10066v7",
                "C10067v7",
                "C10068v9",
                "C10073v7",
                "C10074v6",
                "C10075v7",
                "C10076v7",
                "C10078v7",
                "C10079v6",
                "C10098v2",
                "C10115v9",
                "C10122v11",
                "C10123v7",
                "C10124v8",
                "C10125v9",
                "C10126v8",
                "C10129v6",
                "C10131v6",
                "C10132v4",
                "C10136v9",
                "C10137v4",
                "C10148v4",
                "C10152v4",
                "C10155v9",
                "C10157v6",
                "C10158v5",
                "C10162v6",
                "C10163v4",
                "C10164v7",
                "C10167v4",
                "C10169v5",
                "C10172v7",
                "C10174v5",
                "C10184v6",
                "C10186v9",
                "C10206v6",
                "C10208v5",
                "C10209v7",
                "C10214v3",
                "C10215v3",
                "C10219v4",
                "C10223v2",
                "C10224v3",
                "C10226v6",
                "C10227v4",
                "C10228v5",
                "C10229v4",
                "C10239v1",
                "C10242v2",
                "C10243v2",
                "C10244v2",
                "C10245v3",
                "C10246v1",
                "C10247v1",
                "C10248v1",
                "C10250v1",
                "C10251v1",
                "C10252v2",
                "C10253v2",
                "C10254v2",
                "C10255v1",
                "C10256v2",
                "C10257v2",
                "C10258v3",
                "C10259v3",
                "C10260v3",
                "C10261v3",
                "C10262v2",
                "C10263v3",
                "C10264v2",
                "C10265v4",
                "C10266v4",
                "C10269v2",
                "C10270v2",
                "C10271v4",
                "C10272v3",
                "C10273v2",
                "C10274v2",
                "C10275v1",
                "C10276v1",
                "C10277v1",
                "C10300v2",
                "C10301v2",
                "C10302v2",
                "C10303v2",
                "C10304v2",
                "C10305v2",
                "C10306v1",
                "C10307v1",
                "C10308v1",
                "C10309v1",
                "C10310v1",
                "C10311v1",
                "C10312v1",
                "C10313v1",
                "C10314v1",
                "C10315v1",
                "C10316v1",
                "C10317v1",
                "C10318v1",
                "C10319v1",
                "C10320v1",
                "C10321v2",
                "C10322v3",
                "C10323v2",
                "C10324v2",
                "C10325v3",
                "C10326v2",
                "C10327v2",
                "C10328v2",
                "C10330v2",
                "C10332v1",
                "C10333v1",
                "C10334v1",
                "C10335v1",
                "C10336v1",
                "C10337v1",
                "C10338v2",
                "C10339v1",
                "C10341v1",
                "C10342v2",
                "C10343v1",
                "C10345v1",
                "C10346v1",
                "C10347v2",
                "C10348v1",
                "C10349v3",
                "C10350v3",
                "C10351v2",
                "C10352v3",
                "C10353v2",
                "C10354v2",
                "C10355v3",
                "C10356v2",
                "C10359v2",
                "C10360v1",
                "C10361v1",
                "C10362v1",
                "C10363v1",
                "C10364v1",
                "C10365v1",
                "C10366v1",
                "C10367v1",
                "C10368v1",
                "C10369v1",
                "C10370v1",
                "C10371v1",
                "C10372v1",
                "C10373v2",
                "C10374v2",
                "C10375v2",
                "C10376v2",
                "C10377v2",
                "C10378v1",
                "C10379v1",
                "C10380v1",
                "C10381v1",
                "C10382v1",
                "C10383v1",
                "C10384v1",
                "C10385v1",
                "C10386v1",
                "C10387v1",
                "C10388v1",
                "C10389v2",
                "C10390v1",
                "C10391v1",
                "C10392v1",
                "C20049v1",
                "C20056v1",
                "C20059v1",
                "C20060v1 ",
            ]
            fee = [
                "18130",
                "18130",
                "20340",
                "18825",
                "18825",
                "18825",
                "18825",
                "18825",
                "18825",
                "18825",
                "18825",
                "16800",
                "18445",
                "15525",
                "18825",
                "18130",
                "18130",
                "18130",
                "15525",
                "18130",
                "18130",
                "18130",
                "15750",
                "18130",
                "21180",
                "21180",
                "19960",
                "19960",
                "19960",
                "19960",
                "19960",
                "19960",
                "19960",
                "19960",
                "19960",
                "18825",
                "18825",
                "18445",
                "17390",
                "15900",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "17735",
                "18130",
                "20340",
                "21180",
                "21180",
                "18445",
                "15750",
                "15525",
                "18130",
                "15525",
                "18130",
                "17390",
                "16800",
                "16800",
                "15525",
                "15525",
                "17090",
                "18130",
                "15750",
                "18130",
                "21180",
                "16800",
                "16800",
                "19960",
                "19960",
                "21180",
                "19960",
                "19960",
                "16800",
                "16800",
                "19960",
                "16800",
                "16800",
                "14195",
                "18825",
                "18130",
                "18130",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "17735",
                "21180",
                "17735",
                "20340",
                "20340",
                "17735",
                "17735",
                "17735",
                "18825",
                "18825",
                "17735",
                "18825",
                "18825",
                "18825",
                "18825",
                "18825",
                "17735",
                "15900",
                "15900",
                "15900",
                "15750",
                "15750",
                "20340",
                "18825",
                "17735",
                "18130",
                "18825",
                "18825",
                "20340",
                "20340",
                "18825",
                "18825",
                "18825",
                "21180",
                "17390",
                "17390",
                "15135",
                "15135",
                "15135",
                "17390",
                "17390",
                "15135",
                "14195",
                "15135",
                "15135",
                "20340",
                "20340",
                "20340",
                "20340",
                "17735",
                "20340",
                "16085",
                "18130",
                "18130",
                "18445",
                "18445",
                "18130",
                "18130",
                "18130",
                "18130",
                "18825",
                "19190",
                "19190",
                "15525",
                "15525",
                "15525",
                "15525",
                "18130",
                "18130",
                "18130",
                "18130",
                "18130",
                "18130",
                "15750",
                "15135",
                "15135",
                "20340",
                "15135",
                "15135",
                "20340",
                "15135",
                "15135",
                "20340",
                "15750",
                "18130",
                "18130",
                "18130",
                "18130",
                "18445",
                "18130",
                "20340",
                "15525",
                "18825",
                "17390",
                "17390",
                "17390",
                "17390",
                "17390",
                "17390",
                "21180",
                "16800",
                "18445",
                "17390",
                "17390",
                "20340",
                "18825",
                "19190",
                "18130",
                "15900",
                "15900",
                "18130",
                "19190",
                "18825",
                "18825",
                "17390",
                "18130",
                "16800",
                "15525",
                "19190",
                "19190",
                "16800",
                "16800",
                "19190",
                "19190",
                "16800",
                "16800",
                "16800",
                "16800",
                "16800",
                "16800",
                "19190",
                "16800",
                "16800",
                "19190",
                "16800",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "17735",
                "17735",
                "21180",
                "18825",
                "18825",
                "18825",
                "16085",
                "21180",
                "15900",
                "20755",
                "20755",
                "14855",
                "17090",
            ]
            for i in range(len(cod)):
                feeDict[cod[i]] = fee[i]
            # //div[@class='sidebar__info sidebar--info-codes']//dl/dd[1]
            feeIndex = response.xpath(
                "//div[@class='sidebar__info sidebar--info-codes']//dl/dd[1]//text()"
            ).extract()
            clear_space(feeIndex)
            print("---", feeIndex)
            v_re = re.findall(r"version\s\d", ''.join(feeIndex))
            print(v_re, "***")
            if feeIndex:
                feeIndexe = feeIndex[1] + ''.join(v_re).replace(
                    "version ", "v").strip()
                print('===', feeIndexe)
                item['tuition_fee'] = feeDict.get(feeIndexe)
            # feeIndex = ''.join(feeIndex)
            # print(feeIndex)
            # item['tuition_fee'] = feeDict.get(feeIndex)
            print("item['tuition_fee']: ", item['tuition_fee'])

            # //h4[@class='collapsible__title'][contains(text(),'Admission requirements')]/following-sibling::div[1]
            entry_requirements = response.xpath(
                "//h4[@class='collapsible__title'][contains(text(),'Admission requirements')]/following-sibling::div[1]"
            ).extract()
            entry_requirements_str = ''.join(entry_requirements).strip()
            item['rntry_requirements_en'] = remove_class(
                clear_lianxu_space(entry_requirements))
            print("item['rntry_requirements_en']: ",
                  item['rntry_requirements_en'])

            ieltsRe = re.findall(r"IELTS[0-9a-zA-Z:\.,\s]*;",
                                 entry_requirements_str)
            # print("ieltsRe: ", ieltsRe)
            toeflRe = re.findall(r"internet\sbased[0-9a-zA-Z:\.,\s-]*;",
                                 entry_requirements_str)
            # print("toeflRe: ", toeflRe)
            item['ielts_desc'] = ''.join(ieltsRe).strip()
            print("item['ielts_desc']: ", item['ielts_desc'])

            item['toefl_desc'] = ''.join(toeflRe).strip()
            print("item['toefl_desc']: ", item['toefl_desc'])

            # ieltsDict = {"Bachelor of Arts ": "7.5 overall,speaking 8.0,listening 8.0,reading 7.0,writing 7.0",
            #              "Bachelor of Education": "7.5 overall,speaking 8.0,listening 8.0,reading 7.0,writing 7.0",
            #              "Bachelor of Arts ": "7.5 overall,speaking 8.0,listening 8.0,reading 7.0,writing 7.0",
            #              "Bachelor of Education (Honours)": "7.5 overall,speaking 8.0,listening 8.0,reading 7.0,writing 7.0",
            #              "Bachelor of Education ": "7.5 overall,speaking 8.0,listening 8.0,reading 7.0,writing 7.0",
            #              "Bachelor of Arts in International Studies": "7.5 overall,speaking 8.0,listening 8.0,reading 7.0,writing 7.0",
            #              "Bachelor of Design (Honours) in Animation": "7.0 overall,writing 7.0",
            #              "Bachelor of Communication (Honours)": "7.0 overall,writing 7.0",
            #              "Bachelor of Education (Honours) in Primary Education": "7.0 overall,writing 7.0",
            #              "Bachelor of Nursing": "6.5 overall, writing 6.0",
            #              "Bachelor of Nursing ": "6.5 overall, writing 6.0",
            #              "Bachelor of Arts in International Studies": "6.5 overall, writing 6.0", }
            # if item['ielts_desc'] == "":
            #     item['ielts_desc'] = ieltsDict.get(item['degree_name'])
            #     if item['ielts_desc'] is None:
            #         item['ielts_desc'] = "6.5 overall, writing 6.0"
            # # print("item['ielts_desc']: ", item['ielts_desc'])
            #
            # toeflDict = {
            #     "Bachelor of Arts ": "102-109 overall, speaking 23-27, listening 23-27, reading 23-27, writing 24",
            #     "Bachelor of Education": "102-109 overall, speaking 23-27, listening 23-27, reading 23-27, writing 24 ",
            #     "Bachelor of Arts ": "102-109 overall, speaking 23-27, listening 23-27, reading 23-27, writing 24 ",
            #     "Bachelor of Education (Honours)": "102-109 overall, speaking 23-27, listening 23-27, reading 23-27, writing 24 ",
            #     "Bachelor of Education ": "102-109 overall, speaking 23-27, listening 23-27, reading 23-27, writing 24 ",
            #     "Bachelor of Arts in International Studies": "102-109 overall, speaking 23-27, listening 23-27, reading 23-27, writing 24 ",
            #     "Bachelor of Design (Honours) in Animation": "94-101 overall, writing 23  ",
            #     "Bachelor of Communication (Honours)": "94-101 overall, writing 23  ",
            #     "Bachelor of Education (Honours) in Primary Education": "94-101 overall, writing 23 ",
            #     "Bachelor of Nursing": "79-93 overall,writing 21 ",
            #     "Bachelor of Nursing ": "79-93 overall,writing 21 ",
            #     "Bachelor of Arts in International Studies": "79-93 overall,writing 21 ", }
            # if item['toefl_desc'] == "":
            #     item['toefl_desc'] = toeflDict.get(item['degree_name'])
            #     if item['toefl_desc'] is None:
            #         item['toefl_desc'] = "79-93 overall, writing 21"
            # # print("item['toefl_desc']: ", item['toefl_desc'])

            ielts_d = get_ielts(item['ielts_desc'])
            item["ielts"] = ielts_d.get('IELTS')
            item["ielts_l"] = ielts_d.get('IELTS_L')
            item["ielts_s"] = ielts_d.get('IELTS_S')
            item["ielts_r"] = ielts_d.get('IELTS_R')
            item["ielts_w"] = ielts_d.get('IELTS_W')
            print(
                "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                % (item['ielts'], item['ielts_l'], item['ielts_s'],
                   item['ielts_r'], item['ielts_w']))

            department = response.xpath(
                "//div[@class='field field-dddd-view-modeluts-course-course__part-of field-type-ds field-label-hidden']//div[@class='field-item']//p/a/text()"
            ).extract()
            clear_space(department)
            department = ''.join(department).replace("UTS:", "").strip()
            item['department'] = department
            print("item['department']: ", item['department'])

            apply_procces = response.xpath(
                "//h4[contains(text(),'International students')]/..").extract(
                )
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(apply_procces))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            yield item
        except Exception as e:
            with open("scrapySchool_Australian_yan/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #22

Показать файл

Файл: SouthernCrossUniversity_U.py Проект: histudent/python_spider

    def content(self, response):
        """Parse a Southern Cross University bachelor course page into items.

        Only pages whose title contains "Bachelor" fewer than two times and
        whose international duration text mentions "full" are processed;
        every other page yields nothing.

        For a processed page the item is yielded once when no specialisation
        headings are found, or once per specialisation (with
        ``programme_en`` / ``modules_en`` overwritten) when the number of
        headings equals the number of module blocks.  Pages located at
        "SCU Online", or with mismatched specialisation counts, yield
        nothing.

        Any exception is appended to a per-university error file and
        printed; it is not re-raised.
        """
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "Southern Cross University"
        item['url'] = response.url
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        try:
            programme = response.xpath(
                "//h1[@class='pageTitleFixSource']//text()").extract()
            clear_space(programme)
            item['degree_name'] = ''.join(programme)
            print("item['degree_name']: ", item['degree_name'])

            # Two or more "Bachelor" occurrences are skipped.
            pro_re = re.findall(r"Bachelor", item['degree_name'])
            if len(pro_re) < 2:
                duration = response.xpath(
                    "//div[@id='international']//td[contains(text(),'Duration')]/following-sibling::td//text()"
                ).extract()
                clear_space(duration)
                item['duration'] = ''.join(duration).strip()
                print("item['duration']: ", item['duration'])

                if "full" in item['duration'].lower():
                    # Honours markers are removed first so that a bracketed
                    # "(Honours)" is never mistaken for the specialisation.
                    plain_name = item['degree_name'].replace(
                        "(Honours)", "").replace("with Honours", "")
                    programme_re = re.findall(r"\(.+\)", plain_name)
                    if programme_re:
                        # Bracketed text is the programme name.
                        item['programme_en'] = ''.join(programme_re).replace(
                            "(", "").replace(")", "").strip()
                    else:
                        in_re = re.findall(r"in\s.*", plain_name)
                        if in_re:
                            # Drop only the leading word "in"; str.strip("in")
                            # would also eat trailing 'i'/'n' characters
                            # (e.g. "Education" -> "Educatio").
                            item['programme_en'] = re.sub(
                                r"^in\s*", "", ''.join(in_re).strip()).strip()
                        else:
                            item['programme_en'] = item['degree_name'].replace(
                                "Bachelor of", "").replace("with Honours",
                                                           "").strip()
                    print("item['programme_en']: ", item['programme_en'])

                    overview = response.xpath(
                        "//div[@class='summary']").extract()
                    item['degree_overview_en'] = remove_class(
                        clear_lianxu_space(overview))
                    item['overview_en'] = item['degree_overview_en']

                    career = response.xpath(
                        "//h3[contains(text(), 'Career opportunities')]/.."
                    ).extract()
                    item['career_en'] = remove_class(
                        clear_lianxu_space(career))

                    start_date = response.xpath(
                        "//h3[contains(text(),'International students studying in Australia')]/..//div[@class='accordion course-apply-accordion']//div/h5/span//text()"
                    ).extract()
                    print("start_date: ", start_date)
                    if start_date:
                        item['start_date'] = ','.join(start_date).strip()
                    print("item['start_date']: ", item['start_date'])

                    tuition_fee = response.xpath(
                        "//div[@id='international']//div[@class='table-grid table-responsive no-overflow']//tbody/tr/td[3]//text()"
                    ).extract()
                    clear_space(tuition_fee)
                    item['tuition_fee'] = '; '.join(tuition_fee).strip()
                    print("item['tuition_fee']: ", item['tuition_fee'])

                    IELTS = response.xpath(
                        "//tr[@class='data-label-Overall']/td[2]//text()|//tr[@class='data-label-Overall Score']/td[2]//text()|"
                        "//td[contains(text(),'Overall Score')]/following-sibling::td//text()"
                    ).extract()
                    clear_space(IELTS)
                    item['ielts'] = ','.join(IELTS).strip()
                    print("item['ielts']: ", item['ielts'])

                    IELTS_L = response.xpath(
                        "//tr[@class='data-label-Listening']/td[2]//text()"
                    ).extract()
                    clear_space(IELTS_L)
                    item['ielts_l'] = ','.join(IELTS_L).strip()

                    IELTS_S = response.xpath(
                        "//tr[@class='data-label-Speaking']/td[2]//text()"
                    ).extract()
                    clear_space(IELTS_S)
                    item['ielts_s'] = ','.join(IELTS_S).strip()

                    IELTS_R = response.xpath(
                        "//tr[@class='data-label-Reading']/td[2]//text()"
                    ).extract()
                    clear_space(IELTS_R)
                    item['ielts_r'] = ','.join(IELTS_R).strip()

                    IELTS_W = response.xpath(
                        "//tr[@class='data-label-Writing']/td[2]//text()"
                    ).extract()
                    # Clean the writing score itself (the tuition list was
                    # being cleaned here by mistake).
                    clear_space(IELTS_W)
                    item['ielts_w'] = ','.join(IELTS_W).strip()

                    modules = response.xpath(
                        "//div[@id='structure']").extract()
                    item['modules_en'] = remove_class(
                        clear_lianxu_space(modules))

                    rntry_requirements_en = response.xpath(
                        "//h2[contains(text(),'Admission requirements')]|//h2[contains(text(),'Admission requirements')]/following-sibling::div[1]"
                    ).extract()
                    item['rntry_requirements_en'] = remove_class(
                        clear_lianxu_space(rntry_requirements_en))

                    how_to_apply = response.xpath(
                        "//div[@id='apply']").extract()
                    item['apply_desc_en'] = remove_class(
                        clear_lianxu_space(how_to_apply))

                    location = response.xpath(
                        "//div[@id='international']//td[contains(text(),'Availability details')]/following-sibling::td//tbody/tr[position()<last()]/td[1]//text()"
                    ).extract()
                    clear_space(location)
                    item['location'] = ', '.join(location).strip()
                    print("item['location']: ", item['location'])

                    # Online-only courses are not emitted.
                    if item['location'] != "SCU Online":
                        major_list = response.xpath(
                            "//h3[contains(text(),'Specialisations')]/../../following-sibling::tr[@class='header-row text group-hdr']//h4//text()"
                        ).extract()
                        clear_space(major_list)
                        print("major_list: ", major_list)
                        print(len(major_list))

                        if len(major_list) == 0:
                            yield item
                        else:
                            modules_list = response.xpath(
                                "//h3[contains(text(),'Specialisations')]/../../following-sibling::tr[@class='header-row text group-hdr']//h4/following-sibling::div"
                            ).extract()
                            print("===", modules_list)
                            print(len(modules_list))
                            # Headings and module blocks are paired by
                            # position, so only emit when the counts agree.
                            if len(modules_list) == len(major_list):
                                # NOTE(review): the same item object is
                                # mutated and re-yielded for every
                                # specialisation - confirm downstream
                                # consumers handle each item before the next
                                # mutation.
                                for major, module_html in zip(
                                        major_list, modules_list):
                                    item['programme_en'] = major
                                    item['modules_en'] = remove_class(
                                        clear_lianxu_space([module_html]))
                                    print("item['programme_en']: ",
                                          item['programme_en'])
                                    yield item
        except Exception as e:
            # Best-effort logging: record the failure and move on.
            # NOTE(review): the directory is "..._yan" while other examples
            # use "..._ben" - confirm this path is intended.
            with open("scrapySchool_Australian_yan/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #23

Показать файл

    def parse_data(self, response):
        """Parse a University of Canberra course page and yield one item.

        The title (text before the first "-") becomes ``degree_name``.
        Pages whose degree name contains "Bachelor" two or more times, or
        contains "online" (case-insensitive), yield nothing.  Otherwise the
        programme name, location, faculty, IELTS figures, tuition fee and
        the overview / career / modules / admission HTML blocks are copied
        into the item, which is then yielded.

        Any exception is appended to a per-university error file and
        printed; it is not re-raised.
        """
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "University of Canberra"
        item['url'] = response.url
        item['degree_type'] = 1
        print("===========================")
        print(response.url)

        try:
            title_parts = response.xpath(
                "//h1[@class='course_title']//text()").extract()
            clear_space(title_parts)
            full_title = ''.join(title_parts).strip()
            # Everything from the first "-" onwards is cut from the title.
            title_suffix = ''.join(re.findall(r"-.*", full_title))
            item['degree_name'] = full_title.replace(title_suffix, '').strip()
            print("item['degree_name']: ", item['degree_name'])

            degree_name = item['degree_name']
            # Skip multi-"Bachelor" titles and online variants.
            if degree_name.count("Bachelor") >= 2:
                return
            if "online" in degree_name.lower():
                return

            # Bracketed text is the programme name; otherwise strip the
            # "Bachelor of" prefix from the degree name.
            bracketed = re.findall(r"\(.+\)", degree_name)
            if bracketed:
                programme_name = ''.join(bracketed).replace("(", "")
                programme_name = programme_name.replace(")", "").strip()
            else:
                programme_name = degree_name.replace("Bachelor of",
                                                     "").strip()
            item['programme_en'] = programme_name
            print("item['programme_en']: ", item['programme_en'])

            location_parts = response.xpath(
                "//th[contains(text(),'Location:')]/following-sibling::td//text()"
            ).extract()
            clear_space(location_parts)
            item['location'] = ''.join(location_parts).strip()

            faculty_parts = response.xpath(
                "//th[contains(text(),'Faculty:')]/following-sibling::td//text()"
            ).extract()
            clear_space(faculty_parts)
            item['department'] = ''.join(faculty_parts).strip()

            english_parts = response.xpath(
                "//th[contains(text(),'English Language Requirements:')]/following-sibling::td//text()"
            ).extract()
            item['ielts_desc'] = ''.join(english_parts).strip()

            # Which extracted number feeds each field depends on how many
            # numbers the description contains; other counts set nothing.
            scores = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
            ielts_fields = ("ielts", "ielts_l", "ielts_s", "ielts_r",
                            "ielts_w")
            score_positions = {
                1: (0, 0, 0, 0, 0),
                2: (0, 1, 1, 1, 1),
                5: (0, 1, 4, 2, 3),
            }
            positions = score_positions.get(len(scores))
            if positions is not None:
                for field, pos in zip(ielts_fields, positions):
                    item[field] = scores[pos]

            fee_parts = response.xpath(
                "//div[@id='fees']//tr[2]/td[3]//text()").extract()
            clear_space(fee_parts)
            fee_text = ''.join(fee_parts).strip()
            fee_digits = ''.join(re.findall(r"\d+,\d+", fee_text))
            item['tuition_fee'] = fee_digits.replace(",", "").strip()

            overview_html = response.xpath(
                "//h2[contains(text(),'Career opportunities')]/preceding-sibling::*"
            ).extract()
            if not overview_html:
                # Fall back to whatever precedes the collapsible section.
                overview_html = response.xpath(
                    "//div[@class='collapsible-section']/preceding-sibling::*"
                ).extract()
            cleaned_overview = remove_class(clear_lianxu_space(overview_html))
            item['overview_en'] = cleaned_overview
            item['degree_overview_en'] = cleaned_overview

            career_html = response.xpath(
                "//h2[contains(text(),'Career opportunities')]|//h2[contains(text(),'Career opportunities')]/following-sibling::*[1]|"
                "//strong[contains(text(),'Career opportunities')]/..|//strong[contains(text(),'Career opportunities')]/../following-sibling::*[position()<3]"
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_html))

            modules_html = response.xpath(
                "//h2[contains(text(),'Course Requirements')]|//div[@id='toggle-view']"
            ).extract()
            item['modules_en'] = remove_class(
                clear_lianxu_space(modules_html))

            admission_html = response.xpath(
                "//div[@id='admission']").extract()
            item['rntry_requirements_en'] = remove_class(
                clear_lianxu_space(admission_html))

            yield item
        except Exception as e:
            # Best-effort logging: record the failure and move on.
            error_path = ("scrapySchool_Australian_ben/error/" +
                          item['university'] + str(item['degree_type']) +
                          ".txt")
            with open(error_path, 'a', encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #24

Показать файл

Файл: RMITUniversity_U.py Проект: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "RMIT University"
        # item['country'] = 'Australia'
        # item['website'] = 'https://www.rmit.edu.au'
        item['url'] = response.url
        item['degree_type'] = 1
        item['major_type1'] = response.meta.get(response.url)
        print("===========================")
        print(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            programme = response.xpath(
                "//h1[@id='course-name']//text()|//h1[@class='highLight program-header']//text()"
            ).extract()
            clear_space(programme)
            item['degree_name'] = ''.join(programme).strip()
            if item['degree_name'] == "":
                print("***degree_name为空")
            print("item['degree_name']: ", item['degree_name'])

            pro_re = re.findall(r"Bachelor", item['degree_name'])
            # print("pre_re: ", pro_re)
            if len(pro_re) < 2:
                programme_re = re.findall(
                    r"\(.+\)", item['degree_name'].replace("(Honours)", ""))
                if len(programme_re) > 0:
                    item['programme_en'] = ''.join(programme_re).replace(
                        "(", "").replace(")", "").strip()
                else:
                    item['programme_en'] = item['degree_name'].replace(
                        "Bachelor of", "").strip()
                print("item['programme_en']: ", item['programme_en'])

                location = response.xpath(
                    "//span[@class='icon-location']/..//text()|"
                    "//h4[@class='description'][contains(text(),'Location')]/following-sibling::*//text()"
                ).extract()
                clear_space(location)
                item['location'] = ' '.join(location).strip()
                if item['location'] == "":
                    print("***location为空")
                print("item['location']: ", item['location'])

                duration = response.xpath(
                    "//div[@class='b-program-content links b-international']//span[@class='icon-clock']/..//text()|"
                    "//div[@class='b-program-content links b-international  ']//span[@class='icon-clock']/..//text()|"
                    "//div[contains(@class,'box b-international not-hide col-xs-12')]//h4[@class='description'][contains(text(),'Duration')]/following-sibling::*//text()"
                ).extract()
                clear_space(duration)
                item['duration'] = ''.join(duration).strip()
                # if item['duration'] == "":
                #     print("***duration为空")
                # print("item['duration']: ", item['duration'])

                tuition_fee = response.xpath(
                    "//div[contains(@class,'b-program-content links b-international')]//span[@class='icon-fees']/..//text()|"
                    "//div[contains(@class,'b-program-content links b-international  ')]//span[@class='icon-fees']/..//text()|"
                    "//div[contains(@class,'box b-international not-hide col-xs-12')]//h4[@class='description'][contains(text(),'Fees')]/following-sibling::*//text()"
                ).extract()
                clear_space(tuition_fee)
                tuition_fee = getTuition_fee(''.join(tuition_fee))
                item['tuition_fee'] = tuition_fee
                if item['tuition_fee'] == 0:
                    item['tuition_fee'] = None
                # print("item['tuition_fee']: ", item['tuition_fee'])

                start_date = response.xpath(
                    "//div[@class='b-program-content links b-international']//span[@class='icon-intake']/..//text()|"
                    "//div[@class='b-program-content links b-international  ']//span[@class='icon-intake']/..//text()|"
                    "//div[contains(@class,'box b-international not-hide col-xs-12')]//h4[@class='description'][contains(text(),'Next intake')]/following-sibling::*//text()|"
                    "//div[contains(@class,'box b-international not-hide col-xs-12')]//h4[@class='description'][contains(text(),'Next Intake')]/following-sibling::*//text()"
                ).extract()
                clear_space(start_date)
                item['start_date'] = getStartDateMonth(' '.join(start_date))
                if item['start_date'] == "":
                    print("***start_date 为空")
                print("item['start_date']: ", item['start_date'])

                overview = response.xpath(
                    "//div[@id='overview']/..|//div[@id='overview']/../following-sibling::div[1]|"
                    "//div[@id='Overview']/..|//div[@id='Overview']/../following-sibling::div[1]"
                ).extract()
                item['degree_overview_en'] = remove_class(
                    clear_lianxu_space(overview))

                modules_en_url = response.xpath(
                    "//table[@class='table  program-table']//td//a[contains(text(),'View plan')]/@href"
                ).extract()
                clear_space(modules_en_url)
                if len(modules_en_url) > 0:
                    url = "https://www.rmit.edu.au" + modules_en_url[0]
                    self.parse_modules1(url, item)
                else:
                    modules_en = response.xpath(
                        "//span[contains(text(),'Electives and program structure')]/../../../.."
                    ).extract()
                    item['modules_en'] = remove_class(
                        clear_lianxu_space(modules_en))

                if item['degree_overview_en'] == "":
                    overviewModulesUrl = response.url + "/program-details"
                    self.parse_overviewModules1(overviewModulesUrl, item)

                if item['degree_overview_en'] == "":
                    print("***degree_overview_en 为空")
                print("item['degree_overview_en']: ",
                      item['degree_overview_en'])
                if item['modules_en'] == "":
                    print("***modules_en 为空")
                print("item['modules_en']: ", item['modules_en'])

                career = response.xpath(
                    "//div[@id='career']|//div[@id='career']/../following-sibling::div[1]|"
                    "//div[@id=' career']|//div[@id=' career']/../following-sibling::div[1]|"
                    "//div[@id='Career']|//div[@id='Career']/../following-sibling::div[1]|"
                    "//div[@id=' Career']|//div[@id=' Career']/../following-sibling::div[1]"
                ).extract()
                item['career_en'] = remove_class(clear_lianxu_space(career))
                if item['career_en'] == "":
                    careerUrl = response.url + "/career"
                    self.parse_career1(careerUrl, item)
                if item['career_en'] == "":
                    print("***career_en 为空")
                print("item['career_en']: ", item['career_en'])

                rntry_requirements_en = response.xpath(
                    "//div[@id='admissions']/..|//div[@id='admissions']/../following-sibling::*[position()<last()-3]|"
                    "//div[@id='Admissions']/..|//div[@id='Admissions']/../following-sibling::*[position()<last()-3]"
                ).extract()
                item['rntry_requirements_en'] = remove_class(
                    clear_lianxu_space(rntry_requirements_en))
                if item['rntry_requirements_en'] == "":
                    entryUrl = response.url + "/entry-requirements"
                    self.parse_entryrequirements1(entryUrl, item)
                if item['rntry_requirements_en'] == "":
                    print("***rntry_requirements_en 为空")
                # print("item['rntry_requirements_en']: ", item['rntry_requirements_en'])

                ielts_desc = response.xpath(
                    "//li[contains(text(),'IELTS (Academic): ')]//text()"
                ).extract()
                item['ielts_desc'] += clear_lianxu_space(ielts_desc)
                # print("item['ielts_desc']: ", item['ielts_desc'])

                ielts_d = get_ielts(item['ielts_desc'])
                item["ielts"] = ielts_d.get('IELTS')
                item["ielts_l"] = ielts_d.get('IELTS_L')
                item["ielts_s"] = ielts_d.get('IELTS_S')
                item["ielts_r"] = ielts_d.get('IELTS_R')
                item["ielts_w"] = ielts_d.get('IELTS_W')
                # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

                toefl_desc = response.xpath(
                    "//*[contains(text(),'TOEFL (Internet Based Test - IBT): ')]//text()"
                ).extract()
                item['toefl_desc'] += clear_lianxu_space(toefl_desc)
                # print("item['toefl_desc']: ", item['toefl_desc'])

                ielts_d = get_toefl(item['toefl_desc'])
                item["toefl"] = ielts_d.get('TOEFL')
                item["toefl_l"] = ielts_d.get('TOEFL_L')
                item["toefl_s"] = ielts_d.get('TOEFL_S')
                item["toefl_r"] = ielts_d.get('TOEFL_R')
                item["toefl_w"] = ielts_d.get('TOEFL_W')
                # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
                #         item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))
                # programme = response.xpath("//div[@class='program-name']/h1/text()").extract()
                # ucascode = response.xpath("//html//div[@class='c-summary c-summary-2-col mb-lg-lg-lg clearfix']/div[1]/span[2]/text()").extract()
                # clear_space(ucascode)
                # # item['ucas_code'] = ''.join(ucascode)
                # # print("item['ucas_code']2: ", item['ucas_code'])
                #
                # duration = response.xpath(
                #     "//div[@data-duration][2]/span[2]/text()").extract()
                # clear_space(duration)
                # item['duration'] = ''.join(duration)
                # print("item['duration']2: ", item['duration'])
                #
                # start_date = response.xpath(
                #     "//div[@data-intake][2]/span[2]/text()").extract()
                # clear_space(start_date)
                # item['start_date'] = ''.join(start_date)
                # print("item['start_date']2: ", item['start_date'])
                #
                # location = response.xpath(
                #     "//div[@class='c-summary-cell not-hide']/span[2]//text()").extract()
                # clear_space(location)
                # item['location'] = ''.join(location)
                # print("item['location']2: ", item['location'])
                #
                # department = response.xpath(
                #     "//html//div[@class='c-summary c-summary-2-col mb-lg-lg-lg clearfix']/div[7]/span[2]/text()").extract()
                # clear_space(department)
                # item['department'] = ''.join(department)
                # print("item['department']2: ", item['department'])
                #
                # overview = response.xpath(
                #     "//html//div[@class='program-summary-section-overview mb-md-md-md']/div[position()<last()-1]").extract()
                # item['degree_overview_en'] = remove_class(clear_lianxu_space(overview))
                # print("item['degree_overview_en']2: ", item['degree_overview_en'])
                #
                #
                # # //html//div[@class='panel-group accordion']/div/div[4]
                # career = response.xpath(
                #     "//html//div[@class='panel-group accordion']/div/div[@class='panel panel-default Yes'][3]").extract()
                # if "Career outlook" not in career:
                #     career = response.xpath(
                #     "//html//div[@class='panel-group accordion']/div/div[@class='panel panel-default Yes'][4]").extract()
                # item['career_en'] = remove_class(clear_lianxu_space(career))
                # print("item['career_en']2: ", item['career_en'])
                #
                # modulesUrl = response.url + "/program-structure"
                # self.parse_modules2(modulesUrl, item)
                #
                # how_to_applyUrl = response.url + "/how-to-apply"
                # self.parse_how_to_apply2(how_to_applyUrl, item)
                #
                # entryUrl = response.url + "/entry-requirements"
                # self.entryrequirements2(entryUrl, item)
                #
                # feeUrl = response.url + "/fees"
                # self.fees2(feeUrl, item)

                item['apply_proces_en'] = remove_class(
                    clear_lianxu_space([
                        """<div class="share-heading hide">How to Apply</div>
  </div>
                </div>
			<div class="standard-content-article mb-lg-md-md clearfix">
				<div class="org-area-module-detail-view accordian ">
					<div class="row">
						<div class="col-xs-12 ">
							<div class="clearfix">
  <p class="lead">A step-by-step guide for international students on how to apply to study at RMIT.</p>
  <div class="lower-image-container"></div>
							</div>
							<!-- Parsys -->
							<!-- This Parsys will be used to Put all Main Body Components -->
<div class="floated-image-container pull-right">
<div class="detail-img-list not-hide image-square">
	<figure>
		<div class="c-detail-image c-detail-image-square">
			<img data-ri-xxs="/content/dam/rmit/rmit-images/life-at-rmit/Study-modes_EVE-800x800.jpg.transform/rendition-800x800/image.jpg" data-ri-sm="/content/dam/rmit/rmit-images/life-at-rmit/Study-modes_EVE-800x800.jpg.transform/rendition-640x640/image.jpg" class="c-responsive-image bg-cover offset-content">
		</div>
		<div class="c-detail-image c-detail-image-portrait">
			<img data-ri-xxs="/content/dam/rmit/rmit-images/life-at-rmit/Study-modes_EVE-800x800.jpg.transform/rendition-800x1068/image.jpg" data-ri-sm="/content/dam/rmit/rmit-images/life-at-rmit/Study-modes_EVE-800x800.jpg.transform/rendition-640x854/image.jpg" class="c-responsive-image bg-cover offset-content">
		</div>
	</figure>
</div>
</div>
<div>
    <div class="extended-desc not-hidden">
        <p>If you want to study for only one or two semesters, you can apply for a&nbsp;<a href="/content/rmit-ui/en/study-with-us/international-students/programs-for-international-students/study-abroad-and-exchange/study-abroad.html">study abroad program</a>&nbsp;or&nbsp;<a href="/content/rmit-ui/en/study-with-us/international-students/programs-for-international-students/study-abroad-and-exchange/student-exchange.html">student exchange</a>&nbsp;at RMIT.</p>
<h3>Applying for a research degree?</h3>
<p>If you want to apply for a research program, <a href="/content/rmit-ui/en/research/phds-and-other-research-degrees/how-to-apply.html">follow this process and apply here</a> instead.<br>
</p>
<h2>Step 1: Find a program</h2>
<p>Search for a program in your&nbsp;<a href="/content/rmit-ui/en/study-with-us.html">interest area</a>&nbsp;or browse by&nbsp;<a href="/content/rmit-ui/en/study-with-us/international-students/programs-for-international-students.html">level of study</a>. Some programs are not available in the July intake, in which case, you will need to apply for the next available intake.</p>
<p>You can also use the&nbsp;<a href="https://www.international.rmit.edu.au/info/programfees.asp" title="Programs, intakes and tuition fees database">Programs, intakes and tuition fees database</a>&nbsp;to search for programs.</p>
<h2>Step 2: Check the entry requirements</h2>
<p>Check that you qualify for the program's entry requirements including:</p>
<ul>
<li>English language requirements</li>
<li>academic entry requirement (see equivalent&nbsp;<a href="/content/rmit-ui/en/study-with-us/international-students/apply-to-rmit-international-students/entry-requirements/country-equivalency.html">entry requirements by country</a>)</li>
<li>pre-requisites</li>
<li>selection tasks.</li>
</ul>
<p>If you don’t meet the entry requirements for your preferred program, you can consider a range of programs that may provide&nbsp;<a href="/content/rmit-ui/en/study-with-us/international-students/apply-to-rmit-international-students/pathways-and-credit-transfer.html">pathways</a>&nbsp;to your preferred program.</p>
<p>If you are&nbsp;currently studying an Australian Year 12 (in Australia or overseas) or International Baccalaureate (in Australia or New Zealand) and applying &nbsp;for a&nbsp;Bachelor, Associate or Honours degree, you will need to apply via VTAC. You should <a href="http://www.rmit.edu.au/study-with-us/international-students/apply-to-rmit-international-students/how-to-apply/international-students-studying-vce-or-ib">check the VTAC entry requirements</a>.<br>
</p>
<h2>Step 3: Collect required documents</h2>
<p>To avoid delays in admission processing, submit&nbsp;a&nbsp;complete set of supporting documents&nbsp;including:</p>
<ul>
<li>passport</li>
<li>certified copies of academic transcripts&nbsp;&nbsp;(not required for current RMIT students applying to another RMIT program)</li>
<li>certified copies of all graduation certificates in both the original&nbsp;language and English&nbsp;&nbsp;(not required for current RMIT students applying to another RMIT program)</li>
<li>evidence of English language proficiency&nbsp;&nbsp;(not required for current RMIT students applying to another RMIT program)</li>
<li>any documentation relating to selection tasks (pre-selection kits,&nbsp;folios etc.)</li>
<li>CV, work reference letter, referee report etc if applicable</li>
</ul>
<p>&nbsp;Please note that documents submitted will not be returned.</p>
<h2>Step 4: Submit your application</h2>
<p>Submit your&nbsp;application online&nbsp;with&nbsp;<a href="/content/rmit-ui/en/study-with-us/international-students/apply-to-rmit-international-students/how-to-apply/documentation-required.html">all the required documents</a>.</p>
<h4>Students completing an Australian Year 12 (in Australia or overseas), or the International Baccalaureate (in Australia or New Zealand)</h4>
<ul>
<li>Apply for <strong>Higher Education</strong> programs (Bachelor, Associate Degree and Honours) through the Victorian Tertiary Admissions Centre (VTAC).<br>
<br>
<a href="http://www.vtac.edu.au/applying.html">Apply now via VTAC</a></li>
<li>Apply for <strong>Vocational Education</strong> programs (Foundation Studies, ELICOS, VCE, Certificate IV, Diploma and Advanced Diplomas) via iApply, the online application system for international students.<br>
<br>
<a href="https://iapply.rmit.edu.au/sitsvision/wrd/SIW_LGN">Apply now via iApply</a></li>
</ul>
<h4>Studying fully online<br>
</h4>
<ul>
<li>If your program is delivered fully online, use the online application system for local students and follow the local student application process. Note: fully online programs do not qualify for an Australian Student Visa.<br>
<br>
<a href="https://rmit.service-now.com/rmit-admissions/">Apply now via Admissions</a></li>
</ul>
<h4>All other international students<br>
</h4>
<ul>
<li>If you are applying for on-campus study in a coursework program use iApply, the online application system for international students.<br>
<br>
<a href="https://iapply.rmit.edu.au/sitsvision/wrd/SIW_LGN">Apply now via iApply</a></li>
</ul>
<h4>Application fee</h4>
<p>You will need to pay an application fee if you are from one of these&nbsp;<a href="/content/rmit-ui/en/study-with-us/international-students/apply-to-rmit-international-students/how-to-apply/application-fee.html">countries classified as high risk</a>.</p>
<h2>Need help?</h2>
<p>If you need assistance,&nbsp;<a href="https://connect.prospectivestudent.info/RMITInt?_ga=1.241036611.1742672422.1416265787">contact us</a>&nbsp;or one of&nbsp;<a href="https://www.international.rmit.edu.au/info/agentlist/">RMIT’s appointed representatives</a>&nbsp;(agents).</p>
<h2>Next steps:</h2>
<p>Your application will be assessed in line with RMIT’s policies and procedures. If you are successful, you will receive an offer letter. You can then&nbsp;<a href="/content/rmit-ui/en/study-with-us/international-students/apply-to-rmit-international-students/accept-your-offer.html">accept your offer</a>&nbsp;by following the instructions in your offer letter.&nbsp;</p>
<p>RMIT will normally advise you on the outcome of your application within 10 business days. If you are applying from Australia you should hear within 24 hours. If you don't hear back within the time frame above please <a href="https://rmit.au1.qualtrics.com/jfe/form/SV_0fbt3k9dEkNATZ3">contact Admissions Helpdesk</a>.</p>
<p>If you are applying via VTAC <a href="http://www.vtac.edu.au/dates.html">check the VTAC website</a> for important dates.</p>
    </div>
</div>
						</div>
					</div>"""
                    ]))
                item['apply_documents_en'] = remove_class(
                    clear_lianxu_space([
                        """<p>To avoid delays in admission processing, submit&nbsp;a&nbsp;complete set of supporting documents&nbsp;including:</p>
<ul>
<li>passport</li>
<li>certified copies of academic transcripts&nbsp;&nbsp;(not required for current RMIT students applying to another RMIT program)</li>
<li>certified copies of all graduation certificates in both the original&nbsp;language and English&nbsp;&nbsp;(not required for current RMIT students applying to another RMIT program)</li>
<li>evidence of English language proficiency&nbsp;&nbsp;(not required for current RMIT students applying to another RMIT program)</li>
<li>any documentation relating to selection tasks (pre-selection kits,&nbsp;folios etc.)</li>
<li>CV, work reference letter, referee report etc if applicable</li>
</ul>
<p>&nbsp;Please note that documents submitted will not be returned.</p>"""
                    ]))
                yield item
        except Exception as e:
            with open("scrapySchool_Australian_ben/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #25

Показать файл

Файл: TheUniversityOfMelbourne_U.py Проект: histudent/python_spider

    def parse_programme_message(self, response):
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "The University of Melbourne"
        # item['country'] = 'Australia'
        # item['website'] = 'http://www.unimelb.edu.au/'
        item['url'] = response.url
        item['degree_type'] = 1
        print("=========================")
        print(response.url)
        try:
            programme = response.xpath(
                "//div[@class='headline']/h1/text()|//h1[@id='page-header']//text()"
            ).extract()
            clear_space(programme)
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            # //div[@class='description']
            overview_en = response.xpath(
                "//h3[contains(text(),'Careers')]/preceding-sibling::*|"
                "//section[@id='course-overview']//div[@class='course-section__main course-section__main-with-aside']"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))
            print("item['overview_en']: ", item['overview_en'])

            career_en = response.xpath(
                "//h3[contains(text(),'Careers')]/preceding-sibling::*[1]/following-sibling::*"
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))
            # print("item['career_en']: ", item['career_en'])

            modules = response.xpath(
                "//div[@class='description']/following-sibling::*").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])
            modulesUrl = response.url + "what-will-i-study/"
            # print(modulesUrl)
            item['modules_en'] = self.parse_modules(modulesUrl)
            print("item['modules_en']: ", item['modules_en'])

            # https://futurestudents.unimelb.edu.au/admissions/entry-requirements/language-requirements/undergraduate-toefl-ielts
            item[
                'ielts_desc'] = "an overall band score of 6.5 or more in the Academic International English Language Testing System (IELTS), with no bands less than 6.0"
            item["ielts"] = '6.5'
            item["ielts_l"] = '6.0'
            item["ielts_s"] = '6.0'
            item["ielts_r"] = '6.0'
            item["ielts_w"] = '6.0'

            item['toefl_code'] = '0974'
            item["toefl"] = '79'
            item["toefl_l"] = '13'
            item["toefl_s"] = '18'
            item["toefl_r"] = '13'
            item["toefl_w"] = '21'

            # https://futurestudents.unimelb.edu.au/admissions/applications/ug-int
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<div class='main page-body' id='main-content' role='main'>
            <h1>International undergraduate</h1>
<div id="content_div_597198">
<p>Starting at university can be daunting, but applying for a place shouldn't be. Here's a guide to help you through the application process at Melbourne.</p><div class="col-1 first step-arrow"><h3>Step 1</h3></div><div class="col-5"><h2>Before you apply</h2><p>The first step is to figure out which course you want to study, and if you meet all of that course's entry requirements. At this stage, you should:</p><ul><li>Find <a href="http://coursesearch.unimelb.edu.au" target="_blank">the right course</a> for you, and make sure you meet the <a href="https://futurestudents.unimelb.edu.au/admissions/entry-requirements/undergraduate-international">entry requirements</a></li><li>If you're unsure, you may need to <a href="https://futurestudents.unimelb.edu.au/start-here">check if you're an international student</a></li><li>Check that you meet the <a href="https://futurestudents.unimelb.edu.au/admissions/entry-requirements/language-requirements">English language requirements</a></li><li>Make sure you're eligible for the appropriate <a href="http://www.services.unimelb.edu.au/international/visas/" target="_blank">visa to study in Australia</a></li><li>Find out what the <a href="https://futurestudents.unimelb.edu.au/admissions/fees/ug-intl">fees</a> for your course are and what <a href="https://scholarships.unimelb.edu.au">scholarships</a> are available to help you</li><li>Check to see if you're eligible for any of our <a href="https://futurestudents.unimelb.edu.au/admissions/high_achievers_programs">High Achievers' programs</a></li><li>Review our <a href="https://futurestudents.unimelb.edu.au/admissions/admission_with_credit">Admission with credit pages</a> if you are currently studying at another university or have completed post-secondary studies.</li><li>Look at the <a href="http://services.unimelb.edu.au/finaid/planning/cost_of_living" target="_blank">cost of living in Melbourne</a>, and what <a href="http://services.unimelb.edu.au/finaid" target="_blank">financial 
assistance</a> is available to you while you study.</li></ul><p>If you'd prefer to speak to someone in person about your application, contact a <a href="https://futurestudents.unimelb.edu.au/info/overseas-representatives">University of Melbourne representative</a> in your country.</p></div><hr /><div class="col-1 first step-arrow"><h3>Step 2</h3></div><div class="col-5"><h2>How to apply</h2><h3>Studying an Australian or NZ Year 12</h3><p>If you are studying any Australian or NZ year 12 program (including WACE/AUSMAT, SACE/SAM or NCEA) whether in Australia or an another country you should apply through the Victorian Tertiary Admissions Centre (VTAC). Full details about the VTAC application process can be found on the <a href="http://www.vtac.edu.au" target="_blank">VTAC website</a>. Through VTAC, you can list up to 8 course preferences. You should list courses in your order of preference with the course of greatest interest listed first. Be sure that your application is finalised by the <a href="http://www.vtac.edu.au/dates.html" target="_blank">VTAC due dates</a>.</p><p>If you have previously applied to the University of Melbourne via VTAC but were not offered a place or were offered but never enrolled, you should apply as a new student.</p><h3>Current University of Melbourne international students</h3><p>If you're an international student currently studying at the University of Melbourne and you wish to transfer to another University of Melbourne course, you must submit your transfer application via the <a href="http://www.vtac.edu.au/" target="_blank">Victorian Tertiary Admissions Centre (VTAC)</a>. 
Be sure that your application is finalised by the <a href="http://www.vtac.edu.au/dates.html" target="_blank">VTAC due dates</a>.</p><h3>All other international students</h3><p>You can <a href="https://futurestudents.unimelb.edu.au/admissions/applications/online-application-info">apply online</a> using our e-application.</p><p>Please ensure your application is received by us before the <a href="https://futurestudents.unimelb.edu.au/admissions/dates">relevant deadline</a>. An application fee of AUD$100 applies. This fee is non-refundable.</p><p>Alternatively, if you would prefer to apply in person in your own country, we have a number of overseas representatives in a variety of countries. Search our list of <a href="https://futurestudents.unimelb.edu.au/info/overseas-representatives">overseas representatives</a> to find one near you.</p><p>If you have accepted an offer from another institution in Australia, been granted a Confirmation of Enrolment (COE) and want to transfer to the University of Melbourne within the first six months of study you will need a letter of release from that institution. Please see our <a href="https://futurestudents.unimelb.edu.au/admissions/applications/other-applications/transferring-course/international_student_transfer_policy">International Student Transfer Policy for more details</a>.</p><p><strong>International Baccalaureate and US Advanced Placement students</strong></p><p>You can choose to have your IB or AP results sent directly to the University as soon as they are released. 
Please make sure you advise us in your application if you have authorised the release of your results.</p><p>University of Melbourne AP institution code: 9015<br /> University of Melbourne IB institution code: 002406</p><h3>Study abroad or exchange</h3><p>If you're interested in studying at Melbourne for a shorter period - one or two semesters - please refer to the <a href="http://www.mobility.unimelb.edu.au/inbound/index.html">Melbourne Global Mobility</a> site.</p></div><hr /><div class="col-1 first step-arrow"><h3><a name="accept" id="accept"></a>Step 3</h3></div><div class="col-5"><h2><a name="accept" id="accept"></a>After you apply - accepting your offer</h2><h3>Acknowledgement</h3><p>When you submit your e-application you will be automatically sent an acknowledgement email.&nbsp;&nbsp;The acknowledgement letter will include your unique student ID and application reference number. Please quote these numbers in all correspondence with the University.</p><p>We will begin by checking that your application contains everything we need to begin assessment.&nbsp;&nbsp;If anything is missing we will email you.</p><p>If your application is complete and we do not require any further information then your application will be assessed. This takes approximately two to four weeks for undergraduate courses.</p><h3>Offer process</h3><p>If your application is successful, your offer letter will be emailed directly to you (and copied to your nominated authorised representative, unless you have applied through VTAC). If the offer is conditional, then you need to meet the conditions of your offer before accepting the offer. 
If you have been sent an unconditional offer, you can choose to accept it immediately.</p><p>To accept your offer follow the instructions in your offer letter.</p><p>If you choose not to accept the offer right away, you can also:</p><ul><li>Consider <a href="http://students.unimelb.edu.au/get-started">deferring your offer</a></li><li>Ask to be considered for a different course than the one you originally applied for: <ul><li><strong>If you applied through VTAC,</strong> you may be able to <a href="http://www.cop.unimelb.edu.au">change your preferences</a>.</li><li>If you applied directly using our e-application, you can login to your user account and change your preference order and/or submit a new application.</li><li><strong>If you applied directly not using our e-application,</strong> you will need to submit your change of preference via email to International Admissions.</li></ul></li><li><a href="https://futurestudents.unimelb.edu.au/admissions/applications/non-acceptance" target="_blank">Decline your offer</a></li></ul><p>Unsuccessful applicants will receive a letter by mail (or fax to your nominated authorised representative) explaining why the application has been unsuccessful.</p><h3>Are you under 18?</h3><p>If you are an international student who will be under 18 years of age when entering Australia, you will need to confirm you have appropriate accommodation, support and general welfare arrangements in place before you can accept your offer. You will need to meet one of the three requirements below:</p><ul><li>Living with a parent</li><li>Living with a relative</li><li>Other approved care arrangement.</li></ul><p>You can also enrol in the <a href="http://services.unimelb.edu.au/international/under18/supervision-program" target="_blank">University of Melbourne Under 18 Supervision Program</a>. 
Find out more about <a href="http://services.unimelb.edu.au/international/under18" target="_blank">students under 18</a>.</p></div><hr /><div class="col-1 first step-arrow"><h3>Step 4</h3></div><div class="col-5"><h2><a name="prepare" id="prepare"></a>Preparing for study</h2><div class="col-2" style="float:right; margin-top:10px;"><a href="https://my.unimelb.edu.au" target="_blank"><img src="https://futurestudents.unimelb.edu.au/__data/assets/image/0004/1094539/Student-contact-details-notice.jpg" /></a></div><p>Once you've received and accepted your offer, it's time to get ready to move to Melbourne! You'll need to find a place to live, decide whether you need to work while you study and learn about life in your new city. Below are some helpful resources, including enrolment information, to make the transition easier for you.</p><h3>Visas</h3><p>If you haven't already got your visa to study in Australia, now is the time to do that. All citizens of countries other than New Zealand or Australia need a visa to study here. You should have received information about applying for a student visa with your offer of a place from the University.</p><ul><li><a href="http://services.unimelb.edu.au/international/visas/apply" target="_blank">Applying for a student visa</a></li><li><a href="//services.unimelb.edu.au/international/visas/conditions-and-validity" target="_blank">Student visa conditions</a></li><li><a href="http://services.unimelb.edu.au/international/visas/oshc" target="_blank">Overseas Student Health Cover (OSHC)</a></li></ul><h3>Organising your arrival</h3> Each semester, International Student Services organises pre-departure briefings in a number of countries. All commencing international students and their families are invited to attend the briefings prior to your arrival in Melbourne. This will help you understand more about what life in Melbourne will be like. 
<ul><li><a href="http://services.unimelb.edu.au/international/planning" target="_blank">Pre-departure briefings</a></li></ul><p>If you can't make it to a pre-departure briefing or there isn't one near you, don't worry. There is a lot of information <a href="http://services.unimelb.edu.au/international/planning" target="_blank">right here</a> that can help you find your way.</p><h3>Accommodation</h3><p>Finding a place to live can be complicated from a distance. Melbourne offers plenty of housing options. Some students choose to live in campus residences, some choose to stay with an Australian family, while most Australian students choose 'share housing', where a number of students live together close to the University.</p><ul><li>Find out more about <a href="https://futurestudents.unimelb.edu.au/explore/accommodation">Accommodation in Melbourne</a></li></ul><p>Need somewhere to stay until longer term housing is available? You can request <a href="https://services.unimelb.edu.au/housing/moving-to-melbourne/temporary-accommodation">temporary accommodation</a> before you arrive in Melbourne. There's also <a href="https://services.unimelb.edu.au/housing">longer term housing</a> available for all students including Study Abroad and Exchange students.</p><ul><li>Read more about <a href="https://services.unimelb.edu.au/housing/moving-to-melbourne">moving to Melbourne</a></li></ul><h3>Enrolment and orientation</h3><p>The first step in your new academic life is enrolling and attending orientation, designed to help ease your entry into campus life. 
To assist you with your move to Australian tertiary study the <a href="http://studentconnect.unimelb.edu.au" target="_blank">Student Connect website</a> has advice and information to help you understand all aspects of university life, including what happens at <a href="//orientation.unimelb.edu.au" target="_blank">enrolment and orientation</a>.</p><ul><li><a href="http://services.unimelb.edu.au/international/life-and-study" target="_blank">Getting used to a new country</a></li></ul><h3>Work while you study</h3><p>Some students choose to work while they&rsquo;re studying. Student visas allow you to work, however you must comply with the conditions on your visa. For more information, see <a href="http://services.unimelb.edu.au/international/visas/working-while-studying" target="_blank">Work while studying</a></p><h3>Fun while you study</h3> Being at university isn't all hard work. Life on campus can be great fun too! For more information on activities and events outside of classes, take a look at the following: <ul><li><a href="https://futurestudents.unimelb.edu.au/explore/student-experience">Life at Melbourne</a></li><li><a href="http://www.sport.unimelb.edu.au/Clubs" target="_blank">Sports clubs</a></li><li><a href="http://union.unimelb.edu.au/clubs" target="_blank">Clubs and societies</a></li></ul><h3>Leadership and Volunteering</h3><p>Challenge yourself, develop confidence, or enhance your leadership/team and interpersonal skills. Would you like to get involved in the community, connect with others at university and make new friends? How about gaining work experience, going on an adventure or just having fun? If your answer is yes, come and visit&nbsp;<a href="http://equity.unimelb.edu.au/initiatives">Equity and student engagement initiatives</a>.</p><h3>More services</h3><p>Check out all of our fantastic support services to help you out while you study. Our services include: Careers and Employment, Child Care Services, Counselling Service and many more. 
See <a href="http://services.unimelb.edu.au/" target="_blank">Services for Students</a> for more information.</p></div>
</div>
    </div>"""
                ]))

            degree_name_urls = response.xpath(
                "//span[@class='category']/a/@href|"
                "//div[@class='parent-courses']/a/@href").extract()
            print("degree_name_urls: ", degree_name_urls)
            # if len(degree_name_urls) > 0:
            for link in degree_name_urls:
                # degree_url = "https://coursesearch.unimelb.edu.au" + link
                degree_url = "https://study.unimelb.edu.au" + link
                # print("===", degree_url)
                self.parse_data(degree_url, item)
                yield item
                # print("***")
                # yield scrapy.Request(url, callback=self.parse_data, meta={"programme_en": programme, "overview_en": overview_en, "career_en": career_en, "modules_en": modules})
        except Exception as e:
            with open("scrapySchool_Australian_ben/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #26

Показать файл

Файл: TheUniversityOfMelbourne_U.py Проект: histudent/python_spider

    def parse_data(self, degree_url, item):
        """Download one degree page and copy its details into ``item``.

        The page at ``degree_url`` is fetched with ``requests`` and parsed
        with ``lxml``.  The degree name, department, duration, location,
        overview markup, entry requirements and tuition fee are written
        into ``item`` in place; the method itself returns nothing.

        :param degree_url: absolute URL of the degree page.
        :param item: mapping-like item that receives the scraped fields.
        """
        print("学位类型链接============" + degree_url + "===============")
        data = requests.get(degree_url, headers=self.headers)
        page = etree.HTML(data.text)

        # Degree title; two alternative XPath expressions are tried.
        title_parts = page.xpath(
            "//div[@class='headline']/h1/text()|//h1[@id='page-header']//text()"
        )
        item['degree_name'] = ''.join(title_parts).strip()
        print("item['degree_name']: ", item['degree_name'])

        # The department is the degree title minus "Bachelor of"; it stays
        # empty for titles that do not contain "Bachelor of ".
        degree_title = item['degree_name']
        if "Bachelor of " in degree_title:
            item['department'] = degree_title.replace("Bachelor of", "").strip()
        else:
            item['department'] = ''
        print("item['department']: ", item['department'])

        duration_texts = page.xpath(
            "//div[@class='course-length icn icn-duration']/text()|//li[contains(text(),'full time')]//text()"
        )
        # The return value of clear_space is not used; the list is passed on
        # as-is afterwards.
        clear_space(duration_texts)
        print("duration:", duration_texts)
        parsed_duration = getIntDuration(''.join(duration_texts))
        # Only a two-element result is stored; otherwise the item keeps
        # whatever duration values it already had.
        if len(parsed_duration) == 2:
            item['duration'], item['duration_per'] = (parsed_duration[0],
                                                      parsed_duration[-1])
        print("item['duration']: ", item['duration'])
        print("item['duration_per']: ", item['duration_per'])

        location_texts = page.xpath(
            "//div[@class='course-location icn icn-location']//text()|//li[contains(text(),'campus')]//text()"
        )
        item['location'] = ''.join(location_texts).strip()
        print("item['location']: ", item['location'])

        # Serialise every matching overview node to HTML and concatenate.
        overview_nodes = page.xpath(
            "//div[@class='primary']//div[@class='description']|"
            "//section[@id='course-overview']//div[@class='course-section__main course-section__main-with-aside']"
        )
        overview_html = ''.join(
            etree.tostring(node, encoding='unicode', method='html')
            for node in overview_nodes)
        item['degree_overview_en'] = remove_class(
            clear_lianxu_space([overview_html]))
        print("item['degree_overview_en']: ", item['degree_overview_en'])

        # Entry requirements and fees come from a second request made by
        # parse_rntry_tuition_fee; index 0 is used as the requirements and
        # index 1 is handed to getTuition_fee.
        inline_url = data.url + ".inline?profile_citizenship=international&profile_qualification=76&profile_year=2019"
        requirements_and_fee = self.parse_rntry_tuition_fee(inline_url)
        item['rntry_requirements_en'] = requirements_and_fee[0]
        print("item['rntry_requirements_en']: ", item['rntry_requirements_en'])

        fee = getTuition_fee(requirements_and_fee[1])
        if fee == 0:
            # A zero fee is stored as None and no currency prefix is set.
            item['tuition_fee'] = None
        else:
            item['tuition_fee'] = fee
            item['tuition_fee_pre'] = "AUD$"
        print("item['tuition_fee']: ", item['tuition_fee'])

Пример #27

Показать файл

    def parse(self, response):
        """Scrape a UNSW listing page and yield one item per degree found.

        Several parallel lists (degree names, durations, start dates, fees
        and career ``<dl>`` elements) are extracted from ``response`` and
        combined by position.  The same ``item`` object is updated and
        yielded for every degree whose name does not contain "Graduate".

        Any exception raised while scraping (including an ``IndexError``
        when the parallel lists differ in length) is appended to an error
        log file and printed; it is not re-raised.

        :param response: response object exposing ``url`` and ``xpath()``.
        """
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "The University of New South Wales"
        item['url'] = response.url
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        try:
            department = response.xpath(
                "//div[@class='inlinevideo-inner']//div[@class='contentarea-title']/h3//text()"
            ).extract()
            item['department'] = ''.join(department).strip()
            print("item['department']: ", item['department'])

            # List of degree names; it drives the loop below.
            degree_type = response.xpath(
                "//section//div[@class='degree js-degree']//h5//text()"
            ).extract()
            clear_space(degree_type)
            print(len(degree_type))
            print("degree_type: ", degree_type)

            duration = response.xpath(
                "//section//dt[contains(text(), 'Minimum years')]/following-sibling::dd[1]//text()"
            ).extract()
            clear_space(duration)
            print(len(duration))
            print("duration: ", duration)

            start_date = response.xpath(
                "//section//dt[contains(text(), 'Entry')]/following-sibling::dd[1]//text()"
            ).extract()
            clear_space(start_date)
            print(len(start_date))
            print("start_date: ", start_date)

            tuition_fee = response.xpath(
                "//section//dt[contains(text(), 'Estimated first year tuition')]/following-sibling::*[1]//text()"
            ).extract()
            clear_space(tuition_fee)
            print(len(tuition_fee))
            print("tuition_fee: ", tuition_fee)

            careerEle = response.xpath("//section//dl[last()]")
            print(len(careerEle))
            print("careerEle: ", careerEle)

            # The other lists are indexed by the degree's position on
            # purpose: a length mismatch raises IndexError and is logged by
            # the handler below instead of being silently truncated.
            for i, degree_name in enumerate(degree_type):
                print("-------------------" + str(i) + "-----------------")
                item['degree_name'] = degree_name
                print("item['degree_name']: ", item['degree_name'])

                # Course length
                item['duration'] = duration[i]
                print("item['duration']: ", item['duration'])

                # Start dates: the raw text is split on "and" and each part
                # is normalised by getStartDate.
                # NOTE(review): replace("0", "") removes every "0", so a
                # value such as "10" would become "1" -- confirm the format
                # getStartDate returns.
                start_date_str = "".join(
                    getStartDate(part).replace("0", "") + ","
                    for part in start_date[i].split("and"))
                item['start_date'] = start_date_str.strip().strip(',').strip()
                print("item['start_date']: ", item['start_date'])

                # Tuition fee with the currency prefix and thousands
                # separators removed.
                item['tuition_fee'] = tuition_fee[i].replace(
                    "AUD $", "").replace(",", "").strip()
                print("item['tuition_fee']: ", item['tuition_fee'])

                # "Career Opportunities" heading plus its first following
                # <dd>, taken from this degree's <dl>.
                careerRe = careerEle[i].xpath(
                    ".//dt[contains(text(), 'Career Opportunities')]|.//dt[contains(text(), 'Career Opportunities')]/following-sibling::dd[1]"
                ).extract()
                item['career_en'] = remove_class(clear_lianxu_space(careerRe))
                print("item['career_en']: ", item['career_en'])

                if "Graduate" not in item['degree_name']:
                    yield item
        except Exception as e:
            # Append rather than truncate, so that one failing URL does not
            # erase the records of earlier failures.
            # NOTE(review): the directory name ends in "_yan" while the other
            # spiders in this file log to "scrapySchool_Australian_ben/error/"
            # -- confirm this path exists for this project.
            with open(".//scrapySchool_Australian_yan/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #28

Показать файл

    def parse_data(self, response):
        """Scrape one Central Queensland University course page.

        A fresh item is filled from XPath queries on ``response`` (title,
        department, duration, start date, career, overviews, modules, entry
        requirements, IELTS/TOEFL, tuition fee, application details).

        The item is yielded only when ``item['degree_name']`` contains
        neither "/" nor "Diploma": once per entry of the majors table (with
        ``item['programme_en']`` overwritten each time), or once if that
        table yields nothing.  The same ``item`` object is yielded every
        time.

        Any exception raised inside the ``try`` block is appended to an
        error log file and printed; it is not re-raised.

        :param response: response object exposing ``url``, ``meta`` and
            ``xpath()``.
        """
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "Central Queensland University"
        # item['country'] = 'Australia'
        # item['website'] = 'https://www.cqu.edu.au'
        item['url'] = response.url
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        # The value is looked up in response.meta under the page's own URL.
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            programme = response.xpath("//h1[@class='program-title']/text()|"
                                       "//h1[@itemprop='name']//text()").extract()
            clear_space(programme)
            # Split the title on "-"; everything except the last segment is
            # used below (programme[:-1]).  A title without "-" therefore
            # produces an empty string there.
            programme = ''.join(programme).split("-")
            # print("programme: ", programme)
            programme_en = response.xpath("//th[contains(text(),'Majors')]/following-sibling::td//text()").extract()
            clear_space(programme_en)
            print("programme_en: ", programme_en)
            # Fall back to the title (minus "Bachelor of") when the page has
            # no "Majors" cell text.
            if len(programme_en) == 0:
                item['programme_en'] = ''.join(programme[:-1]).replace("Bachelor of", "").strip()
            else:
                item['programme_en'] = ''.join(programme_en).strip()
            print("item['programme_en']: ", item['programme_en'])
            item['degree_name'] = ''.join(programme[:-1])
            print("item['degree_name']: ", item['degree_name'])

            department = response.xpath(
                "//ol[@id='breadcrumbs']/li[4]/a//text()").extract()
            clear_space(department)
            # Only overwrite the department when the breadcrumb gave a value.
            if department:
                item['department'] = ''.join(department)
            print("item['department']: ", item['department'])

            duration = response.xpath(
                "//th[contains(text(),'Duration')]/following-sibling::td[1]//text()|"
                "//span[contains(text(),'DURATION')]/following-sibling::*[1]//text()").extract()
            clear_space(duration)
            item['duration'] = ''.join(duration).strip()
            print("item['duration']: ", item['duration'])

            start_date = response.xpath("//th[contains(text(),'Intake dates')]/following-sibling::td[1]//text()|"
                                        "//strong[contains(text(),'Term dates for 2019')]/..//text()").extract()
            clear_space(start_date)
            # When the joined text contains commas it is re-split on them, so
            # start_date becomes a list of comma-separated parts.
            if "," in ''.join(start_date):
                start_date = ''.join(start_date).split(",")
            print("start_date: ", start_date)
            if start_date:
                item['start_date'] = ''.join(start_date).strip()

            # Month name / abbreviation -> two-digit month number.
            # ("may" appears twice with the same value.)
            monthDict = {"january": "01", "february": "02", "march": "03", "april": "04", "may": "05", "june": "06",
                         "july": "07", "august": "08", "september": "09", "october": "10", "november": "11",
                         "december": "12",
                         "jan": "01", "feb": "02", "mar": "03", "apr": "04", "may": "05", "jun": "06",
                         "jul": "07", "aug": "08", "sep": "09", "oct": "10", "nov": "11", "dec": "12",
                         "sept": "09", }
            # NOTE: std is built here but never used afterwards, because the
            # assignment that consumed it is commented out below.
            std = []
            if len(start_date) > 0:
                for s in start_date:
                    std_tmp = monthDict.get(s.lower().strip())
                    if std_tmp is not None:
                        std.append(std_tmp)
            # if std:
            # item['start_date'] = ','.join(std).replace("0", "").strip().strip(",").strip()
            print("item['start_date']: ", item['start_date'])

            # //div[@class='careers']
            career = response.xpath(
                "//div[@class='careers']|"
                "//span[@class='ct-accordion__title'][contains(text(),'Career Opportunities and Outcomes')]/../..").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            # //p[@itemprop='description']/following-sibling::p
            degree_overview_en = response.xpath(
                "//p[@itemprop='description']|//p[@itemprop='description']/following-sibling::p").extract()
            item['degree_overview_en'] = remove_class(clear_lianxu_space(degree_overview_en))
            # print("item['degree_overview_en']: ", item['degree_overview_en'])

            overview1 = response.xpath(
                "//div[@class='tab-content active']/p|//div[@class='tab details-tab']|//span[@class='ct-accordion__title'][contains(text(),'Course Details')]/../..").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview1))
            # print("item['overview_en']: ", item['overview_en'])

            modules_url = response.xpath("//div[@class='tab structure-tab']//a[contains(text(),'click here')]/@href|"
                                         "//a[contains(text(),'Handbook')]/@href").extract()
            print(len(modules_url))
            # Only the first matching link is followed for module details.
            if len(modules_url) > 0:
                item['modules_en'] = self.parse_modules(modules_url[0])
            print("item['modules_en']: ", item['modules_en'])

            # NOTE: the first XPath alternative is repeated in the second
            # string literal.
            entry_requirements = response.xpath(
                "//div[@class='tab entry-reqs-tab']|"
                "//div[@class='tab entry-reqs-tab']|//span[@class='ct-accordion__title'][contains(text(),'Entry Requirements')]/../..").extract()
            item['rntry_requirements_en'] = remove_class(clear_lianxu_space(entry_requirements))
            print("item['rntry_requirements_en']: ", item['rntry_requirements_en'])

            # //html//div[@class='tab entry-reqs-tab']//tr[1]
            # NOTE(review): the middle alternative has no trailing //text(),
            # so it selects the element itself rather than its text nodes --
            # confirm this is intended.
            IELTS = response.xpath(
                "//td[contains(text(),'IELTS Academic')]/following-sibling::td[1]//text()|"
                "//div[@class='tab entry-reqs-tab']|//span[@class='ct-accordion__title'][contains(text(),'Entry Requirements')]/../..//text()").extract()
            clear_space(IELTS)
            ielts_re = re.findall(r"International\sEnglish\sLanguage\sTesting\sSystem\s\(IELTS\sAcademic\).*?\sor", ''.join(IELTS))
            # print("ielts:re: ", ielts_re)
            if ielts_re:
                item['ielts_desc'] = ''.join(ielts_re)
                ieltsDict = get_ielts(item['ielts_desc'])
                item['ielts'] = ieltsDict.get("IELTS")
                item['ielts_l'] = ieltsDict.get("IELTS_L")
                item['ielts_s'] = ieltsDict.get("IELTS_S")
                item['ielts_r'] = ieltsDict.get("IELTS_R")
                item['ielts_w'] = ieltsDict.get("IELTS_W")
            print("item['ielts_desc']: ", item['ielts_desc'])

            # NOTE(review): TOEFL is extracted and cleaned but not used
            # afterwards; the TOEFL regex below is applied to the IELTS text
            # instead -- confirm this is intended.
            TOEFL = response.xpath(
                "//td[contains(text(),'TOEFL Internet-based')]/following-sibling::td[1]//text()").extract()
            clear_space(TOEFL)

            TOEFL_re = re.findall(r"TOEFL\siBT.*?\sor|.{0,51}Internet\sBased\sTest.*?\sor", ''.join(IELTS))
            if TOEFL_re:
                item['toefl_desc'] = ''.join(TOEFL_re)
                toeflDict = get_toefl(item['toefl_desc'])
                item['toefl'] = toeflDict.get("TOEFL")
                item['toefl_l'] = toeflDict.get("TOEFL_L")
                item['toefl_s'] = toeflDict.get("TOEFL_S")
                item['toefl_r'] = toeflDict.get("TOEFL_R")
                item['toefl_w'] = toeflDict.get("TOEFL_W")
            print("item['toefl_desc']: ", item['toefl_desc'])


            print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))
            print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
                    item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            # //div[@class='tab fees-tab']//div[@class='tab-content']//h4
            tuition_fee = response.xpath(
                "//div[@class='tab fees-tab']//div[@class='tab-content']//h4//text()").extract()
            clear_space(tuition_fee)
            # print("tuition_fee: ", tuition_fee)
            # Keep the text from "Estimated first year fee" onwards, then
            # collect its runs of digits/whitespace and drop the spaces.
            tuition_fee_re = re.findall(r"Estimated\sfirst\syear\sfee.*", ','.join(tuition_fee))
            tuition_fee_re1 = re.findall(r"[\d\s]+", ' '.join(tuition_fee_re))
            item['tuition_fee'] = ''.join(tuition_fee_re1).replace(" ", "").strip()
            # print("item['tuition_fee']: ", item['tuition_fee'])

            # //div[@class='tab apply-tab']
            apply_desc_en = response.xpath(
                "//div[@class='tab apply-tab']|"
                "//span[@class='ct-accordion__title'][contains(text(),'How to Apply')]/../..").extract()
            item['apply_desc_en'] = remove_class(clear_lianxu_space(apply_desc_en))
            # print("item['apply_desc_en']: ", item['apply_desc_en'])

            apply_documents_en = response.xpath(
                "//div[contains(text(),'What type of supporting documents do I have to pro')]/..").extract()
            item['apply_documents_en'] = remove_class(clear_lianxu_space(apply_documents_en))
            # print("item['apply_documents_en']: ", item['apply_documents_en'])


            # Items are only yielded for degree names without "/" and
            # without "Diploma".
            if "/" not in item['degree_name']:
                if "Diploma" not in item['degree_name']:
                    # Check whether international admission is supported.
                    # (The result is only printed; the check that used it is
                    # commented out.)
                    international = response.xpath("//a[@class='tabs-button active']/following-sibling::a[contains(text(), 'INTERNATIONAL')]//text()").extract()
                    print("internation == ", international)
                    # if international:
                    location = response.xpath(
                        "//span[contains(text(),'AVAILABILITY')]/following-sibling::p[1]//text()").extract()
                    clear_space(location)
                    print("location: ", location)
                    if location:
                        item['location'] = ''.join(location).strip().strip(",").strip()
                    print("item['location']: ", item['location'])

                    # Pick out degrees that contain several majors: yield the
                    # item once per major, otherwise yield it once.
                    major_list = response.xpath("//th[contains(text(),'Majors')]/../following-sibling::tr/td[1]//text()").extract()
                    print("major_list: ", major_list)
                    if major_list:
                        for major in major_list:
                            item['programme_en'] = major.strip()
                            yield item
                    else:
                        yield item

        except Exception as e:
            # Append the error and the failing URL to a log file named after
            # the university and degree type.
            with open("scrapySchool_Australian_ben/error/"+item['university']+str(item['degree_type'])+".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #29

Показать файл

Файл: BondUniversity_U.py Проект: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "Bond University"
        # item['country'] = 'Australia'
        # item['website'] = 'https://bond.edu.au'
        item['url'] = response.url
        item['degree_type'] = 1
        item['major_type1'] = response.meta.get(response.url)
        print("===========================")
        print(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            degree_type = response.xpath(
                "//h1[@class='page-title']//text()").extract()
            clear_space(degree_type)
            degree_type = ''.join(degree_type)
            item['degree_name'] = degree_type
            print("item['degree_name']: ", item['degree_name'])
            programme = degree_type
            if "(Business)" in degree_type:
                item['programme_en'] = "Business"
            else:
                item['programme_en'] = degree_type.replace("Bachelor of",
                                                           "").strip()
            print("item['programme_en']: ", item['programme_en'])

            other = response.xpath(
                "//html//article/blockquote[1]//text()").extract()
            item['other'] = clear_lianxu_space(other)
            # print("item['other']: ", item['other'])

            overview = response.xpath(
                "//html//article/section[@class='section'][1]").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # if item['overview_en'] == "":
            #     print("***overview_en为空")
            print("item['overview_en']: ", item['overview_en'])

            degree_description = response.xpath(
                "//div[@id='show-less-0']|//section[@id='accordion-program']/p"
            ).extract()
            item['degree_overview_en'] = remove_class(
                clear_lianxu_space(degree_description))
            # if item['degree_overview_en'] == "":
            #     print("***degree_overview_en为空")
            # print("item['degree_overview_en']: ", item['degree_overview_en'])

            # //html//section[@id='accordion-program']/div[@class='table-responsive']//tr[2]/td[2]
            duration = response.xpath(
                "//strong[contains(text(),'Duration')]/../following-sibling::td[1]//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration = ', '.join(duration)
            duration_re = re.findall(r"\d\ssemesters|\d\ssemester", duration)
            if len(duration_re) > 0:
                for d in duration_re:
                    item['duration'] = duration.replace(d, "").replace(
                        "(", "").replace(")", "").strip()
            else:
                item['duration'] = duration.replace("(",
                                                    "").replace(")",
                                                                "").strip()
            print("item['duration']: ", item['duration'])

            start_date = response.xpath(
                "//strong[contains(text(),'Starting semesters')]/../following-sibling::td[1]//text()"
            ).extract()
            clear_space(start_date)
            # print("start_date: ", start_date)
            monthDict = {
                "january": "01",
                "february": "02",
                "march": "03",
                "april": "04",
                "may": "05",
                "june": "06",
                "july": "07",
                "august": "08",
                "september": "09",
                "october": "10",
                "november": "11",
                "december": "12",
                "jan": "01",
                "feb": "02",
                "mar": "03",
                "apr": "04",
                "may": "05",
                "jun": "06",
                "jul": "07",
                "aug": "08",
                "sep": "09",
                "oct": "10",
                "nov": "11",
                "dec": "12",
                "sept": "09",
            }
            std = []
            start_date_re = re.findall(
                r"january|february|march|april|may|june|july|august|september|october|november|december",
                ','.join(start_date), re.I)
            # print(start_date_re)
            if len(start_date_re) > 0:
                for s in start_date_re:
                    std_tmp = monthDict.get(s.lower())
                    if std_tmp is not None:
                        std.append(std_tmp)
            std = list(set(std))
            item['start_date'] = ','.join(std).replace(
                "0", "").strip().strip(",").strip()
            print("item['start_date']: ", item['start_date'])

            career = response.xpath(
                "//div[@id='collapse-field_pgm_prof_out']|//a[@class='collapsed'][contains(text(),'Professional outcomes')]/../../.."
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # if item['career_en'] == "":
            #     print("***career_en为空")
            print("item['career_en']: ", item['career_en'])

            modules = response.xpath(
                "//div[@id='collapse-field_pgm_str_sub']|//a[@class='collapsed'][contains(text(),'Structure and subjects')]/../../.."
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # if item['modules_en'] == "":
            #     print("***modules_en为空")
            # print("item['modules_en']: ", item['modules_en'])

            tuition_fee = response.xpath(
                "//span[contains(@data-prefix,'Program fees 2019:')]//text()|//strong[contains(text(),'Program fees 2019')]/../text()|"
                "//strong[contains(text(),'2019 fees:')]/../text()").extract(
                )  # 2019.03.20 星期三
            clear_space(tuition_fee)
            # print("tuition_fee: ", tuition_fee)
            tuition_fee_re = re.findall(r"\d+,\d+", ''.join(tuition_fee))
            if len(tuition_fee_re) > 0:
                item['tuition_fee'] = tuition_fee_re[0].replace(",",
                                                                "").strip()
            print("item['tuition_fee']: ", item['tuition_fee'])

            entry_requirements = response.xpath(
                "//div[@id='collapse-field_pgm_ent_req']|//a[@data-toggle='collapse'][contains(text(),'Entry requirements')]/../../..|"
                "//h4[contains(text(),'Academic requirements')]/..").extract(
                )  # 2019.03.20 星期三
            item['rntry_requirements_en'] = remove_class(
                clear_lianxu_space(entry_requirements))
            if item['rntry_requirements_en'] == "":
                print("***rntry_requirements_en为空")
            # print("item['rntry_requirements_en']: ", item['rntry_requirements_en'])

            # "https://bond.edu.au/intl/future-students/bond-international/information-international-students/international-english-language-testing-requirements"
            ielt_desc_dict = {
                "Bachelor of Business Law":
                "IELTS score 7.0 No sub score less than 6.5",
                "Bachelor of Jurisprudence":
                "IELTS score 7.0 No sub score less than 6.5",
                "Bachelor of Medical Studies (BMedSt) and the Doctor of Medicine (MD)":
                "IELTS score 7.0 No sub score less than 6.5",
                "Bachelor of Laws":
                "IELTS score 7.0 No sub score less than 6.5",
                "Bachelor of Psychological Science":
                "IELTS score 7.0 No sub score less than 6.5",
                "Bachelor of Psychological Science (Honours)":
                "IELTS score 7.0 No sub score less than 6.5",
                "Bachelor of Arts":
                "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                "Bachelor of Communication":
                "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                "Bachelor of Communication (Business)":
                "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                "Bachelor of Film and Television":
                "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                "Bachelor of Film and Television (3 Year Program)":
                "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                "Bachelor of Global Studies (Sustainability)":
                "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                "Bachelor of International Relations":
                "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                "Bachelor of Interactive Media and Design":
                "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                "Bachelor of Journalism":
                "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                "Bachelor of Social Science":
                "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                "Bachelor of Architectural Studies":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Construction Management and Quantity Surveying":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Sustainable Environments and Planning":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Arts":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Communication":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Communication (Business)":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Film and Television":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Film and Television (3 Year Program)":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Global Studies (Sustainability)":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of International Relations":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Interactive Media and Design":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Journalism":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Jurisprudence":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Laws":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Social Science":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Actuarial Science":
                "IELTS score 6.0 No sub score less than 6.0",
            }
            item['ielts_desc'] = ielt_desc_dict.get(item['degree_name'])
            print("item['ielts_desc']: ", item['ielts_desc'])

            if item['ielts_desc'] is not None:
                ieltlsrw = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
                if len(ieltlsrw) == 2:
                    item["ielts"] = ieltlsrw[0]
                    item["ielts_l"] = ieltlsrw[1]
                    item["ielts_s"] = ieltlsrw[1]
                    item["ielts_r"] = ieltlsrw[1]
                    item["ielts_w"] = ieltlsrw[1]
                elif len(ieltlsrw) == 1:
                    item["ielts"] = ieltlsrw[0]
                    item["ielts_l"] = ieltlsrw[0]
                    item["ielts_s"] = ieltlsrw[0]
                    item["ielts_r"] = ieltlsrw[0]
                    item["ielts_w"] = ieltlsrw[0]
                elif len(ieltlsrw) == 5:
                    item["ielts"] = ieltlsrw[0]
                    item["ielts_l"] = ieltlsrw[2]
                    item["ielts_s"] = ieltlsrw[2]
                    item["ielts_r"] = ieltlsrw[2]
                    item["ielts_w"] = ieltlsrw[1]
            print(
                "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                % (item['ielts'], item['ielts_l'], item['ielts_s'],
                   item['ielts_r'], item['ielts_w']))

            item['apply_desc_en'] = remove_class(
                clear_lianxu_space([
                    """<section class="section" id="section-8551"> <a id="application" name="application" class="anchor" ></a><h2 class="field field-name-field-title field-type-text field-label-hidden"> Application essentials</h2><div class="panel-group" id="accordion-8552" role="tablist" aria-multiselectable="true"><div class="panel panel-default"><div class="panel-heading"><h4 class="panel-title"> <a class="collapsed" data-toggle="collapse" data-parent="#accordion-8552" href="#collapse-8553"> Application process </a></h4></div><div id="collapse-8553" class="panel-collapse collapse"><div class="panel-body"><h3>Australian students</h3><p>Applications for most Bond programs can be lodged at any time directly to the University. With the exception of the <a href="https://bond.edu.au/intl/future-students/study-bond/search-program/medicine-bond">Medical Program</a>, you do not need to go through QTAC and your Bond application will not affect your QTAC application for other university programs. 
Apply direct to Bond University via our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a>.</p><h3>International students - full degree</h3><p>Apply directly to Bond using our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a> or through a representative in your country.</p><h3>English language students</h3><p>Apply to study English at Bond University College, located on the Bond University campus, by selecting your desired program below:</p><ul><li><a href="https://apply.bond.edu.au/">English for Academic Purposes</a></li><li><a href="https://college.bond.edu.au/apply-english">General English</a></li></ul><h3>Diploma, university preparation or foundation program students</h3><p>You can apply for your academic pathway through our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a>.</p><h3>Outbound exchange - Bond students</h3><p>Undergraduate Bond students must have completed two semesters prior to application while postgraduate Bondies can apply from their first semesters. Applications received in first and second semesters will be pending GPA requirements of 65% and above. Find out how to <a href="https://bond.edu.au/intl/future-students/bond-international/semester-abroad-exchange/outbound-bond">apply for exchange</a>.</p><h3>Inbound exchange students</h3><p>If your home institution has a formal exchange agreement with Bond, you will need to apply through them via their outbound exchange student application process. Your university may set certain academic performance standards for you to qualify for the program. Providing you meet their criteria, your home institution will contact Bond to nominate you as an exchange student and we will contact you to advise that your nomination has been successful. 
You will then be able to apply to Bond using our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a>, which must be accompanied by the required documentation. Exchange students pay their regular tuition fees to their home institution – not to Bond University.</p><h3>Study abroad students</h3><p>Firstly obtain approval from your home institution, then apply directly to Bond using our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a>; through a study abroad representative in your country; or through your home university if applicable.</p><ul></ul></div></div></div><div class="panel panel-default"><div class="panel-heading"><h4 class="panel-title"> <a class="collapsed" data-toggle="collapse" data-parent="#accordion-8552" href="#collapse-8554"> When can you start? </a></h4></div><div id="collapse-8554" class="panel-collapse collapse"><div class="panel-body"><p>Bond University runs three full semesters each year with intakes in January (Semester 1), May (Semester 2) and September (Semester 3). Our semesters are scheduled to coordinate with the Northern Hemisphere school/university timetables. (You’ll find that most other Australian universities offer only two semesters a year, meaning that you may have to wait until February or July before you can start your international studies.)</p></div></div></div><div class="panel panel-default"><div class="panel-heading"><h4 class="panel-title"> <a class="collapsed" data-toggle="collapse" data-parent="#accordion-8552" href="#collapse-8555"> Admissions criteria </a></h4></div><div id="collapse-8555" class="panel-collapse collapse"><div class="panel-body"><p>Bond University is committed to open and transparent admission processes, and to providing detailed information about the options and entry criteria that are relevant for you. 
</p><p>Learn more about our <a href="https://bond.edu.au/intl/future-students/study-bond/how-apply/undergraduate-admissions-criteria">undergraduate admissions criteria</a>. If you have further questions or wish to speak to one of our advisors, contact the <a href="https://bond.edu.au/intl/contact#ofs">Office of Future Students</a>.</p><p>For postgraduate study, the entry requirements are unique to each individual program. <a href="https://bond.edu.au/intl/future-students/study-bond/search-program#postgraduate">Search for your program</a> of interest to find out the specific entry requirements. </p></div></div></div><div class="panel panel-default"><div class="panel-heading"><h4 class="panel-title"> <a class="collapsed" data-toggle="collapse" data-parent="#accordion-8552" href="#collapse-8567"> Academic and English language entry requirements </a></h4></div><div id="collapse-8567" class="panel-collapse collapse"><div class="panel-body"><p>In addition to any performance standards stipulated by your home institution, you will also need to meet Bond’s academic and <a href="https://bond.edu.au/intl/future-students/bond-international/information-international-students/english-language-requirements">English language</a> requirements for the study program you have chosen.</p><p>If you need extra instruction, Bond offers <a href="https://college.bond.edu.au/english-at-bond">English classes</a> on campus through Bond University College, as well as a <a href="https://bond.edu.au/intl/program/bond-university-college-foundation-program">Foundation Program</a> to prepare you for university studies in Australia.</p></div></div></div></div></section>"""
                ]))
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<div class="panel-group" id="accordion-8552" role="tablist" aria-multiselectable="true"><div class="panel panel-default"><div class="panel-heading"><h4 class="panel-title"> <a class="collapsed" data-toggle="collapse" data-parent="#accordion-8552" href="#collapse-8553"> Application process </a></h4></div><div id="collapse-8553" class="panel-collapse collapse"><div class="panel-body"><h3>Australian students</h3><p>Applications for most Bond programs can be lodged at any time directly to the University. With the exception of the <a href="https://bond.edu.au/intl/future-students/study-bond/search-program/medicine-bond">Medical Program</a>, you do not need to go through QTAC and your Bond application will not affect your QTAC application for other university programs. Apply direct to Bond University via our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a>.</p><h3>International students - full degree</h3><p>Apply directly to Bond using our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a> or through a representative in your country.</p><h3>English language students</h3><p>Apply to study English at Bond University College, located on the Bond University campus, by selecting your desired program below:</p><ul><li><a href="https://apply.bond.edu.au/">English for Academic Purposes</a></li><li><a href="https://college.bond.edu.au/apply-english">General English</a></li></ul><h3>Diploma, university preparation or foundation program students</h3><p>You can apply for your academic pathway through our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a>.</p><h3>Outbound exchange - Bond students</h3><p>Undergraduate Bond students must have completed two semesters prior to application while postgraduate Bondies can 
apply from their first semesters. Applications received in first and second semesters will be pending GPA requirements of 65% and above. Find out how to <a href="https://bond.edu.au/intl/future-students/bond-international/semester-abroad-exchange/outbound-bond">apply for exchange</a>.</p><h3>Inbound exchange students</h3><p>If your home institution has a formal exchange agreement with Bond, you will need to apply through them via their outbound exchange student application process. Your university may set certain academic performance standards for you to qualify for the program. Providing you meet their criteria, your home institution will contact Bond to nominate you as an exchange student and we will contact you to advise that your nomination has been successful. You will then be able to apply to Bond using our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a>, which must be accompanied by the required documentation. Exchange students pay their regular tuition fees to their home institution – not to Bond University.</p><h3>Study abroad students</h3><p>Firstly obtain approval from your home institution, then apply directly to Bond using our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a>; through a study abroad representative in your country; or through your home university if applicable.</p><ul></ul></div></div></div></div>"""
                ]))
            # print(item)
            # print("+++", "Graduate" not in item['degree_name'])
            if "/" not in item['degree_name'] or "/" not in item[
                    'degree_name'] and "online" not in item[
                        'degree_name'].lower():
                print("++++++++++++")
                major_list = response.xpath(
                    "//a[@id='majors']/following-sibling::div//div/div/h4/a//text()"
                ).extract()
                clear_space(major_list)
                print("major_list: ", major_list)
                print(len(major_list))

                if len(major_list) == 0:
                    yield item
                else:
                    modules_list = response.xpath(
                        "//a[@id='majors']/following-sibling::div//div/div/div"
                    ).extract()
                    print("===", modules_list)
                    print(len(modules_list))
                    if len(modules_list) == len(major_list):
                        for m in range(len(major_list)):
                            item['programme_en'] = major_list[m]
                            item['modules_en'] = remove_class(
                                clear_lianxu_space([modules_list[m]]))
                            print("item['programme_en']: ",
                                  item['programme_en'])
                            yield item

        except Exception as e:
            with open("scrapySchool_Australian_ben/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Пример #30

Показать файл

Файл: TheUniversityOfNewSouthWales_U_handbook2019.py Проект: histudent/python_spider

    def parse(self, response):
        """Scrape one UNSW 2019 handbook programme page into item(s).

        Fills department, location, duration, overview, entry requirements,
        modules, career text and degree name from the page, then yields the
        item once per distinct major/specialisation name found. When no
        major is listed, the page title is used as ``programme_en`` and the
        item is yielded once. Nothing is yielded when no degree name is
        found in the sidebar.

        Any exception raised while scraping is appended to a per-university
        error file and printed; it is not re-raised.

        Args:
            response: page response; only ``response.url`` and
                ``response.xpath(...)`` are used here.

        Yields:
            The populated item (the same object each time, with
            ``programme_en`` reassigned between yields).
        """
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "The University of New South Wales"
        item['url'] = response.url
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        try:
            department = response.xpath("//li[3]//a//text()").extract()
            item['department'] = ''.join(department).strip()
            print("item['department']: ", item['department'])

            location = response.xpath(
                "//div[@role='complementary']//strong[@tabindex='0'][contains(text(),'Campus')]/../p//text()"
            ).extract()
            item['location'] = ''.join(location).strip()
            print("item['location']: ", item['location'])

            duration = response.xpath(
                "//div[contains(@role,'complementary')]//strong[contains(@tabindex,'0')][contains(text(),'Typical duration')]/../p//text()"
            ).extract()
            # Return value is ignored, so clear_space presumably cleans the
            # list in place -- TODO confirm against its definition.
            clear_space(duration)
            print("duration: ", duration)
            duration_text = ''.join(duration)
            if "Years" in duration_text:
                item['duration'] = duration_text.replace("Years", "").strip()
                item['duration_per'] = 1
            # .get(): 'duration' is only assigned when the page states it in
            # years, so a plain index could raise and drop the whole page.
            print("item['duration']: ", item.get('duration'))

            overview_en = response.xpath(
                "//div[@id='readMoreToggle1']/div[1]").extract()
            item['degree_overview_en'] = item['overview_en'] = remove_class(
                clear_lianxu_space(overview_en))
            print("item['overview_en']: ", item['overview_en'])

            item["rntry_requirements_en"] = None
            rntry_requirements_en = response.xpath(
                "//div[@class='m-accordion-group m-accordion-with-header']//div[@class='m-accordion-body']|"
                "//strong[@aria-label='Progression Requirements']/../../following-sibling::div"
            ).extract()
            if rntry_requirements_en:
                item['rntry_requirements_en'] = remove_class(
                    clear_lianxu_space(rntry_requirements_en))
            print("item['rntry_requirements_en']: ",
                  item['rntry_requirements_en'])

            modules_en = response.xpath(
                "//div[@id='structure']/div[position()<last()]").extract()
            if modules_en:
                item['modules_en'] = remove_class(
                    clear_lianxu_space(modules_en))
            # .get(): only assigned when the structure section exists.
            print("item['modules_en']: ", item.get('modules_en'))

            career_en = response.xpath(
                "//strong[@aria-label='Career Opportunities']/../../following-sibling::div"
            ).extract()
            if career_en:
                item['career_en'] = remove_class(clear_lianxu_space(career_en))
            # .get(): only assigned when the career section exists.
            print("item['career_en']: ", item.get('career_en'))

            # Degree name(s) listed in the sidebar.
            degree_name = response.xpath(
                "//div[@role='complementary']//p[contains(text(),'Bachelor of')]/text()|"
                "//div[@role='complementary']//p[contains(text(),'Juris Doctor')]/text()"
            ).extract()
            clear_space(degree_name)
            if len(degree_name) > 0:
                item['degree_name'] = ', '.join(degree_name).replace(
                    "-", "").strip()
            else:
                item['degree_name'] = None
            print("item['degree_name']: ", item['degree_name'])

            # Every major/honours lookup has the same shape; only the <h4>
            # heading text and the data-hbui-filter-item value differ.
            major_template = (
                '//h4[contains(text(),"%s")]/../../following-sibling::div'
                '//div[@data-hbui-filter-item="%s"]'
                '/a[contains(@href, "/undergraduate/specialisations/2019/")]'
                '/div[last()]/p//text()')
            heading_filters = (
                ("Home Majors and Minors", "major"),
                ("Business Majors", "major"),
                ("Optional Minor or Second Major (International)", "major"),
                ("Economics Majors", "major"),
                ("Specialisation", "major"),
                ("Specialisation", "honours"),
                ("Major", "major"),
                ("Major", "honours"),
                ("major", "major"),
            )
            programme_xpath = '|'.join(
                ['//div[@data-hbui-filter-item="specialisation"]/a/div/p//text()']
                + [major_template % pair for pair in heading_filters])
            programme_list = response.xpath(programme_xpath).extract()
            print("programme_list: ", programme_list)

            # De-duplicate while keeping document order, so the order of the
            # yielded items is the same on every run.
            programme_list = list(dict.fromkeys(programme_list))

            if item['degree_name'] is not None:
                # NOTE(review): the same item object is yielded each time
                # with programme_en overwritten; anything that keeps a
                # reference to an earlier yield will see the later value --
                # confirm the pipelines consume items synchronously.
                if len(programme_list) == 0:
                    programme_en = response.xpath(
                        "//span[@data-hbui='module-title']//text()"
                    ).extract_first(None)
                    print("programmen: ", programme_en)
                    item['programme_en'] = programme_en
                    yield item
                else:
                    for prog in programme_list:
                        item['programme_en'] = prog
                        yield item
        except Exception as e:
            # NOTE(review): the directory is "scrapySchool_Australian_yan"
            # although this spider lives in the "..._ben" package -- confirm
            # this is the intended location.
            error_path = (".//scrapySchool_Australian_yan/error/" +
                          item['university'] + str(item['degree_type']) +
                          ".txt")
            # Append: one file collects the failures of every page, each
            # record closed by a separator line; 'w' would keep only the last.
            with open(error_path, 'a', encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)