Exemplos de get_item em Python, exemplos de scrapySchool_Australian_ben.getItem.get_item em Python

Exemplo n.º 1

0

Exibir arquivo

    def parse_data(self, response):
        """Parse a Central Queensland University course page into item(s).

        A fresh ``ScrapyschoolAustralianBenItem`` is filled from the page
        (title, majors, department, duration, intake, career, overviews,
        modules, entry requirements, IELTS/TOEFL, tuition fee, application
        info). Nothing is yielded when the degree name contains "/" or
        "Diploma". When the "Majors" table has following rows, one item per
        major is yielded (each a separate copy); otherwise the single item
        is yielded.

        Any exception raised while parsing is appended to a per-university
        error file and printed; it is not re-raised.
        """
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "Central Queensland University"
        item['url'] = response.url
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        # The value is looked up in the request meta under the page URL itself.
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            programme = response.xpath("//h1[@class='program-title']/text()|"
                                       "//h1[@itemprop='name']//text()").extract()
            clear_space(programme)
            # The title is split on "-" and the last piece is dropped below.
            programme = ''.join(programme).split("-")
            programme_en = response.xpath("//th[contains(text(),'Majors')]/following-sibling::td//text()").extract()
            clear_space(programme_en)
            print("programme_en: ", programme_en)
            if len(programme_en) == 0:
                # No "Majors" cell: fall back to the title without "Bachelor of".
                item['programme_en'] = ''.join(programme[:-1]).replace("Bachelor of", "").strip()
            else:
                item['programme_en'] = ''.join(programme_en).strip()
            print("item['programme_en']: ", item['programme_en'])
            item['degree_name'] = ''.join(programme[:-1])
            print("item['degree_name']: ", item['degree_name'])

            department = response.xpath(
                "//ol[@id='breadcrumbs']/li[4]/a//text()").extract()
            clear_space(department)
            if department:
                item['department'] = ''.join(department)
            print("item['department']: ", item['department'])

            duration = response.xpath(
                "//th[contains(text(),'Duration')]/following-sibling::td[1]//text()|"
                "//span[contains(text(),'DURATION')]/following-sibling::*[1]//text()").extract()
            clear_space(duration)
            item['duration'] = ''.join(duration).strip()
            print("item['duration']: ", item['duration'])

            start_date = response.xpath("//th[contains(text(),'Intake dates')]/following-sibling::td[1]//text()|"
                                        "//strong[contains(text(),'Term dates for 2019')]/..//text()").extract()
            clear_space(start_date)
            if "," in ''.join(start_date):
                start_date = ''.join(start_date).split(",")
            print("start_date: ", start_date)
            if start_date:
                item['start_date'] = ''.join(start_date).strip()
            # The month-name -> number mapping that used to follow here was
            # computed but never stored on the item, so it has been removed.
            print("item['start_date']: ", item['start_date'])

            career = response.xpath(
                "//div[@class='careers']|"
                "//span[@class='ct-accordion__title'][contains(text(),'Career Opportunities and Outcomes')]/../..").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))

            degree_overview_en = response.xpath(
                "//p[@itemprop='description']|//p[@itemprop='description']/following-sibling::p").extract()
            item['degree_overview_en'] = remove_class(clear_lianxu_space(degree_overview_en))

            overview1 = response.xpath(
                "//div[@class='tab-content active']/p|//div[@class='tab details-tab']|//span[@class='ct-accordion__title'][contains(text(),'Course Details')]/../..").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview1))

            modules_url = response.xpath("//div[@class='tab structure-tab']//a[contains(text(),'click here')]/@href|"
                                         "//a[contains(text(),'Handbook')]/@href").extract()
            print(len(modules_url))
            if len(modules_url) > 0:
                # Only the first matching link is followed.
                item['modules_en'] = self.parse_modules(modules_url[0])
            print("item['modules_en']: ", item['modules_en'])

            # The original expression listed the entry-reqs-tab alternative
            # twice; an XPath union already de-duplicates nodes.
            entry_requirements = response.xpath(
                "//div[@class='tab entry-reqs-tab']|"
                "//span[@class='ct-accordion__title'][contains(text(),'Entry Requirements')]/../..").extract()
            item['rntry_requirements_en'] = remove_class(clear_lianxu_space(entry_requirements))
            print("item['rntry_requirements_en']: ", item['rntry_requirements_en'])

            # This selection doubles as the text searched for both the IELTS
            # and the TOEFL sentence below.
            IELTS = response.xpath(
                "//td[contains(text(),'IELTS Academic')]/following-sibling::td[1]//text()|"
                "//div[@class='tab entry-reqs-tab']|//span[@class='ct-accordion__title'][contains(text(),'Entry Requirements')]/../..//text()").extract()
            clear_space(IELTS)
            requirements_text = ''.join(IELTS)
            ielts_re = re.findall(r"International\sEnglish\sLanguage\sTesting\sSystem\s\(IELTS\sAcademic\).*?\sor", requirements_text)
            if ielts_re:
                item['ielts_desc'] = ''.join(ielts_re)
                ieltsDict = get_ielts(item['ielts_desc'])
                item['ielts'] = ieltsDict.get("IELTS")
                item['ielts_l'] = ieltsDict.get("IELTS_L")
                item['ielts_s'] = ieltsDict.get("IELTS_S")
                item['ielts_r'] = ieltsDict.get("IELTS_R")
                item['ielts_w'] = ieltsDict.get("IELTS_W")
            print("item['ielts_desc']: ", item['ielts_desc'])

            # NOTE(review): a separate "TOEFL Internet-based" table-cell
            # extraction used to live here but its result was never read; the
            # TOEFL sentence has always been searched in the IELTS/entry text.
            TOEFL_re = re.findall(r"TOEFL\siBT.*?\sor|.{0,51}Internet\sBased\sTest.*?\sor", requirements_text)
            if TOEFL_re:
                item['toefl_desc'] = ''.join(TOEFL_re)
                toeflDict = get_toefl(item['toefl_desc'])
                item['toefl'] = toeflDict.get("TOEFL")
                item['toefl_l'] = toeflDict.get("TOEFL_L")
                item['toefl_s'] = toeflDict.get("TOEFL_S")
                item['toefl_r'] = toeflDict.get("TOEFL_R")
                item['toefl_w'] = toeflDict.get("TOEFL_W")
            print("item['toefl_desc']: ", item['toefl_desc'])

            print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))
            print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
                    item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            tuition_fee = response.xpath(
                "//div[@class='tab fees-tab']//div[@class='tab-content']//h4//text()").extract()
            clear_space(tuition_fee)
            # Keep only the digits that follow "Estimated first year fee".
            tuition_fee_re = re.findall(r"Estimated\sfirst\syear\sfee.*", ','.join(tuition_fee))
            tuition_fee_re1 = re.findall(r"[\d\s]+", ' '.join(tuition_fee_re))
            item['tuition_fee'] = ''.join(tuition_fee_re1).replace(" ", "").strip()

            apply_desc_en = response.xpath(
                "//div[@class='tab apply-tab']|"
                "//span[@class='ct-accordion__title'][contains(text(),'How to Apply')]/../..").extract()
            item['apply_desc_en'] = remove_class(clear_lianxu_space(apply_desc_en))

            apply_documents_en = response.xpath(
                "//div[contains(text(),'What type of supporting documents do I have to pro')]/..").extract()
            item['apply_documents_en'] = remove_class(clear_lianxu_space(apply_documents_en))

            # Double degrees ("/") and diplomas are not emitted.
            if "/" not in item['degree_name'] and "Diploma" not in item['degree_name']:
                # Check whether the course is offered to international
                # students (currently only printed, not used as a filter).
                international = response.xpath("//a[@class='tabs-button active']/following-sibling::a[contains(text(), 'INTERNATIONAL')]//text()").extract()
                print("internation == ", international)
                location = response.xpath(
                    "//span[contains(text(),'AVAILABILITY')]/following-sibling::p[1]//text()").extract()
                clear_space(location)
                print("location: ", location)
                if location:
                    item['location'] = ''.join(location).strip().strip(",").strip()
                print("item['location']: ", item['location'])

                # Degrees that list several majors produce one item per major.
                major_list = response.xpath("//th[contains(text(),'Majors')]/../following-sibling::tr/td[1]//text()").extract()
                print("major_list: ", major_list)
                if major_list:
                    for major in major_list:
                        # Yield a separate copy per major: re-yielding the one
                        # shared item would let later iterations overwrite
                        # 'programme_en' on objects already handed downstream.
                        major_item = item.copy()
                        major_item['programme_en'] = major.strip()
                        yield major_item
                else:
                    yield item

        except Exception as e:
            # Best-effort error log; the directory is assumed to exist.
            with open("scrapySchool_Australian_ben/error/"+item['university']+str(item['degree_type'])+".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 2

0

Exibir arquivo

    def parse(self, response):
        """Parse a Macquarie University course-finder API response into item(s).

        ``response.body`` is decoded as JSON and passed to ``self.parse_data``;
        the returned dict populates a fresh ``ScrapyschoolAustralianBenItem``.
        If the JSON lists majors under
        ``course_finder_data -> majors_specs -> content -> override``, the API
        document of every major with a ``slug`` is fetched with ``requests``
        and one item (a separate copy) is yielded per major; otherwise the
        degree-level item is yielded once.

        A failure while handling a single major is printed and the major's
        item is still yielded with the degree-level values. Any other
        exception is appended to a per-university error file and printed; it
        is not re-raised.
        """
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "Macquarie University"
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        # NOTE(review): the second replace() looks for a pattern ending in
        # `",` which a plain URL is unlikely to contain, so the query string
        # probably stays in place - confirm intent.
        informationUrl = response.url.replace(
            "https://api.coursefinder.mq.edu.au/api",
            "http://courses.mq.edu.au").replace('?studentType=international",',
                                                '",')
        print("------------", informationUrl)
        item['url'] = informationUrl
        try:
            jsonData = response.body
            informationDict = json.loads(jsonData)

            programme_dict_all = self.parse_data(informationDict, item)
            item['degree_name'] = programme_dict_all.get('degree_name')
            item['programme_en'] = programme_dict_all.get('programme_en')
            item['department'] = programme_dict_all.get('department')
            item['duration'] = programme_dict_all.get('duration')
            item["location"] = programme_dict_all.get("location")
            item['start_date'] = programme_dict_all.get('start_date')
            item["tuition_fee"] = programme_dict_all.get("tuition_fee")
            # The degree-level overview is stored under 'degree_overview_en'.
            item["degree_overview_en"] = programme_dict_all.get("overview_en")

            item['career_en'] = programme_dict_all.get('career_en')
            item["modules_en"] = programme_dict_all.get("modules_en")
            item["rntry_requirements_en"] = programme_dict_all.get(
                "rntry_requirements_en")

            item["ielts_desc"] = programme_dict_all.get("ielts_desc")
            item["apply_desc_en"] = programme_dict_all.get("apply_desc_en")
            item["apply_documents_en"] = item['apply_desc_en']

            ieltsDict = get_ielts(item['ielts_desc'])
            item['ielts'] = ieltsDict.get("IELTS")
            item['ielts_l'] = ieltsDict.get("IELTS_L")
            item['ielts_s'] = ieltsDict.get("IELTS_S")
            item['ielts_r'] = ieltsDict.get("IELTS_R")
            item['ielts_w'] = ieltsDict.get("IELTS_W")
            print(
                "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                % (item['ielts'], item['ielts_l'], item['ielts_s'],
                   item['ielts_r'], item['ielts_w']))

            # Extract the API URL prefix (only printed for diagnostics).
            urlPre = re.findall(
                r"https://api.coursefinder.mq.edu.au/api/2019/international/[a-z]+/",
                response.url)
            urlPre = ''.join(urlPre)
            print("api接口前缀： ", urlPre)

            # A missing intermediate key makes .get() return None and the next
            # .get() raise; that is handled by the outer except below.
            relateMajor = informationDict.get("course_finder_data").get(
                "majors_specs").get('content').get("override")
            if relateMajor is None or len(relateMajor) == 0:
                yield item
            else:
                print("dididididi-----didididiiddididi")
                # Build each major's API link from its slug and fetch it.
                for major in relateMajor:
                    slug = major.get("slug")
                    if slug is None:
                        continue
                    # Work on a per-major copy so that items already yielded
                    # are not overwritten by later iterations.
                    major_item = item.copy()
                    try:
                        majorApiUrl = response.url.replace(
                            "?studentType=international",
                            "") + slug + "?studentType=international"
                        print("***专业api链接", majorApiUrl)
                        informationUrl = majorApiUrl.replace(
                            "https://api.coursefinder.mq.edu.au/api",
                            "http://courses.mq.edu.au").replace(
                                '?studentType=international",', '",')
                        print("------专业链接------", informationUrl)
                        major_item['url'] = informationUrl

                        # NOTE(review): blocking call without a timeout inside
                        # a Scrapy callback - consider a timeout or a Request.
                        data = requests.get(majorApiUrl,
                                            headers=self.headers_base)
                        informationDict1 = json.loads(data.text)

                        major_dict_all = self.parse_data(
                            informationDict1, major_item)
                        major_title = major_dict_all.get('degree_name')

                        # Take the text after the word "in". The previous
                        # .strip("in") removed the *characters* i/n from both
                        # ends, truncating names such as "Education"; the \b
                        # stops "in" matching inside a word.
                        major_match = re.search(r"\bin\s+(.*)", major_title)
                        if major_match:
                            major_item['programme_en'] = major_match.group(
                                1).strip()
                        else:
                            major_item['programme_en'] = major_title.replace(
                                "Bachelor of", "").strip()
                        print("item['programme_en']_major: ",
                              major_item['programme_en'])

                        major_item['overview_en'] = major_dict_all.get(
                            "overview_en")
                    except Exception as e:
                        # Best effort: report the cause and still emit the
                        # item with whatever was filled in so far.
                        print("专业报错： ***", str(e))
                    yield major_item

        except Exception as e:
            # Best-effort error log; the directory is assumed to exist.
            with open("scrapySchool_Australian_ben/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 3

0

Exibir arquivo

Arquivo: RMITUniversity_U.py Projeto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "RMIT University"
        # item['country'] = 'Australia'
        # item['website'] = 'https://www.rmit.edu.au'
        item['url'] = response.url
        item['degree_type'] = 1
        item['major_type1'] = response.meta.get(response.url)
        print("===========================")
        print(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            programme = response.xpath(
                "//h1[@id='course-name']//text()|//h1[@class='highLight program-header']//text()"
            ).extract()
            clear_space(programme)
            item['degree_name'] = ''.join(programme).strip()
            if item['degree_name'] == "":
                print("***degree_name为空")
            print("item['degree_name']: ", item['degree_name'])

            pro_re = re.findall(r"Bachelor", item['degree_name'])
            # print("pre_re: ", pro_re)
            if len(pro_re) < 2:
                programme_re = re.findall(
                    r"\(.+\)", item['degree_name'].replace("(Honours)", ""))
                if len(programme_re) > 0:
                    item['programme_en'] = ''.join(programme_re).replace(
                        "(", "").replace(")", "").strip()
                else:
                    item['programme_en'] = item['degree_name'].replace(
                        "Bachelor of", "").strip()
                print("item['programme_en']: ", item['programme_en'])

                location = response.xpath(
                    "//span[@class='icon-location']/..//text()|"
                    "//h4[@class='description'][contains(text(),'Location')]/following-sibling::*//text()"
                ).extract()
                clear_space(location)
                item['location'] = ' '.join(location).strip()
                if item['location'] == "":
                    print("***location为空")
                print("item['location']: ", item['location'])

                duration = response.xpath(
                    "//div[@class='b-program-content links b-international']//span[@class='icon-clock']/..//text()|"
                    "//div[@class='b-program-content links b-international  ']//span[@class='icon-clock']/..//text()|"
                    "//div[contains(@class,'box b-international not-hide col-xs-12')]//h4[@class='description'][contains(text(),'Duration')]/following-sibling::*//text()"
                ).extract()
                clear_space(duration)
                item['duration'] = ''.join(duration).strip()
                # if item['duration'] == "":
                #     print("***duration为空")
                # print("item['duration']: ", item['duration'])

                tuition_fee = response.xpath(
                    "//div[contains(@class,'b-program-content links b-international')]//span[@class='icon-fees']/..//text()|"
                    "//div[contains(@class,'b-program-content links b-international  ')]//span[@class='icon-fees']/..//text()|"
                    "//div[contains(@class,'box b-international not-hide col-xs-12')]//h4[@class='description'][contains(text(),'Fees')]/following-sibling::*//text()"
                ).extract()
                clear_space(tuition_fee)
                tuition_fee = getTuition_fee(''.join(tuition_fee))
                item['tuition_fee'] = tuition_fee
                if item['tuition_fee'] == 0:
                    item['tuition_fee'] = None
                # print("item['tuition_fee']: ", item['tuition_fee'])

                start_date = response.xpath(
                    "//div[@class='b-program-content links b-international']//span[@class='icon-intake']/..//text()|"
                    "//div[@class='b-program-content links b-international  ']//span[@class='icon-intake']/..//text()|"
                    "//div[contains(@class,'box b-international not-hide col-xs-12')]//h4[@class='description'][contains(text(),'Next intake')]/following-sibling::*//text()|"
                    "//div[contains(@class,'box b-international not-hide col-xs-12')]//h4[@class='description'][contains(text(),'Next Intake')]/following-sibling::*//text()"
                ).extract()
                clear_space(start_date)
                item['start_date'] = getStartDateMonth(' '.join(start_date))
                if item['start_date'] == "":
                    print("***start_date 为空")
                print("item['start_date']: ", item['start_date'])

                overview = response.xpath(
                    "//div[@id='overview']/..|//div[@id='overview']/../following-sibling::div[1]|"
                    "//div[@id='Overview']/..|//div[@id='Overview']/../following-sibling::div[1]"
                ).extract()
                item['degree_overview_en'] = remove_class(
                    clear_lianxu_space(overview))

                modules_en_url = response.xpath(
                    "//table[@class='table  program-table']//td//a[contains(text(),'View plan')]/@href"
                ).extract()
                clear_space(modules_en_url)
                if len(modules_en_url) > 0:
                    url = "https://www.rmit.edu.au" + modules_en_url[0]
                    self.parse_modules1(url, item)
                else:
                    modules_en = response.xpath(
                        "//span[contains(text(),'Electives and program structure')]/../../../.."
                    ).extract()
                    item['modules_en'] = remove_class(
                        clear_lianxu_space(modules_en))

                if item['degree_overview_en'] == "":
                    overviewModulesUrl = response.url + "/program-details"
                    self.parse_overviewModules1(overviewModulesUrl, item)

                if item['degree_overview_en'] == "":
                    print("***degree_overview_en 为空")
                print("item['degree_overview_en']: ",
                      item['degree_overview_en'])
                if item['modules_en'] == "":
                    print("***modules_en 为空")
                print("item['modules_en']: ", item['modules_en'])

                career = response.xpath(
                    "//div[@id='career']|//div[@id='career']/../following-sibling::div[1]|"
                    "//div[@id=' career']|//div[@id=' career']/../following-sibling::div[1]|"
                    "//div[@id='Career']|//div[@id='Career']/../following-sibling::div[1]|"
                    "//div[@id=' Career']|//div[@id=' Career']/../following-sibling::div[1]"
                ).extract()
                item['career_en'] = remove_class(clear_lianxu_space(career))
                if item['career_en'] == "":
                    careerUrl = response.url + "/career"
                    self.parse_career1(careerUrl, item)
                if item['career_en'] == "":
                    print("***career_en 为空")
                print("item['career_en']: ", item['career_en'])

                rntry_requirements_en = response.xpath(
                    "//div[@id='admissions']/..|//div[@id='admissions']/../following-sibling::*[position()<last()-3]|"
                    "//div[@id='Admissions']/..|//div[@id='Admissions']/../following-sibling::*[position()<last()-3]"
                ).extract()
                item['rntry_requirements_en'] = remove_class(
                    clear_lianxu_space(rntry_requirements_en))
                if item['rntry_requirements_en'] == "":
                    entryUrl = response.url + "/entry-requirements"
                    self.parse_entryrequirements1(entryUrl, item)
                if item['rntry_requirements_en'] == "":
                    print("***rntry_requirements_en 为空")
                # print("item['rntry_requirements_en']: ", item['rntry_requirements_en'])

                ielts_desc = response.xpath(
                    "//li[contains(text(),'IELTS (Academic): ')]//text()"
                ).extract()
                item['ielts_desc'] += clear_lianxu_space(ielts_desc)
                # print("item['ielts_desc']: ", item['ielts_desc'])

                ielts_d = get_ielts(item['ielts_desc'])
                item["ielts"] = ielts_d.get('IELTS')
                item["ielts_l"] = ielts_d.get('IELTS_L')
                item["ielts_s"] = ielts_d.get('IELTS_S')
                item["ielts_r"] = ielts_d.get('IELTS_R')
                item["ielts_w"] = ielts_d.get('IELTS_W')
                # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

                toefl_desc = response.xpath(
                    "//*[contains(text(),'TOEFL (Internet Based Test - IBT): ')]//text()"
                ).extract()
                item['toefl_desc'] += clear_lianxu_space(toefl_desc)
                # print("item['toefl_desc']: ", item['toefl_desc'])

                ielts_d = get_toefl(item['toefl_desc'])
                item["toefl"] = ielts_d.get('TOEFL')
                item["toefl_l"] = ielts_d.get('TOEFL_L')
                item["toefl_s"] = ielts_d.get('TOEFL_S')
                item["toefl_r"] = ielts_d.get('TOEFL_R')
                item["toefl_w"] = ielts_d.get('TOEFL_W')
                # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
                #         item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))
                # programme = response.xpath("//div[@class='program-name']/h1/text()").extract()
                # ucascode = response.xpath("//html//div[@class='c-summary c-summary-2-col mb-lg-lg-lg clearfix']/div[1]/span[2]/text()").extract()
                # clear_space(ucascode)
                # # item['ucas_code'] = ''.join(ucascode)
                # # print("item['ucas_code']2: ", item['ucas_code'])
                #
                # duration = response.xpath(
                #     "//div[@data-duration][2]/span[2]/text()").extract()
                # clear_space(duration)
                # item['duration'] = ''.join(duration)
                # print("item['duration']2: ", item['duration'])
                #
                # start_date = response.xpath(
                #     "//div[@data-intake][2]/span[2]/text()").extract()
                # clear_space(start_date)
                # item['start_date'] = ''.join(start_date)
                # print("item['start_date']2: ", item['start_date'])
                #
                # location = response.xpath(
                #     "//div[@class='c-summary-cell not-hide']/span[2]//text()").extract()
                # clear_space(location)
                # item['location'] = ''.join(location)
                # print("item['location']2: ", item['location'])
                #
                # department = response.xpath(
                #     "//html//div[@class='c-summary c-summary-2-col mb-lg-lg-lg clearfix']/div[7]/span[2]/text()").extract()
                # clear_space(department)
                # item['department'] = ''.join(department)
                # print("item['department']2: ", item['department'])
                #
                # overview = response.xpath(
                #     "//html//div[@class='program-summary-section-overview mb-md-md-md']/div[position()<last()-1]").extract()
                # item['degree_overview_en'] = remove_class(clear_lianxu_space(overview))
                # print("item['degree_overview_en']2: ", item['degree_overview_en'])
                #
                #
                # # //html//div[@class='panel-group accordion']/div/div[4]
                # career = response.xpath(
                #     "//html//div[@class='panel-group accordion']/div/div[@class='panel panel-default Yes'][3]").extract()
                # if "Career outlook" not in career:
                #     career = response.xpath(
                #     "//html//div[@class='panel-group accordion']/div/div[@class='panel panel-default Yes'][4]").extract()
                # item['career_en'] = remove_class(clear_lianxu_space(career))
                # print("item['career_en']2: ", item['career_en'])
                #
                # modulesUrl = response.url + "/program-structure"
                # self.parse_modules2(modulesUrl, item)
                #
                # how_to_applyUrl = response.url + "/how-to-apply"
                # self.parse_how_to_apply2(how_to_applyUrl, item)
                #
                # entryUrl = response.url + "/entry-requirements"
                # self.entryrequirements2(entryUrl, item)
                #
                # feeUrl = response.url + "/fees"
                # self.fees2(feeUrl, item)

                item['apply_proces_en'] = remove_class(
                    clear_lianxu_space([
                        """<div class="share-heading hide">How to Apply</div>
  </div>
                </div>
			<div class="standard-content-article mb-lg-md-md clearfix">
				<div class="org-area-module-detail-view accordian ">
					<div class="row">
						<div class="col-xs-12 ">
							<div class="clearfix">
  <p class="lead">A step-by-step guide for international students on how to apply to study at RMIT.</p>
  <div class="lower-image-container"></div>
							</div>
							<!-- Parsys -->
							<!-- This Parsys will be used to Put all Main Body Components -->
<div class="floated-image-container pull-right">
<div class="detail-img-list not-hide image-square">
	<figure>
		<div class="c-detail-image c-detail-image-square">
			<img data-ri-xxs="/content/dam/rmit/rmit-images/life-at-rmit/Study-modes_EVE-800x800.jpg.transform/rendition-800x800/image.jpg" data-ri-sm="/content/dam/rmit/rmit-images/life-at-rmit/Study-modes_EVE-800x800.jpg.transform/rendition-640x640/image.jpg" class="c-responsive-image bg-cover offset-content">
		</div>
		<div class="c-detail-image c-detail-image-portrait">
			<img data-ri-xxs="/content/dam/rmit/rmit-images/life-at-rmit/Study-modes_EVE-800x800.jpg.transform/rendition-800x1068/image.jpg" data-ri-sm="/content/dam/rmit/rmit-images/life-at-rmit/Study-modes_EVE-800x800.jpg.transform/rendition-640x854/image.jpg" class="c-responsive-image bg-cover offset-content">
		</div>
	</figure>
</div>
</div>
<div>
    <div class="extended-desc not-hidden">
        <p>If you want to study for only one or two semesters, you can apply for a&nbsp;<a href="/content/rmit-ui/en/study-with-us/international-students/programs-for-international-students/study-abroad-and-exchange/study-abroad.html">study abroad program</a>&nbsp;or&nbsp;<a href="/content/rmit-ui/en/study-with-us/international-students/programs-for-international-students/study-abroad-and-exchange/student-exchange.html">student exchange</a>&nbsp;at RMIT.</p>
<h3>Applying for a research degree?</h3>
<p>If you want to apply for a research program, <a href="/content/rmit-ui/en/research/phds-and-other-research-degrees/how-to-apply.html">follow this process and apply here</a> instead.<br>
</p>
<h2>Step 1: Find a program</h2>
<p>Search for a program in your&nbsp;<a href="/content/rmit-ui/en/study-with-us.html">interest area</a>&nbsp;or browse by&nbsp;<a href="/content/rmit-ui/en/study-with-us/international-students/programs-for-international-students.html">level of study</a>. Some programs are not available in the July intake, in which case, you will need to apply for the next available intake.</p>
<p>You can also use the&nbsp;<a href="https://www.international.rmit.edu.au/info/programfees.asp" title="Programs, intakes and tuition fees database">Programs, intakes and tuition fees database</a>&nbsp;to search for programs.</p>
<h2>Step 2: Check the entry requirements</h2>
<p>Check that you qualify for the program's entry requirements including:</p>
<ul>
<li>English language requirements</li>
<li>academic entry requirement (see equivalent&nbsp;<a href="/content/rmit-ui/en/study-with-us/international-students/apply-to-rmit-international-students/entry-requirements/country-equivalency.html">entry requirements by country</a>)</li>
<li>pre-requisites</li>
<li>selection tasks.</li>
</ul>
<p>If you don’t meet the entry requirements for your preferred program, you can consider a range of programs that may provide&nbsp;<a href="/content/rmit-ui/en/study-with-us/international-students/apply-to-rmit-international-students/pathways-and-credit-transfer.html">pathways</a>&nbsp;to your preferred program.</p>
<p>If you are&nbsp;currently studying an Australian Year 12 (in Australia or overseas) or International Baccalaureate (in Australia or New Zealand) and applying &nbsp;for a&nbsp;Bachelor, Associate or Honours degree, you will need to apply via VTAC. You should <a href="http://www.rmit.edu.au/study-with-us/international-students/apply-to-rmit-international-students/how-to-apply/international-students-studying-vce-or-ib">check the VTAC entry requirements</a>.<br>
</p>
<h2>Step 3: Collect required documents</h2>
<p>To avoid delays in admission processing, submit&nbsp;a&nbsp;complete set of supporting documents&nbsp;including:</p>
<ul>
<li>passport</li>
<li>certified copies of academic transcripts&nbsp;&nbsp;(not required for current RMIT students applying to another RMIT program)</li>
<li>certified copies of all graduation certificates in both the original&nbsp;language and English&nbsp;&nbsp;(not required for current RMIT students applying to another RMIT program)</li>
<li>evidence of English language proficiency&nbsp;&nbsp;(not required for current RMIT students applying to another RMIT program)</li>
<li>any documentation relating to selection tasks (pre-selection kits,&nbsp;folios etc.)</li>
<li>CV, work reference letter, referee report etc if applicable</li>
</ul>
<p>&nbsp;Please note that documents submitted will not be returned.</p>
<h2>Step 4: Submit your application</h2>
<p>Submit your&nbsp;application online&nbsp;with&nbsp;<a href="/content/rmit-ui/en/study-with-us/international-students/apply-to-rmit-international-students/how-to-apply/documentation-required.html">all the required documents</a>.</p>
<h4>Students completing an Australian Year 12 (in Australia or overseas), or the International Baccalaureate (in Australia or New Zealand)</h4>
<ul>
<li>Apply for <strong>Higher Education</strong> programs (Bachelor, Associate Degree and Honours) through the Victorian Tertiary Admissions Centre (VTAC).<br>
<br>
<a href="http://www.vtac.edu.au/applying.html">Apply now via VTAC</a></li>
<li>Apply for <strong>Vocational Education</strong> programs (Foundation Studies, ELICOS, VCE, Certificate IV, Diploma and Advanced Diplomas) via iApply, the online application system for international students.<br>
<br>
<a href="https://iapply.rmit.edu.au/sitsvision/wrd/SIW_LGN">Apply now via iApply</a></li>
</ul>
<h4>Studying fully online<br>
</h4>
<ul>
<li>If your program is delivered fully online, use the online application system for local students and follow the local student application process. Note: fully online programs do not qualify for an Australian Student Visa.<br>
<br>
<a href="https://rmit.service-now.com/rmit-admissions/">Apply now via Admissions</a></li>
</ul>
<h4>All other international students<br>
</h4>
<ul>
<li>If you are applying for on-campus study in a coursework program use iApply, the online application system for international students.<br>
<br>
<a href="https://iapply.rmit.edu.au/sitsvision/wrd/SIW_LGN">Apply now via iApply</a></li>
</ul>
<h4>Application fee</h4>
<p>You will need to pay an application fee if you are from one of these&nbsp;<a href="/content/rmit-ui/en/study-with-us/international-students/apply-to-rmit-international-students/how-to-apply/application-fee.html">countries classified as high risk</a>.</p>
<h2>Need help?</h2>
<p>If you need assistance,&nbsp;<a href="https://connect.prospectivestudent.info/RMITInt?_ga=1.241036611.1742672422.1416265787">contact us</a>&nbsp;or one of&nbsp;<a href="https://www.international.rmit.edu.au/info/agentlist/">RMIT’s appointed representatives</a>&nbsp;(agents).</p>
<h2>Next steps:</h2>
<p>Your application will be assessed in line with RMIT’s policies and procedures. If you are successful, you will receive an offer letter. You can then&nbsp;<a href="/content/rmit-ui/en/study-with-us/international-students/apply-to-rmit-international-students/accept-your-offer.html">accept your offer</a>&nbsp;by following the instructions in your offer letter.&nbsp;</p>
<p>RMIT will normally advise you on the outcome of your application within 10 business days. If you are applying from Australia you should hear within 24 hours. If you don't hear back within the time frame above please <a href="https://rmit.au1.qualtrics.com/jfe/form/SV_0fbt3k9dEkNATZ3">contact Admissions Helpdesk</a>.</p>
<p>If you are applying via VTAC <a href="http://www.vtac.edu.au/dates.html">check the VTAC website</a> for important dates.</p>
    </div>
</div>
						</div>
					</div>"""
                    ]))
                item['apply_documents_en'] = remove_class(
                    clear_lianxu_space([
                        """<p>To avoid delays in admission processing, submit&nbsp;a&nbsp;complete set of supporting documents&nbsp;including:</p>
<ul>
<li>passport</li>
<li>certified copies of academic transcripts&nbsp;&nbsp;(not required for current RMIT students applying to another RMIT program)</li>
<li>certified copies of all graduation certificates in both the original&nbsp;language and English&nbsp;&nbsp;(not required for current RMIT students applying to another RMIT program)</li>
<li>evidence of English language proficiency&nbsp;&nbsp;(not required for current RMIT students applying to another RMIT program)</li>
<li>any documentation relating to selection tasks (pre-selection kits,&nbsp;folios etc.)</li>
<li>CV, work reference letter, referee report etc if applicable</li>
</ul>
<p>&nbsp;Please note that documents submitted will not be returned.</p>"""
                    ]))
                yield item
        except Exception as e:
            with open("scrapySchool_Australian_ben/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 4

0

Exibir arquivo

    def parse_data(self, response):
        """Scrape one University of Canberra course page into an item.

        The item obtained from ``get_item(ScrapyschoolAustralianBenItem)`` is
        filled from ``response`` and yielded. Nothing is yielded when the
        cleaned course title contains "Bachelor" two or more times
        (presumably double degrees -- confirm) or contains "online".

        Any ``Exception`` raised while parsing is not propagated: it is
        printed and appended, together with the page URL, to
        ``scrapySchool_Australian_ben/error/<university><degree_type>.txt``.

        Args:
            response: page response; only ``response.url`` and
                ``response.xpath(...).extract()`` are used.

        Yields:
            The populated item, at most once per call.
        """
        # Local import: only needed to make sure the error-log directory exists.
        import os

        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "University of Canberra"
        item['url'] = response.url
        item['degree_type'] = 1
        print("===========================")
        print(response.url)

        try:
            programme = response.xpath(
                "//h1[@class='course_title']//text()").extract()
            clear_space(programme)
            degree_name_str = ''.join(programme).strip()
            # Drop everything from the first "-" onwards in the page title.
            degree_name_re = re.findall(r"-.*", degree_name_str)
            item['degree_name'] = degree_name_str.replace(
                ''.join(degree_name_re), '').strip()
            print("item['degree_name']: ", item['degree_name'])

            # Guard clause: skip titles naming "Bachelor" twice or more and
            # anything flagged as online.
            pro_re = re.findall(r"Bachelor", item['degree_name'])
            if len(pro_re) >= 2 or "online" in item['degree_name'].lower():
                return

            # Prefer the parenthesised part of the title as the programme
            # name; otherwise use the title without the "Bachelor of" prefix.
            programme_re = re.findall(r"\(.+\)", item['degree_name'])
            if programme_re:
                item['programme_en'] = ''.join(programme_re).replace(
                    "(", "").replace(")", "").strip()
            else:
                item['programme_en'] = item['degree_name'].replace(
                    "Bachelor of", "").strip()
            print("item['programme_en']: ", item['programme_en'])

            location = response.xpath(
                "//th[contains(text(),'Location:')]/following-sibling::td//text()"
            ).extract()
            clear_space(location)
            item['location'] = ''.join(location).strip()

            department = response.xpath(
                "//th[contains(text(),'Faculty:')]/following-sibling::td//text()"
            ).extract()
            clear_space(department)
            item['department'] = ''.join(department).strip()

            ielts_desc_re = response.xpath(
                "//th[contains(text(),'English Language Requirements:')]/following-sibling::td//text()"
            ).extract()
            item['ielts_desc'] = ''.join(ielts_desc_re).strip()

            # Map the number of scores found in the description to the index
            # used for each band, in the order
            # (overall, listening, speaking, reading, writing).
            # Any other count leaves the IELTS fields untouched.
            ieltlsrw = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
            ielts_keys = ("ielts", "ielts_l", "ielts_s", "ielts_r", "ielts_w")
            ielts_layouts = {
                1: (0, 0, 0, 0, 0),
                2: (0, 1, 1, 1, 1),
                5: (0, 1, 4, 2, 3),
            }
            layout = ielts_layouts.get(len(ieltlsrw))
            if layout is not None:
                for key, index in zip(ielts_keys, layout):
                    item[key] = ieltlsrw[index]

            tuition_fee = response.xpath(
                "//div[@id='fees']//tr[2]/td[3]//text()").extract()
            clear_space(tuition_fee)
            tuition_fee_str = ''.join(tuition_fee).strip()
            # Keep only "12,345"-style amounts and strip the thousands comma.
            tuition_fee_re = re.findall(r"\d+,\d+", tuition_fee_str)
            item['tuition_fee'] = ''.join(tuition_fee_re).replace(
                ",", "").strip()

            # Overview = everything before the "Career opportunities"
            # heading, falling back to what precedes the collapsible section.
            overview = response.xpath(
                "//h2[contains(text(),'Career opportunities')]/preceding-sibling::*"
            ).extract()
            if not overview:
                overview = response.xpath(
                    "//div[@class='collapsible-section']/preceding-sibling::*"
                ).extract()
            item['overview_en'] = item[
                'degree_overview_en'] = remove_class(
                    clear_lianxu_space(overview))

            career = response.xpath(
                "//h2[contains(text(),'Career opportunities')]|//h2[contains(text(),'Career opportunities')]/following-sibling::*[1]|"
                "//strong[contains(text(),'Career opportunities')]/..|//strong[contains(text(),'Career opportunities')]/../following-sibling::*[position()<3]"
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))

            modules = response.xpath(
                "//h2[contains(text(),'Course Requirements')]|//div[@id='toggle-view']"
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))

            entry_requirements = response.xpath(
                "//div[@id='admission']").extract()
            # NOTE: the key is spelled 'rntry_requirements_en' on purpose; it
            # presumably matches the field declared on the item class.
            item['rntry_requirements_en'] = remove_class(
                clear_lianxu_space(entry_requirements))

            yield item
        except Exception as e:
            # Report on the console first so the diagnostics survive even if
            # writing the error file fails.
            print("异常：", str(e))
            print("报错url：", response.url)
            # Make sure the log directory exists; otherwise open() would raise
            # FileNotFoundError here and hide the original parsing error.
            error_dir = "scrapySchool_Australian_ben/error"
            os.makedirs(error_dir, exist_ok=True)
            with open(error_dir + "/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")

Exemplo n.º 5

0

Exibir arquivo

Arquivo: CharlesSturtUniversity_U.py Projeto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "Charles Sturt University"
        # item['country'] = 'Australia'
        # item['website'] = 'http://futurestudents.csu.edu.au'
        item['url'] = response.url
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        driver = webdriver.Chrome(r"C:\Users\admin\AppData\Local\Programs\Python\Python36\Lib\site-packages\selenium\chromedriver (2).exe")
        driver.implicitly_wait(30)  # 隐式等待
        driver.get(response.url)
        import time
        # time.sleep(30)

        try:
            locator = (By.ID, 'cYear-campus')
            WebDriverWait(driver, 20, 0.5).until(EC.presence_of_element_located(locator))
            location = driver.find_element_by_xpath(r"//div[@id='cYear-campus']").text
            print("location: ", location)
            # location = response.xpath(
            #     "//div[@id='locations1']//div[@class='section no-padding-top']//div[@class='card card-content z-depth-0']//div[@class='is-domestic']//text()|"
            #     "//div[@id='fYear-campus']//text()").extract()
            # clear_space(location)
            item['location'] = location
            print("item['location']: ", item['location'])

            programme = response.xpath(
                "//h1[@class='logo-font csu-slogan course course-name']//text()").extract()
            clear_space(programme)
            programme = ''.join(programme).replace("(with specialisations)", "").replace("  ", "").strip()
            item['degree_name'] = programme
            key_fee = item['degree_name']
            print("item['degree_name']: ", item['degree_name'])

            programme_re = re.findall(r"\(.+\)", item['degree_name'].replace("(Honours)", ""))
            if len(programme_re) > 0:
                item['programme_en'] = ''.join(programme_re).replace("(", "").replace(")", "").strip()
                item['degree_name'] = item['degree_name'].replace(''.join(programme_re), "").strip()
            else:
                item['programme_en'] = programme.replace("Bachelor of", "").strip()
            print("item['programme_en']: ", item['programme_en'])
            print("***item['degree_name']: ", item['degree_name'])

            count_un = re.findall(r"Bachelor", item['degree_name'])
            print("count_un: ", count_un)
            if len(count_un) < 2:
                degree_overview_en = response.xpath(
                    "//div[@id='course-full']/preceding-sibling::p").extract()
                item['overview_en'] = item['degree_overview_en'] = remove_class(clear_lianxu_space(degree_overview_en))
                print("item['degree_overview_en']: ", item['degree_overview_en'])

                locator = (By.CLASS_NAME, 'sectionHead')
                WebDriverWait(driver, 20, 0.5).until(EC.presence_of_element_located(locator))
                duration = driver.find_element_by_xpath(r"//div[@id='cYear-duration']").text
                # duration = response.xpath(
                #     "//div[@id='ocbDuration']//text()").extract()
                # clear_space(duration)
                # item['duration'] = ','.join(duration).replace(",,", ",").replace(":,", ":").replace(",:", ":").strip().strip(",").strip()
                item['duration'] = duration
                print("item['duration']: ", item['duration'])


                start_date = driver.find_element_by_xpath(r"//div[@id='cYear-sessions']").text
                # start_date = response.xpath(
                #     "//div[@id='sessDatesKI']/span/text() | //div[@id='sessDateDom']/span/text()").extract()
                # clear_space(start_date)
                # print("start_date: ", start_date)
                # if len(start_date) > 0:
                #     start_date_str = start_date[0].strip()
                #     if ";" in start_date_str:
                #         start_date_list = start_date_str.split(";")
                #         st_l = []
                #         for s in start_date_list:
                #             s1 = s.replace("2018", "").replace("2019", "").replace("0", "").strip()
                #             st_l.append(s1)
                #         st_l = list(set(st_l))
                #         item['start_date'] = ','.join(st_l).strip().strip(",").strip()
                #     else:
                #         item['start_date'] = start_date_str.replace("2018", "").replace("2019", "").replace("0", "").strip()
                item['start_date'] = start_date
                print("item['start_date']: ", item['start_date'])

                career = response.xpath(
                    "//div[@class='hasCareerOpps']|//div[@class='section isPostGrad isHDR']").extract()
                item['career_en'] = remove_class(clear_lianxu_space(career))
                print("item['career_en']: ", item['career_en'])

                # 显示等待，出现id为subject-div的元素结束等待
                locator = (By.ID, 'subject-div')
                WebDriverWait(driver, 30, 0.5).until(EC.presence_of_element_located(locator))
                # //div[@id='testimonial-area']/following-sibling::div[1]
                modules = driver.find_element_by_xpath(r"//div[@id='subject-div']").get_attribute('innerHTML')
                # modules = response.xpath(
                #     "//div[@id='subject-intro']").extract()
                item['modules_en'] = remove_class(modules)
                # print("item['modules_en']: ", item['modules_en'])

                rntry_requirements_en = driver.find_element_by_xpath(r"//div[@id='detailCardTeam1']").get_attribute('innerHTML')
                # rntry_requirements_en = response.xpath(
                #     "//h3[contains(text(),'Entry requirements')]/..").extract()
                item['rntry_requirements_en'] = remove_class(rntry_requirements_en)
                # print("item['rntry_requirements_en']: ", item['rntry_requirements_en'])

                ielts_desc_dict = {"Associate Degree in Policing Practice": "A minimum overall Academic IELTS score of 7.0 with no score below a 7.0 in each of the individual skill areas or a qualification deemed equivalent.",
    "Bachelor of Medical Radiation Science": "A minimum overall Academic IELTS score of 6.5 with no score below a 6.0 in each of the individual skill areas or a qualification deemed equivalent.",
    "Bachelor of Theology": "A minimum overall Academic IELTS score of 6.5 with no score below a 6.0 in each of the individual skill areas or a qualification deemed equivalent.",
    "Bachelor of Health and Rehabilitation Science": "A minimum overall Academic IELTS score IELTS of 6.5 with no score below a 6.5 in each of the individual skill areas or a qualification deemed equivalent.",
    "Bachelor of Occupational Therapy": "A minimum overall Academic IELTS score IELTS of 6.5 with no score below a 6.5 in each of the individual skill areas or a qualification deemed equivalent.",
    "Bachelor of Physiotherapy": "A minimum overall Academic IELTS score IELTS of 6.5 with no score below a 6.5 in each of the individual skill areas or a qualification deemed equivalent.",
    "Bachelor of Podiatric Medicine": "A minimum overall Academic IELTS score IELTS of 6.5 with no score below a 6.5 in each of the individual skill areas or a qualification deemed equivalent.",
    "Bachelor of Clinical Practice (Paramedic)": "A minimum overall Academic IELTS score of 7.0 with no score below a 6.5 in each of the individual skill areas or a qualification deemed equivalent.",
    "Bachelor of Dental Science": "A minimum overall Academic IELTS score of 7.0 with no score below a 6.5 in each of the individual skill areas or a qualification deemed equivalent.",
    "Bachelor of Pharmacy": "A minimum overall Academic IELTS score of 7.0 with no score below a 6.5 in each of the individual skill areas or a qualification deemed equivalent.",
    "Bachelor of Oral Health (Therapy / Hygiene)": "A minimum overall Academic IELTS score of 7.0 with no score below a 6.5 in each of the individual skill areas or a qualification deemed equivalent.",
    "Bachelor of Speech and Language Pathology": "A minimum overall Academic IELTS score of 7.0 with no score below a 6.5 in each of the individual skill areas or a qualification deemed equivalent.",
    "Bachelor of Education (Birth to Five)": "A minimum overall Academic IELTS score of 7.5 (with no score below 7 in reading and writing, and a score of no less than 8 in speaking and listening) or a qualification deemed equivalent. Testing results must be obtained within two years from the date of your application for admission.",
    "Bachelor of Education (Early Childhood and Primary)": "A minimum overall Academic IELTS score of 7.5 (with no score below 7 in reading and writing, and a score of no less than 8 in speaking and listening) or a qualification deemed equivalent. Testing results must be obtained within two years from the date of your application for admission.",
    "Bachelor of Education (Health and Physical Education)": "A minimum overall Academic IELTS score of 7.5 (with no score below 7 in reading and writing, and a score of no less than 8 in speaking and listening) or a qualification deemed equivalent. Testing results must be obtained within two years from the date of your application for admission.",
    "Bachelor of Education (K-12)": "A minimum overall Academic IELTS score of 7.5 (with no score below 7 in reading and writing, and a score of no less than 8 in speaking and listening) or a qualification deemed equivalent. Testing results must be obtained within two years from the date of your application for admission.",
    "Bachelor of Education (Secondary) - Industry entry": "A minimum overall Academic IELTS score of 7.5 (with no score below 7 in reading and writing, and a score of no less than 8 in speaking and listening) or a qualification deemed equivalent. Testing results must be obtained within two years from the date of your application for admission.",
    "Bachelor of Education (Technology and Applied Studies)": "A minimum overall Academic IELTS score of 7.5 (with no score below 7 in reading and writing, and a score of no less than 8 in speaking and listening) or a qualification deemed equivalent. Testing results must be obtained within two years from the date of your application for admission.",
    "Bachelor of Teaching (Primary)": "A minimum overall Academic IELTS score of 7.5 (with no score below 7 in reading and writing, and a score of no less than 8 in speaking and listening) or a qualification deemed equivalent. Testing results must be obtained within two years from the date of your application for admission.",
    "Bachelor of Teaching (Secondary)": "A minimum overall Academic IELTS score of 7.5 (with no score below 7 in reading and writing, and a score of no less than 8 in speaking and listening) or a qualification deemed equivalent. Testing results must be obtained within two years from the date of your application for admission.",
    "Bachelor of Nursing":"An Academic IELTS with a minimum overall score of 7 and with no score below 7 in each of the individual skill areas or a qualification deemed equivalent.",
    "Bachelor of Nursing - Graduate Diploma of Clinical Practice (Paramedic)":"An Academic IELTS with a minimum overall score of 7 and with no score below 7 in each of the individual skill areas or a qualification deemed equivalent.",}
                item['ielts_desc'] = ielts_desc_dict.get(key_fee)
                print("item['ielts_desc']: ", item['ielts_desc'])

                driver.quit()
                if item['ielts_desc'] is not None:
                    ielts_d = get_ielts(item['ielts_desc'])
                    item["ielts"] = ielts_d.get('IELTS')
                    item["ielts_l"] = ielts_d.get('IELTS_L')
                    item["ielts_s"] = ielts_d.get('IELTS_S')
                    item["ielts_r"] = ielts_d.get('IELTS_R')
                    item["ielts_w"] = ielts_d.get('IELTS_W')
                # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
                #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))

                department = response.xpath(
                    "//html//nav[@class='breadcrumb-wrapper']//a[3]//text()").extract()
                clear_space(department)
                item['department'] = ' '.join(department).strip()
                print("item['department']: ", item['department'])

                apply_desc_en = response.xpath(
                    "//div[@id='international-app']").extract()
                item['apply_desc_en'] = remove_class(clear_lianxu_space(apply_desc_en))
                print("item['apply_desc_en']: ", item['apply_desc_en'])

                deadline = response.xpath(
                    "//div[@class='card']//div[@class='card very-small-international-lower']//text()").extract()
                clear_space(deadline)
                print("deadline: ", deadline)
                deadline_str = ""
                if "Important dates" in deadline:
                    d = deadline.index("Important dates")
                    deadline_str += ''.join(deadline[d+1:d+2]) + " "
                item['deadline'] = getStartDate(deadline_str.strip())
                print("item['deadline']: ", item['deadline'])

                feeDict = {}
                zhuanye = ["Bachelor of Accounting",
"Bachelor of Accounting",
"Bachelor of Agricultural Business Management",
"Bachelor of Agricultural Science",
"Bachelor of Animal Science",
"Bachelor of Applied Science (Outdoor Recreation and Ecotourism)",
"Bachelor of Applied Science (Parks Recreation and Heritage)",
"Bachelor of Arts",
"Bachelor of Business (Human Resource Management)",
"Bachelor of Business (Management)",
"Bachelor of Business (Management)",
"Bachelor of Business (Marketing)",
"Bachelor of Business (Marketing)",
"Bachelor of Business Studies",
"Bachelor of Business Studies",
"Bachelor of Clinical Science",
"Bachelor of Communication (Advertising)",
"Bachelor of Communication (Journalism)",
"Bachelor of Communication (Public Relations)",
"Bachelor of Communication (Theatre Media)",
"Bachelor of Computer Science (with specialisation)",
"Bachelor of Computing (Honours)",
"Bachelor of Creative Arts and Design (Animation and Visual Effects)",
"Bachelor of Creative Arts and Design (Graphic Design / Photography)",
"Bachelor of Creative Arts and Design (Graphic Design)",
"Bachelor of Creative Arts and Design (Photography)",
"Bachelor of Criminal Justice",
"Bachelor of Criminal Justice (Honours)",
"Bachelor of Dental Science",
"Bachelor of Education (Early Childhood and Primary)",
"Bachelor of Education (Health and Physical Education)",
"Bachelor of Education (K - 12)",
"Bachelor of Education (Technology and Applied Studies)",
"Bachelor of Environmental Science and Management",
"Bachelor of Equine Science (with specialisation)",
"Bachelor of Exercise and Sport Science",
"Bachelor of Exercise Science (Honours)",
"Bachelor of Health and Rehabilitation Science",
"Bachelor of Information Technology (with specialisations)",
"Bachelor of Information Technology (with specialisations)",
"Bachelor of Medical Radiation Science",
"Bachelor of Medical Science",
"Bachelor of Nursing",
"Bachelor of Occupational Therapy",
"Bachelor of Oral Health (Therapy - Hygiene)",
"Bachelor of Paramedicine",
"Bachelor of Pharmacy",
"Bachelor of Physiotherapy",
"Bachelor of Podiatric Medicine",
"Bachelor of Psychology",
"Bachelor of Science",
"Bachelor of Science (Honours)",
"Bachelor of Social Science (Psychology)",
"Bachelor of Social Science (Psychology) / Bachelor of Business (Management)",
"Bachelor of Social Science (Psychology) / Bachelor of Business (Marketing)",
"Bachelor of Social Work",
"Bachelor of Speech and Language Pathology",
"Bachelor of Stage and Screen (Television)",
"Bachelor of Theology",
"Bachelor of Theology (Honours)",
"Bachelor of Veterinary Biology / Bachelor of Veterinary Science", ]
                loco = ["Albury-Wodonga, Bathurst, Port Macquarie, Wagga Wagga",
"CSU Study Centre Melbourne, CSU Study Centre Sydney",
"Wagga Wagga",
"Wagga Wagga",
"Wagga Wagga",
"Albury-Wodonga, Port Macquarie",
"Albury-Wodonga, Port Macquarie",
"Bathurst, Wagga Wagga",
"CSU Study Centre Melbourne, CSU Study Centre Sydney",
"Albury-Wodonga, Bathurst, Wagga Wagga",
"CSU Study Centre Melbourne, CSU Study Centre Sydney",
"Albury-Wodonga, Bathurst",
"CSU Study Centre Melbourne, CSU Study Centre Sydney",
"Albury-Wodonga, Bathurst, Port Macquarie, Wagga Wagga",
"CSU Study Centre Melbourne, CSU Study Centre Sydney",
"Orange",
"Bathurst, Port Macquarie",
"Bathurst",
"Bathurst, Port Macquarie",
"Bathurst",
"Bathurst",
"Albury-Wodonga, Bathurst, Wagga Wagga",
"Wagga Wagga",
"Wagga Wagga",
"Port Macquarie",
"Wagga Wagga",
"Bathurst, Port Macquarie",
"Bathurst",
"Orange",
"Albury-Wodonga, Bathurst, Wagga Wagga",
"Bathurst",
"Albury-Wodonga, Bathurst, Wagga Wagga",
"Wagga Wagga",
"Albury-Wodonga, Port Macquarie",
"Wagga Wagga",
"Bathurst, Port Macquarie",
"Bathurst",
"Albury-Wodonga",
"Albury-Wodonga, Wagga Wagga",
"CSU Study Centre Melbourne, CSU Study Centre Sydney",
"Port Macquarie, Wagga Wagga",
"Wagga Wagga",
"Albury-Wodonga, Bathurst, Wagga Wagga",
"Albury-Wodonga, Port Macquarie",
"Wagga Wagga",
"Bathurst, Port Macquarie",
"Orange",
"Albury-Wodonga, Orange",
"Albury-Wodonga",
"Bathurst, Port Macquarie",
"Wagga Wagga",
"Albury-Wodonga, Orange, Wagga Wagga",
"Port Macquarie, Wagga Wagga",
"Bathurst",
"Bathurst",
"Port Macquarie, Wagga Wagga",
"Albury-Wodonga",
"Wagga Wagga",
"Canberra, United Theological College",
"Canberra, United Theological College",
"Wagga Wagga",]
                fee = ["23600",
"25200",
"28800",
"28800",
"28800",
"28800",
"28800",
"19200",
"25200",
"23600",
"25200",
"23600",
"25200",
"23600",
"25200",
"28800",
"23200",
"23200",
"21600",
"23200",
"23600",
"23600",
"22400",
"22400",
"22400",
"22400",
"21600",
"21600",
"54400",
"22080",
"22080",
"22080",
"22080",
"28800",
"28800",
"28800",
"28800",
"28800",
"23600",
"26560",
"29600",
"28800",
"27200",
"28800",
"34400",
"28800",
"28800",
"31200",
"28800",
"24000",
"28800",
"28800",
"24000",
"24000",
"24000",
"24160",
"28800",
"22400",
"18400",
"18400",
"54400", ]
                for i in range(len(zhuanye)):
                    feeDict[zhuanye[i]] = loco[i] + ":" + fee[i]
                item['tuition_fee'] = feeDict.get(key_fee.replace("(12 subjects)", "").replace("(16 subjects)", "").strip())
                print("item['tuition_fee']: ", item['tuition_fee'])

                online = response.xpath("//h2[contains(text(),'Study mode')]/following-sibling::*//text()").extract()
                clear_space(online)
                print("online: ", online)
                if ''.join(online).strip() != "Online":
                    major_list = response.xpath(
                        "//div[@id='fYear-specialisation']/ul[1]/li//text()|"
                        "//div[@id='fYear-specialisation']/h3//text()|"
                        "//div[@id='fYear-specialisation']/h6//text()").extract()
                    clear_space(major_list)
                    print("major_list: ", major_list)
                    print(len(major_list))

                    if len(major_list) == 0:
                        yield item
                    else:
                        for m in range(len(major_list)):
                            item['programme_en'] = major_list[m]
                            # item['modules_en'] = remove_class(clear_lianxu_space([modules_list[m]]))
                            print("item['programme_en']: ", item['programme_en'])
                            yield item
        except Exception as e:
            with open("scrapySchool_Australian_ben/error/"+item['university']+str(item['degree_type'])+".txt", 'a', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 6

0

Exibir arquivo

Arquivo: TheUniversityOfNewSouthWales_U_handbook2019.py Projeto: histudent/python_spider

    def parse(self, response):
        """Scrape one UNSW 2019 handbook programme page into items.

        A new ``ScrapyschoolAustralianBenItem`` is filled with the department,
        campus, duration, overview, entry/progression requirements, programme
        structure, career text and degree name found on the page.  The item is
        then yielded once per distinct specialisation/major listed on the
        page, or once with the page's module title when none is listed.
        Pages on which no "Bachelor of" / "Juris Doctor" degree name is found
        yield nothing.

        :param response: the Scrapy response for the programme page.
        :return: a generator of populated items.

        Any exception raised while extracting is appended to the spider's
        error file and printed; it is not re-raised.
        """
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "The University of New South Wales"
        item['url'] = response.url
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        try:
            department = response.xpath("//li[3]//a//text()").extract()
            item['department'] = ''.join(department).strip()
            print("item['department']: ", item['department'])

            location = response.xpath(
                "//div[@role='complementary']//strong[@tabindex='0'][contains(text(),'Campus')]/../p//text()"
            ).extract()
            item['location'] = ''.join(location).strip()
            print("item['location']: ", item['location'])

            duration = response.xpath(
                "//div[contains(@role,'complementary')]//strong[contains(@tabindex,'0')][contains(text(),'Typical duration')]/../p//text()"
            ).extract()
            clear_space(duration)
            print("duration: ", duration)
            duration_text = ''.join(duration)
            # Only durations expressed in "Years" are stored; anything else
            # leaves the item's existing duration value untouched.
            if "Years" in duration_text:
                item['duration'] = duration_text.replace("Years", "").strip()
                item['duration_per'] = 1
            print("item['duration']: ", item['duration'])

            overview_en = response.xpath(
                "//div[@id='readMoreToggle1']/div[1]").extract()
            # The same cleaned HTML is stored as both the degree-level and the
            # programme-level overview.
            item['degree_overview_en'] = item['overview_en'] = remove_class(
                clear_lianxu_space(overview_en))
            print("item['overview_en']: ", item['overview_en'])

            item["rntry_requirements_en"] = None
            rntry_requirements_en = response.xpath(
                "//div[@class='m-accordion-group m-accordion-with-header']//div[@class='m-accordion-body']|"
                "//strong[@aria-label='Progression Requirements']/../../following-sibling::div"
            ).extract()
            if rntry_requirements_en:
                item['rntry_requirements_en'] = remove_class(
                    clear_lianxu_space(rntry_requirements_en))
            print("item['rntry_requirements_en']: ",
                  item['rntry_requirements_en'])

            modules_en = response.xpath(
                "//div[@id='structure']/div[position()<last()]").extract()
            if modules_en:
                item['modules_en'] = remove_class(
                    clear_lianxu_space(modules_en))
            print("item['modules_en']: ", item['modules_en'])

            career_en = response.xpath(
                "//strong[@aria-label='Career Opportunities']/../../following-sibling::div"
            ).extract()
            if career_en:
                item['career_en'] = remove_class(clear_lianxu_space(career_en))
            print("item['career_en']: ", item['career_en'])

            # Degree name list
            degree_name = response.xpath(
                "//div[@role='complementary']//p[contains(text(),'Bachelor of')]/text()|"
                "//div[@role='complementary']//p[contains(text(),'Juris Doctor')]/text()"
            ).extract()
            clear_space(degree_name)
            if degree_name:
                item['degree_name'] = ', '.join(degree_name).replace(
                    "-", "").strip()
            else:
                item['degree_name'] = None
            print("item['degree_name']: ", item['degree_name'])

            # Names of every specialisation / major / honours stream linked
            # from the page.  (The original re-ran the "Major"/"honours" query
            # when this union was empty; that query is already one of the
            # operands below, so the retry could never find anything and has
            # been dropped.)
            programme_list = response.xpath(
                '//div[@data-hbui-filter-item="specialisation"]/a/div/p//text()|'
                '//h4[contains(text(),"Home Majors and Minors")]/../../following-sibling::div//div[@data-hbui-filter-item="major"]/a[contains(@href, "/undergraduate/specialisations/2019/")]/div[last()]/p//text()|'
                '//h4[contains(text(),"Business Majors")]/../../following-sibling::div//div[@data-hbui-filter-item="major"]/a[contains(@href, "/undergraduate/specialisations/2019/")]/div[last()]/p//text()|'
                '//h4[contains(text(),"Optional Minor or Second Major (International)")]/../../following-sibling::div//div[@data-hbui-filter-item="major"]/a[contains(@href, "/undergraduate/specialisations/2019/")]/div[last()]/p//text()|'
                '//h4[contains(text(),"Economics Majors")]/../../following-sibling::div//div[@data-hbui-filter-item="major"]/a[contains(@href, "/undergraduate/specialisations/2019/")]/div[last()]/p//text()|'
                '//h4[contains(text(),"Specialisation")]/../../following-sibling::div//div[@data-hbui-filter-item="major"]/a[contains(@href, "/undergraduate/specialisations/2019/")]/div[last()]/p//text()|'
                '//h4[contains(text(),"Specialisation")]/../../following-sibling::div//div[@data-hbui-filter-item="honours"]/a[contains(@href, "/undergraduate/specialisations/2019/")]/div[last()]/p//text()|'
                '//h4[contains(text(),"Major")]/../../following-sibling::div//div[@data-hbui-filter-item="major"]/a[contains(@href, "/undergraduate/specialisations/2019/")]/div[last()]/p//text()|'
                '//h4[contains(text(),"Major")]/../../following-sibling::div//div[@data-hbui-filter-item="honours"]/a[contains(@href, "/undergraduate/specialisations/2019/")]/div[last()]/p//text()|'
                '//h4[contains(text(),"major")]/../../following-sibling::div//div[@data-hbui-filter-item="major"]/a[contains(@href, "/undergraduate/specialisations/2019/")]/div[last()]/p//text()'
            ).extract()
            print("programme_list: ", programme_list)

            # Drop duplicate names but keep the order in which they appear on
            # the page, so repeated crawls emit items in a stable order.
            programme_list = list(dict.fromkeys(programme_list))
            if item['degree_name'] is not None:
                if not programme_list:
                    # No specialisation listed: fall back to the page title.
                    programme_en = response.xpath(
                        "//span[@data-hbui='module-title']//text()"
                    ).extract_first(None)
                    print("programmen: ", programme_en)
                    item['programme_en'] = programme_en
                    yield item
                else:
                    # NOTE(review): the same item object is re-yielded with
                    # only 'programme_en' changed; if a pipeline holds on to
                    # items asynchronously this may need a per-yield copy.
                    for prog in programme_list:
                        item['programme_en'] = prog
                        yield item
        except Exception as e:
            # Append (not truncate) so that earlier failures from the same
            # crawl are kept in the log.
            # NOTE(review): this path points at the "..._yan" project folder
            # although the item is a "...Ben" item - confirm it is intended.
            with open(".//scrapySchool_Australian_yan/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 7

0

Exibir arquivo

Arquivo: TheUniversityOfMelbourne_U.py Projeto: histudent/python_spider

    def parse_programme_message(self, response):
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "The University of Melbourne"
        # item['country'] = 'Australia'
        # item['website'] = 'http://www.unimelb.edu.au/'
        item['url'] = response.url
        item['degree_type'] = 1
        print("=========================")
        print(response.url)
        try:
            programme = response.xpath(
                "//div[@class='headline']/h1/text()|//h1[@id='page-header']//text()"
            ).extract()
            clear_space(programme)
            item['programme_en'] = ''.join(programme).strip()
            print("item['programme_en']: ", item['programme_en'])

            # //div[@class='description']
            overview_en = response.xpath(
                "//h3[contains(text(),'Careers')]/preceding-sibling::*|"
                "//section[@id='course-overview']//div[@class='course-section__main course-section__main-with-aside']"
            ).extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview_en))
            print("item['overview_en']: ", item['overview_en'])

            career_en = response.xpath(
                "//h3[contains(text(),'Careers')]/preceding-sibling::*[1]/following-sibling::*"
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career_en))
            # print("item['career_en']: ", item['career_en'])

            modules = response.xpath(
                "//div[@class='description']/following-sibling::*").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # print("item['modules_en']: ", item['modules_en'])
            modulesUrl = response.url + "what-will-i-study/"
            # print(modulesUrl)
            item['modules_en'] = self.parse_modules(modulesUrl)
            print("item['modules_en']: ", item['modules_en'])

            # https://futurestudents.unimelb.edu.au/admissions/entry-requirements/language-requirements/undergraduate-toefl-ielts
            item[
                'ielts_desc'] = "an overall band score of 6.5 or more in the Academic International English Language Testing System (IELTS), with no bands less than 6.0"
            item["ielts"] = '6.5'
            item["ielts_l"] = '6.0'
            item["ielts_s"] = '6.0'
            item["ielts_r"] = '6.0'
            item["ielts_w"] = '6.0'

            item['toefl_code'] = '0974'
            item["toefl"] = '79'
            item["toefl_l"] = '13'
            item["toefl_s"] = '18'
            item["toefl_r"] = '13'
            item["toefl_w"] = '21'

            # https://futurestudents.unimelb.edu.au/admissions/applications/ug-int
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<div class='main page-body' id='main-content' role='main'>
            <h1>International undergraduate</h1>
<div id="content_div_597198">
<p>Starting at university can be daunting, but applying for a place shouldn't be. Here's a guide to help you through the application process at Melbourne.</p><div class="col-1 first step-arrow"><h3>Step 1</h3></div><div class="col-5"><h2>Before you apply</h2><p>The first step is to figure out which course you want to study, and if you meet all of that course's entry requirements. At this stage, you should:</p><ul><li>Find <a href="http://coursesearch.unimelb.edu.au" target="_blank">the right course</a> for you, and make sure you meet the <a href="https://futurestudents.unimelb.edu.au/admissions/entry-requirements/undergraduate-international">entry requirements</a></li><li>If you're unsure, you may need to <a href="https://futurestudents.unimelb.edu.au/start-here">check if you're an international student</a></li><li>Check that you meet the <a href="https://futurestudents.unimelb.edu.au/admissions/entry-requirements/language-requirements">English language requirements</a></li><li>Make sure you're eligible for the appropriate <a href="http://www.services.unimelb.edu.au/international/visas/" target="_blank">visa to study in Australia</a></li><li>Find out what the <a href="https://futurestudents.unimelb.edu.au/admissions/fees/ug-intl">fees</a> for your course are and what <a href="https://scholarships.unimelb.edu.au">scholarships</a> are available to help you</li><li>Check to see if you're eligible for any of our <a href="https://futurestudents.unimelb.edu.au/admissions/high_achievers_programs">High Achievers' programs</a></li><li>Review our <a href="https://futurestudents.unimelb.edu.au/admissions/admission_with_credit">Admission with credit pages</a> if you are currently studying at another university or have completed post-secondary studies.</li><li>Look at the <a href="http://services.unimelb.edu.au/finaid/planning/cost_of_living" target="_blank">cost of living in Melbourne</a>, and what <a href="http://services.unimelb.edu.au/finaid" target="_blank">financial 
assistance</a> is available to you while you study.</li></ul><p>If you'd prefer to speak to someone in person about your application, contact a <a href="https://futurestudents.unimelb.edu.au/info/overseas-representatives">University of Melbourne representative</a> in your country.</p></div><hr /><div class="col-1 first step-arrow"><h3>Step 2</h3></div><div class="col-5"><h2>How to apply</h2><h3>Studying an Australian or NZ Year 12</h3><p>If you are studying any Australian or NZ year 12 program (including WACE/AUSMAT, SACE/SAM or NCEA) whether in Australia or an another country you should apply through the Victorian Tertiary Admissions Centre (VTAC). Full details about the VTAC application process can be found on the <a href="http://www.vtac.edu.au" target="_blank">VTAC website</a>. Through VTAC, you can list up to 8 course preferences. You should list courses in your order of preference with the course of greatest interest listed first. Be sure that your application is finalised by the <a href="http://www.vtac.edu.au/dates.html" target="_blank">VTAC due dates</a>.</p><p>If you have previously applied to the University of Melbourne via VTAC but were not offered a place or were offered but never enrolled, you should apply as a new student.</p><h3>Current University of Melbourne international students</h3><p>If you're an international student currently studying at the University of Melbourne and you wish to transfer to another University of Melbourne course, you must submit your transfer application via the <a href="http://www.vtac.edu.au/" target="_blank">Victorian Tertiary Admissions Centre (VTAC)</a>. 
Be sure that your application is finalised by the <a href="http://www.vtac.edu.au/dates.html" target="_blank">VTAC due dates</a>.</p><h3>All other international students</h3><p>You can <a href="https://futurestudents.unimelb.edu.au/admissions/applications/online-application-info">apply online</a> using our e-application.</p><p>Please ensure your application is received by us before the <a href="https://futurestudents.unimelb.edu.au/admissions/dates">relevant deadline</a>. An application fee of AUD$100 applies. This fee is non-refundable.</p><p>Alternatively, if you would prefer to apply in person in your own country, we have a number of overseas representatives in a variety of countries. Search our list of <a href="https://futurestudents.unimelb.edu.au/info/overseas-representatives">overseas representatives</a> to find one near you.</p><p>If you have accepted an offer from another institution in Australia, been granted a Confirmation of Enrolment (COE) and want to transfer to the University of Melbourne within the first six months of study you will need a letter of release from that institution. Please see our <a href="https://futurestudents.unimelb.edu.au/admissions/applications/other-applications/transferring-course/international_student_transfer_policy">International Student Transfer Policy for more details</a>.</p><p><strong>International Baccalaureate and US Advanced Placement students</strong></p><p>You can choose to have your IB or AP results sent directly to the University as soon as they are released. 
Please make sure you advise us in your application if you have authorised the release of your results.</p><p>University of Melbourne AP institution code: 9015<br /> University of Melbourne IB institution code: 002406</p><h3>Study abroad or exchange</h3><p>If you're interested in studying at Melbourne for a shorter period - one or two semesters - please refer to the <a href="http://www.mobility.unimelb.edu.au/inbound/index.html">Melbourne Global Mobility</a> site.</p></div><hr /><div class="col-1 first step-arrow"><h3><a name="accept" id="accept"></a>Step 3</h3></div><div class="col-5"><h2><a name="accept" id="accept"></a>After you apply - accepting your offer</h2><h3>Acknowledgement</h3><p>When you submit your e-application you will be automatically sent an acknowledgement email.&nbsp;&nbsp;The acknowledgement letter will include your unique student ID and application reference number. Please quote these numbers in all correspondence with the University.</p><p>We will begin by checking that your application contains everything we need to begin assessment.&nbsp;&nbsp;If anything is missing we will email you.</p><p>If your application is complete and we do not require any further information then your application will be assessed. This takes approximately two to four weeks for undergraduate courses.</p><h3>Offer process</h3><p>If your application is successful, your offer letter will be emailed directly to you (and copied to your nominated authorised representative, unless you have applied through VTAC). If the offer is conditional, then you need to meet the conditions of your offer before accepting the offer. 
If you have been sent an unconditional offer, you can choose to accept it immediately.</p><p>To accept your offer follow the instructions in your offer letter.</p><p>If you choose not to accept the offer right away, you can also:</p><ul><li>Consider <a href="http://students.unimelb.edu.au/get-started">deferring your offer</a></li><li>Ask to be considered for a different course than the one you originally applied for: <ul><li><strong>If you applied through VTAC,</strong> you may be able to <a href="http://www.cop.unimelb.edu.au">change your preferences</a>.</li><li>If you applied directly using our e-application, you can login to your user account and change your preference order and/or submit a new application.</li><li><strong>If you applied directly not using our e-application,</strong> you will need to submit your change of preference via email to International Admissions.</li></ul></li><li><a href="https://futurestudents.unimelb.edu.au/admissions/applications/non-acceptance" target="_blank">Decline your offer</a></li></ul><p>Unsuccessful applicants will receive a letter by mail (or fax to your nominated authorised representative) explaining why the application has been unsuccessful.</p><h3>Are you under 18?</h3><p>If you are an international student who will be under 18 years of age when entering Australia, you will need to confirm you have appropriate accommodation, support and general welfare arrangements in place before you can accept your offer. You will need to meet one of the three requirements below:</p><ul><li>Living with a parent</li><li>Living with a relative</li><li>Other approved care arrangement.</li></ul><p>You can also enrol in the <a href="http://services.unimelb.edu.au/international/under18/supervision-program" target="_blank">University of Melbourne Under 18 Supervision Program</a>. 
Find out more about <a href="http://services.unimelb.edu.au/international/under18" target="_blank">students under 18</a>.</p></div><hr /><div class="col-1 first step-arrow"><h3>Step 4</h3></div><div class="col-5"><h2><a name="prepare" id="prepare"></a>Preparing for study</h2><div class="col-2" style="float:right; margin-top:10px;"><a href="https://my.unimelb.edu.au" target="_blank"><img src="https://futurestudents.unimelb.edu.au/__data/assets/image/0004/1094539/Student-contact-details-notice.jpg" /></a></div><p>Once you've received and accepted your offer, it's time to get ready to move to Melbourne! You'll need to find a place to live, decide whether you need to work while you study and learn about life in your new city. Below are some helpful resources, including enrolment information, to make the transition easier for you.</p><h3>Visas</h3><p>If you haven't already got your visa to study in Australia, now is the time to do that. All citizens of countries other than New Zealand or Australia need a visa to study here. You should have received information about applying for a student visa with your offer of a place from the University.</p><ul><li><a href="http://services.unimelb.edu.au/international/visas/apply" target="_blank">Applying for a student visa</a></li><li><a href="//services.unimelb.edu.au/international/visas/conditions-and-validity" target="_blank">Student visa conditions</a></li><li><a href="http://services.unimelb.edu.au/international/visas/oshc" target="_blank">Overseas Student Health Cover (OSHC)</a></li></ul><h3>Organising your arrival</h3> Each semester, International Student Services organises pre-departure briefings in a number of countries. All commencing international students and their families are invited to attend the briefings prior to your arrival in Melbourne. This will help you understand more about what life in Melbourne will be like. 
<ul><li><a href="http://services.unimelb.edu.au/international/planning" target="_blank">Pre-departure briefings</a></li></ul><p>If you can't make it to a pre-departure briefing or there isn't one near you, don't worry. There is a lot of information <a href="http://services.unimelb.edu.au/international/planning" target="_blank">right here</a> that can help you find your way.</p><h3>Accommodation</h3><p>Finding a place to live can be complicated from a distance. Melbourne offers plenty of housing options. Some students choose to live in campus residences, some choose to stay with an Australian family, while most Australian students choose 'share housing', where a number of students live together close to the University.</p><ul><li>Find out more about <a href="https://futurestudents.unimelb.edu.au/explore/accommodation">Accommodation in Melbourne</a></li></ul><p>Need somewhere to stay until longer term housing is available? You can request <a href="https://services.unimelb.edu.au/housing/moving-to-melbourne/temporary-accommodation">temporary accommodation</a> before you arrive in Melbourne. There's also <a href="https://services.unimelb.edu.au/housing">longer term housing</a> available for all students including Study Abroad and Exchange students.</p><ul><li>Read more about <a href="https://services.unimelb.edu.au/housing/moving-to-melbourne">moving to Melbourne</a></li></ul><h3>Enrolment and orientation</h3><p>The first step in your new academic life is enrolling and attending orientation, designed to help ease your entry into campus life. 
To assist you with your move to Australian tertiary study the <a href="http://studentconnect.unimelb.edu.au" target="_blank">Student Connect website</a> has advice and information to help you understand all aspects of university life, including what happens at <a href="//orientation.unimelb.edu.au" target="_blank">enrolment and orientation</a>.</p><ul><li><a href="http://services.unimelb.edu.au/international/life-and-study" target="_blank">Getting used to a new country</a></li></ul><h3>Work while you study</h3><p>Some students choose to work while they&rsquo;re studying. Student visas allow you to work, however you must comply with the conditions on your visa. For more information, see <a href="http://services.unimelb.edu.au/international/visas/working-while-studying" target="_blank">Work while studying</a></p><h3>Fun while you study</h3> Being at university isn't all hard work. Life on campus can be great fun too! For more information on activities and events outside of classes, take a look at the following: <ul><li><a href="https://futurestudents.unimelb.edu.au/explore/student-experience">Life at Melbourne</a></li><li><a href="http://www.sport.unimelb.edu.au/Clubs" target="_blank">Sports clubs</a></li><li><a href="http://union.unimelb.edu.au/clubs" target="_blank">Clubs and societies</a></li></ul><h3>Leadership and Volunteering</h3><p>Challenge yourself, develop confidence, or enhance your leadership/team and interpersonal skills. Would you like to get involved in the community, connect with others at university and make new friends? How about gaining work experience, going on an adventure or just having fun? If your answer is yes, come and visit&nbsp;<a href="http://equity.unimelb.edu.au/initiatives">Equity and student engagement initiatives</a>.</p><h3>More services</h3><p>Check out all of our fantastic support services to help you out while you study. Our services include: Careers and Employment, Child Care Services, Counselling Service and many more. 
See <a href="http://services.unimelb.edu.au/" target="_blank">Services for Students</a> for more information.</p></div>
</div>
    </div>"""
                ]))

            degree_name_urls = response.xpath(
                "//span[@class='category']/a/@href|"
                "//div[@class='parent-courses']/a/@href").extract()
            print("degree_name_urls: ", degree_name_urls)
            # if len(degree_name_urls) > 0:
            for link in degree_name_urls:
                # degree_url = "https://coursesearch.unimelb.edu.au" + link
                degree_url = "https://study.unimelb.edu.au" + link
                # print("===", degree_url)
                self.parse_data(degree_url, item)
                yield item
                # print("***")
                # yield scrapy.Request(url, callback=self.parse_data, meta={"programme_en": programme, "overview_en": overview_en, "career_en": career_en, "modules_en": modules})
        except Exception as e:
            with open("scrapySchool_Australian_ben/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 8

0

Exibir arquivo

Arquivo: LaTrobeUniversity_U.py Projeto: histudent/python_spider

    def parses(self, response):
        """Build undergraduate course items from a La Trobe University page.

        Degree names that still contain "Bachelor" two or more times once
        "(Honours)" is removed produce no item. Otherwise the shared course
        fields are filled in and one item is yielded per listed major, or a
        single item when the page lists no majors.

        Any exception raised while parsing is appended to a per-university
        error file and printed; it is not propagated to the caller.

        :param response: response for the course page; read through
            ``response.xpath`` and ``response.url``.
        :return: generator of populated items. The same item object is
            mutated and re-yielded for every major.
        """
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = 'La Trobe University'
        item['url'] = response.url
        item['degree_type'] = 1
        print("================================================")
        print(response.url)
        try:
            # Degree name
            degree_name = response.xpath(
                '//h1[contains(text(),"Bachelor of")]/text()').extract()
            clear_space(degree_name)
            item['degree_name'] = ''.join(degree_name).strip()
            print("item['degree_name']: ", item['degree_name'])

            pro_re = re.findall(r"Bachelor",
                                item['degree_name'].replace("(Honours)", ""))
            if len(pro_re) < 2:
                # Prefer a parenthesised qualifier as the programme name;
                # otherwise fall back to the degree name without its prefix.
                programme_re = re.findall(
                    r"\(.+\)",
                    item['degree_name'].replace("(Advanced)", "").replace(
                        "(Honours)", "").strip())
                if programme_re:
                    item['programme_en'] = ''.join(programme_re).replace(
                        "(", "").replace(")", "").strip()
                else:
                    item['programme_en'] = item['degree_name'].replace(
                        "Bachelor of", "").replace("(Honours)", "").replace(
                            "Master of ", "").strip()
                print("item['programme_en']: ", item['programme_en'])

                start_date = response.xpath(
                    '//div[contains(text(),"tart")]/following-sibling::div//text()'
                ).extract()
                start_text = ''.join(start_date)
                item['start_date'] = getStartDateMonth(start_text)
                if item['start_date'] == "":
                    # Keep the raw text when no month could be derived.
                    item['start_date'] = start_text.strip()

                duration = response.xpath(
                    '//div[contains(text(),"uration")]/following-sibling::div//text()'
                ).extract()
                item['duration'] = ''.join(duration).strip()

                fee = response.xpath(
                    '//h3[contains(text(),"tuition fee")]/following-sibling::p[1]/text()'
                ).extract()
                tuition = ''.join(fee).strip().replace(' ', '')
                # Only the first 99 characters are stored.
                item['tuition_fee'] = tuition[0:99]

                overview = response.xpath(
                    '//section[@id="overview"]/div[@class="block"]').extract()
                item['degree_overview_en'] = remove_class(
                    clear_lianxu_space(overview))

                rntry = response.xpath(
                    '//section[@id="entry-requirements"]').extract()
                item['rntry_requirements_en'] = remove_class(
                    clear_lianxu_space(rntry))

                career = response.xpath(
                    '//section[@id="career-outcomes"]').extract()
                item['career_en'] = remove_class(clear_lianxu_space(career))

                htp = response.xpath('//section[@id="how-to-apply"]').extract()
                item['apply_desc_en'] = remove_class(clear_lianxu_space(htp))

                # Campus codes taken from the first path segment of the URL;
                # used only when the page itself lists no location.
                location_dict = {
                    'BU': 'Melbourne',
                    'BE': 'Bendigo',
                    'CI': 'City',
                    'MI': 'Mildura',
                    'OT': 'Other',
                    'FS': 'Franklin Street',
                    'SH': 'Shepparton',
                    'SY': 'Sydney',
                    'ON': 'Online',
                    'WO': 'Albury-Wodonga',
                }
                location = response.xpath(
                    "//ul[@class='list-arrows']//li//text()").extract()
                item['location'] = ''.join(location).replace(
                    "(Bundoora)", "").strip()
                if item['location'] == "":
                    location_key = response.url.replace(
                        "https://www.latrobe.edu.au/courses/data/2019/international/",
                        "").strip()
                    location_key = location_key.split("/")[0]
                    item['location'] = location_dict.get(location_key.upper())

                ielts = response.xpath(
                    '//p[contains(text(),"IELTS")]/text()').extract()
                item['ielts_desc'] = ''.join(ielts).strip()

                ielts = get_ielts(item['ielts_desc'])
                item['ielts'] = ielts.get('IELTS')
                item['ielts_l'] = ielts.get('IELTS_L')
                item['ielts_s'] = ielts.get('IELTS_S')
                item['ielts_r'] = ielts.get('IELTS_R')
                item['ielts_w'] = ielts.get('IELTS_W')

                modules_url = response.xpath(
                    '//ul[@class="list-arrows"]/li[1]/a/@href').extract()
                clear_space(modules_url)
                if modules_url:
                    try:
                        item['modules_en'] = self.parse_modules(modules_url[0])
                    except Exception:
                        # Best effort: a failed modules lookup must not drop
                        # the whole course.
                        item['modules_en'] = ""

                item[
                    'apply_proces_en'] = "https://www.latrobe.edu.au/international/how-to-apply/undergraduate-and-postgraduate"

                item['overview_en'] = item['degree_overview_en']

                # Page locations where majors may be listed. The same bases
                # are reused below to look up each major's link.
                major_bases = (
                    '//p[contains(text(),"Melbourne majors")]/following-sibling::ul[1]/li',
                    '//p[contains(text(),"Choose from five majors")]/following-sibling::ul[1]/li',
                    '//th[contains(text(),"Minors")]/../preceding-sibling::*//td[contains(text(),"Yes")][1]/preceding-sibling::td',
                    '//p[contains(text(),"disciplines:")]/following-sibling::ul[1]/li',
                    '//p[contains(text(),"subjects and electives including")]/following-sibling::ul[1]/li',
                )
                programme_major = response.xpath(
                    '|'.join(major_bases)).extract()
                print(len(programme_major))
                if not programme_major:
                    yield item
                else:
                    # enumerate() keeps the counter correct even when two
                    # majors have identical markup.
                    for position, maj in enumerate(programme_major, start=1):
                        print("***************************" +
                              str(position) +
                              "****************************")
                        major_name = remove_tags(maj)
                        link_suffix = ('//a[contains(text(),' + '"' +
                                       major_name + '"' + ')]/@href')
                        programme_major1 = response.xpath('|'.join(
                            base + link_suffix
                            for base in major_bases)).extract()
                        if len(programme_major1) == 0:
                            item['programme_en'] = major_name.replace(
                                "Yes", "").replace("*", "").strip()
                            # Reset the overview so a previous linked major's
                            # text does not leak into this item.
                            item['overview_en'] = item['degree_overview_en']
                            print("不用跳转的item['programme_en']_major: ",
                                  item['programme_en'])
                            yield item
                        else:
                            programme_dict_list = self.parse_major(
                                programme_major1[0], major_name)
                            print("programme_dict_list: ", programme_dict_list)
                            for programme_dict in programme_dict_list:
                                item['programme_en'] = programme_dict.get(
                                    'programme_en')
                                item['overview_en'] = programme_dict.get(
                                    'overview_en')
                                print("跳转之后的链接item['programme_en']_major: ",
                                      item['programme_en'])
                                print("跳转之后的链接item['overview_en']_major: ",
                                      item['overview_en'])
                                yield item
        except Exception as e:
            # Record the failure and carry on with the crawl.
            with open("scrapySchool_Australian_ben/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 9

0

Exibir arquivo

    def parse_data(self, response):
        """Parse a University of South Australia degree page.

        When the page links to specialisations, each link is requested with
        this same callback and nothing is scraped from the current page.
        Otherwise one item is built from the page and yielded, unless the
        degree name contains "Bachelor" two or more times or contains
        "research".

        Any exception raised while parsing is appended to a per-university
        error file and printed; it is not propagated to the caller.

        :param response: response for the degree page; read through
            ``response.xpath`` and ``response.url``.
        :return: generator of ``scrapy.Request`` objects or of one item.
        """
        # Check whether the degree has specialisations listed beneath it.
        specialisations = response.xpath(
            "//h2[contains(text(),'Specialisations')]/following-sibling::*//a/@href"
        ).extract()
        if specialisations:
            for link in specialisations:
                if "http" in link:
                    url = link
                else:
                    url = "http://study.unisa.edu.au" + link
                yield scrapy.Request(url, callback=self.parse_data)
            return

        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "University of South Australia"
        item['url'] = response.url
        print("===========================")
        print(response.url)
        item['degree_type'] = 1
        try:
            programme = response.xpath(
                "//div[@class='title-row']/h1/text()").extract()
            clear_space(programme)
            item['degree_name'] = ''.join(programme).replace(
                "(International)", "").strip()
            print("item['degree_name']: ", item['degree_name'])

            pro_re = re.findall(r"Bachelor", item['degree_name'])
            print("pre_re: ", pro_re)
            if len(pro_re) < 2:
                programme_re = re.findall(r"\(.+\)", item['degree_name'])
                print("programme_re: ", programme_re)
                bracketed = ''.join(programme_re).strip()
                if programme_re and bracketed != "(Honours)":
                    item['programme_en'] = bracketed.replace("(", "").replace(
                        ")", "").strip()
                else:
                    plain_name = item['degree_name'].replace(
                        "Bachelor of", "").replace("(Honours)", "").strip()
                    # Drop a standalone leading/trailing word "in".
                    # str.strip("in") would instead strip the characters
                    # 'i' and 'n', turning e.g. "Design" into "Desig".
                    item['programme_en'] = re.sub(r"^in\b|\bin$", "",
                                                  plain_name).strip()
                print("item['programme_en']: ", item['programme_en'])

                start_date = response.xpath(
                    "//span[contains(text(), 'Start')]/../text()").extract()
                clear_space(start_date)
                item['start_date'] = getStartDateMonth(', '.join(start_date))
                print("item['start_date']: ", item['start_date'])

                location = response.xpath(
                    "//span[contains(text(),'Campus')]/../a//text()"
                ).extract()
                clear_space(location)
                item['location'] = ''.join(location).strip()
                print("item['location']: ", item['location'])

                duration = response.xpath(
                    "//span[contains(text(),'Duration')]/../text()"
                ).extract()
                clear_space(duration)
                item['duration'] = ''.join(duration).strip()
                print("item['duration']: ", item['duration'])

                tuition_fee = response.xpath(
                    "//span[contains(text(),'2019: AUD$')]//text()|"
                    "//span[contains(text(),'Fees')]/../text()").extract()
                print("tuition_fee: ", tuition_fee)
                clear_space(tuition_fee)
                tuition_fee = getTuition_fee(''.join(tuition_fee))
                item['tuition_fee'] = str(tuition_fee)
                if item['tuition_fee'] == '0':
                    # A fee of '0' is stored as missing.
                    item['tuition_fee'] = None
                print("item['tuition_fee']: ", item['tuition_fee'])

                ielts = response.xpath(
                    "//span[contains(text(),'English Language Requirements')]/../ul//text()"
                ).extract()
                clear_space(ielts)
                item['ielts_desc'] = ' '.join(ielts).strip()
                print("item['ielts_desc']: ", item['ielts_desc'])

                # The first number in the description is taken as the
                # overall score.
                ieltlsrw = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
                if ieltlsrw:
                    item["ielts"] = ieltlsrw[0]

                # Sub-scores appear as e.g. "listening [6.0]".
                for skill, field in (("listening", "ielts_l"),
                                     ("speaking", "ielts_s"),
                                     ("reading", "ielts_r"),
                                     ("writing", "ielts_w")):
                    skill_re = re.findall(skill + r"\s\[.*?\]",
                                          item['ielts_desc'])
                    item[field] = ''.join(skill_re).replace(
                        skill, "").replace("[", "").replace("]", "").strip()
                print(
                    "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                    % (item['ielts'], item['ielts_l'], item['ielts_s'],
                       item['ielts_r'], item['ielts_w']))

                entry_requirements = response.xpath(
                    "//div[@class='page-info-block-inner']//ul[@id='entry-requirements']"
                ).extract()
                item['rntry_requirements_en'] = remove_class(
                    clear_lianxu_space(entry_requirements))
                print("item['rntry_requirements_en']: ",
                      item['rntry_requirements_en'])

                degree_overview_en = response.xpath(
                    "//h2[contains(text(),'Degree overview')]/../../.."
                ).extract()
                item['degree_overview_en'] = remove_class(
                    clear_lianxu_space(degree_overview_en))
                print("item['degree_overview_en']: ",
                      item['degree_overview_en'])

                overview_en = response.xpath(
                    "//h2[contains(text(),'Snapshot')]/..|"
                    "//h3[contains(text(),'Snapshot')]/..").extract()
                item['overview_en'] = remove_class(
                    clear_lianxu_space(overview_en))
                print("item['overview_en']: ", item['overview_en'])

                modules_en = response.xpath(
                    "//h2[@class='theme-white'][contains(text(), 'Degree structure')]/../..|"
                    "//h3[contains(text(),'Degree structure')]/../.."
                ).extract()
                item['modules_en'] = remove_class(
                    clear_lianxu_space(modules_en))
                print("item['modules_en']: ", item['modules_en'])

                career_en = response.xpath(
                    "//h2[contains(text(),'Your career')]/../../..|"
                    "//h3[contains(text(),'Your career')]/..").extract()
                item['career_en'] = remove_class(
                    clear_lianxu_space(career_en))
                print("item['career_en']: ", item['career_en'])

                apply_desc_en = response.xpath(
                    "//h2[contains(text(),'How to apply')]/../../.."
                ).extract()
                item['apply_desc_en'] = remove_class(
                    clear_lianxu_space(apply_desc_en))
                print("item['apply_desc_en']: ", item['apply_desc_en'])

                if "research" not in item['degree_name']:
                    yield item
        except Exception as e:
            # Record the failure and carry on with the crawl.
            with open("scrapySchool_Australian_ben/error/" +
                      item['university'] + str(item['degree_type']) +
                      ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 10

0

Exibir arquivo

Arquivo: SouthernCrossUniversity_U.py Projeto: histudent/python_spider

    def content(self, response):
        """Build undergraduate course items from a Southern Cross University page.

        Nothing is yielded when the degree name contains "Bachelor" two or
        more times, when the duration text does not mention "full", or when
        the location is exactly "SCU Online". Otherwise one item is yielded
        per specialisation, or a single item when the page lists none. When
        the number of specialisation headings and module sections differs,
        nothing is yielded.

        Any exception raised while parsing is appended to a per-university
        error file and printed; it is not propagated to the caller.

        :param response: response for the course page; read through
            ``response.xpath`` and ``response.url``.
        :return: generator of populated items. The same item object is
            mutated and re-yielded for every specialisation.
        """
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "Southern Cross University"
        item['url'] = response.url
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        try:
            programme = response.xpath(
                "//h1[@class='pageTitleFixSource']//text()").extract()
            clear_space(programme)
            item['degree_name'] = ''.join(programme)
            print("item['degree_name']: ", item['degree_name'])

            pro_re = re.findall(r"Bachelor", item['degree_name'])
            if len(pro_re) < 2:
                duration = response.xpath(
                    "//div[@id='international']//td[contains(text(),'Duration')]/following-sibling::td//text()"
                ).extract()
                clear_space(duration)
                item['duration'] = ''.join(duration).strip()
                print("item['duration']: ", item['duration'])

                if "full" in item['duration'].lower():
                    name_without_honours = item['degree_name'].replace(
                        "(Honours)", "").replace("with Honours", "")
                    programme_re = re.findall(r"\(.+\)", name_without_honours)
                    if programme_re:
                        # "(Honours)" was removed above, so any remaining
                        # parenthesised text is used as the programme name.
                        item['programme_en'] = ''.join(programme_re).replace(
                            "(", "").replace(")", "").strip()
                    else:
                        # Use the text after a standalone word "in".
                        # str.strip("in") would strip the characters 'i'
                        # and 'n', truncating e.g. "Education" to "Educatio".
                        in_match = re.search(r"\bin\s+(.*)",
                                             name_without_honours)
                        if in_match:
                            item['programme_en'] = in_match.group(1).strip()
                        else:
                            item['programme_en'] = item['degree_name'].replace(
                                "Bachelor of", "").replace("with Honours",
                                                           "").strip()
                    print("item['programme_en']: ", item['programme_en'])

                    overview = response.xpath(
                        "//div[@class='summary']").extract()
                    item['degree_overview_en'] = remove_class(
                        clear_lianxu_space(overview))
                    item['overview_en'] = item['degree_overview_en']

                    career = response.xpath(
                        "//h3[contains(text(), 'Career opportunities')]/.."
                    ).extract()
                    item['career_en'] = remove_class(
                        clear_lianxu_space(career))

                    start_date = response.xpath(
                        "//h3[contains(text(),'International students studying in Australia')]/..//div[@class='accordion course-apply-accordion']//div/h5/span//text()"
                    ).extract()
                    print("start_date: ", start_date)
                    if start_date:
                        item['start_date'] = ','.join(start_date).strip()
                    print("item['start_date']: ", item['start_date'])

                    tuition_fee = response.xpath(
                        "//div[@id='international']//div[@class='table-grid table-responsive no-overflow']//tbody/tr/td[3]//text()"
                    ).extract()
                    clear_space(tuition_fee)
                    item['tuition_fee'] = '; '.join(tuition_fee).strip()
                    print("item['tuition_fee']: ", item['tuition_fee'])

                    IELTS = response.xpath(
                        "//tr[@class='data-label-Overall']/td[2]//text()|//tr[@class='data-label-Overall Score']/td[2]//text()|"
                        "//td[contains(text(),'Overall Score')]/following-sibling::td//text()"
                    ).extract()
                    clear_space(IELTS)
                    item['ielts'] = ','.join(IELTS).strip()
                    print("item['ielts']: ", item['ielts'])

                    # Each sub-score sits in its own table row; every list
                    # is cleaned before joining.
                    for row_label, field in (("Listening", "ielts_l"),
                                             ("Speaking", "ielts_s"),
                                             ("Reading", "ielts_r"),
                                             ("Writing", "ielts_w")):
                        band = response.xpath(
                            "//tr[@class='data-label-" + row_label +
                            "']/td[2]//text()").extract()
                        clear_space(band)
                        item[field] = ','.join(band).strip()

                    modules = response.xpath(
                        "//div[@id='structure']").extract()
                    item['modules_en'] = remove_class(
                        clear_lianxu_space(modules))

                    rntry_requirements_en = response.xpath(
                        "//h2[contains(text(),'Admission requirements')]|//h2[contains(text(),'Admission requirements')]/following-sibling::div[1]"
                    ).extract()
                    item['rntry_requirements_en'] = remove_class(
                        clear_lianxu_space(rntry_requirements_en))

                    how_to_apply = response.xpath(
                        "//div[@id='apply']").extract()
                    item['apply_desc_en'] = remove_class(
                        clear_lianxu_space(how_to_apply))

                    location = response.xpath(
                        "//div[@id='international']//td[contains(text(),'Availability details')]/following-sibling::td//tbody/tr[position()<last()]/td[1]//text()"
                    ).extract()
                    clear_space(location)
                    item['location'] = ', '.join(location).strip()
                    print("item['location']: ", item['location'])

                    if item['location'] != "SCU Online":
                        major_list = response.xpath(
                            "//h3[contains(text(),'Specialisations')]/../../following-sibling::tr[@class='header-row text group-hdr']//h4//text()"
                        ).extract()
                        clear_space(major_list)
                        print("major_list: ", major_list)
                        print(len(major_list))

                        if not major_list:
                            yield item
                        else:
                            modules_list = response.xpath(
                                "//h3[contains(text(),'Specialisations')]/../../following-sibling::tr[@class='header-row text group-hdr']//h4/following-sibling::div"
                            ).extract()
                            print("===", modules_list)
                            print(len(modules_list))
                            # Pair headings with module sections only when
                            # the two lists line up one-to-one.
                            if len(modules_list) == len(major_list):
                                for major_name, major_modules in zip(
                                        major_list, modules_list):
                                    item['programme_en'] = major_name
                                    item['modules_en'] = remove_class(
                                        clear_lianxu_space([major_modules]))
                                    print("item['programme_en']: ",
                                          item['programme_en'])
                                    yield item
        except Exception as e:
            # Record the failure and carry on with the crawl. The log goes
            # under the "ben" project, matching the item class used here.
            with open("scrapySchool_Australian_ben/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 11

0

Exibir arquivo

    def parse(self, response):
        """Scrape every degree listed on a UNSW page and yield one item each.

        Degree names, durations, entry terms, first-year tuition fees and
        career sections are extracted as parallel lists and combined by
        position. Degrees whose name contains "Graduate" are not yielded.

        Any exception raised while scraping is appended to a per-university
        error file and printed; it is not re-raised.

        :param response: page response; its ``xpath`` method and ``url``
            attribute are used.
        :return: generator of populated items.
        """
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "The University of New South Wales"
        item['url'] = response.url
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        try:
            department = response.xpath(
                "//div[@class='inlinevideo-inner']//div[@class='contentarea-title']/h3//text()"
            ).extract()
            item['department'] = ''.join(department).strip()
            print("item['department']: ", item['department'])

            # Degree names, one entry per degree block on the page.
            # clear_space's return value is never used, so it presumably
            # cleans the list in place -- TODO confirm.
            degree_type = response.xpath(
                "//section//div[@class='degree js-degree']//h5//text()"
            ).extract()
            clear_space(degree_type)
            print(len(degree_type))
            print("degree_type: ", degree_type)

            duration = response.xpath(
                "//section//dt[contains(text(), 'Minimum years')]/following-sibling::dd[1]//text()"
            ).extract()
            clear_space(duration)
            print(len(duration))
            print("duration: ", duration)

            start_date = response.xpath(
                "//section//dt[contains(text(), 'Entry')]/following-sibling::dd[1]//text()"
            ).extract()
            clear_space(start_date)
            print(len(start_date))
            print("start_date: ", start_date)

            tuition_fee = response.xpath(
                "//section//dt[contains(text(), 'Estimated first year tuition')]/following-sibling::*[1]//text()"
            ).extract()
            clear_space(tuition_fee)
            print(len(tuition_fee))
            print("tuition_fee: ", tuition_fee)

            careerEle = response.xpath("//section//dl[last()]")
            print(len(careerEle))
            print("careerEle: ", careerEle)

            # The parallel lists are indexed (not zipped) on purpose: a length
            # mismatch raises IndexError and is logged by the handler below
            # instead of silently dropping degrees.
            for i, degree_name in enumerate(degree_type):
                print("-------------------" + str(i) + "-----------------")
                item['degree_name'] = degree_name
                print("item['degree_name']: ", item['degree_name'])

                # Course length.
                item['duration'] = duration[i]
                print("item['duration']: ", item['duration'])

                # Start dates: "X and Y" becomes one entry per intake;
                # str.split returns the whole string when "and" is absent.
                # NOTE(review): replace("0", "") removes every zero, not only
                # leading ones -- confirm getStartDate never returns a value
                # with a significant zero (e.g. "10").
                start_parts = start_date[i].split("and")
                start_date_str = "".join(
                    getStartDate(part).replace("0", "") + ","
                    for part in start_parts)
                item['start_date'] = start_date_str.strip().strip(',').strip()
                print("item['start_date']: ", item['start_date'])

                # Tuition fee, without currency prefix and thousands commas.
                item['tuition_fee'] = tuition_fee[i].replace(
                    "AUD $", "").replace(",", "").strip()
                print("item['tuition_fee']: ", item['tuition_fee'])

                careerRe = careerEle[i].xpath(
                    ".//dt[contains(text(), 'Career Opportunities')]|.//dt[contains(text(), 'Career Opportunities')]/following-sibling::dd[1]"
                ).extract()
                item['career_en'] = remove_class(clear_lianxu_space(careerRe))
                print("item['career_en']: ", item['career_en'])

                if "Graduate" not in item['degree_name']:
                    yield item
        except Exception as e:
            # Append rather than truncate, so an earlier failure is not lost
            # when a later page also fails.
            with open(".//scrapySchool_Australian_yan/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 12

0

Exibir arquivo

Arquivo: DeakinUniversity_U.py Projeto: histudent/python_spider

    def parse(self, response):
        """Scrape one Deakin University bachelor course page.

        Pages whose title mentions "Bachelor" two or more times yield
        nothing. For the remaining pages the course-level fields (IELTS,
        duration, location, overview, modules, start date, entry
        requirements, tuition fee, career, how to apply) are scraped. If the
        page links to major pages (hrefs containing "major"), each of them is
        downloaded with ``requests`` and one item per major is yielded;
        otherwise a single item for the course itself is yielded.

        Any exception raised while scraping is appended to a per-university
        error file and printed; it is not re-raised.

        :param response: page response; its ``xpath`` method and ``url``
            attribute are used.
        :return: generator of populated items.
        """
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "Deakin University"
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        # Course URLs and course titles, paired by position to build the
        # URL -> title lookup below.
        links = [
            "http://www.deakin.edu.au/course/bachelor-arts-international",
            "http://www.deakin.edu.au/course/bachelor-arts-honours-international",
            "http://www.deakin.edu.au/course/bachelor-arts-psychology-international",
            "http://www.deakin.edu.au/course/bachelor-arts-psychology-honours-international",
            "http://www.deakin.edu.au/course/bachelor-arts-advanced-honours-international",
            "http://www.deakin.edu.au/course/bachelor-arts-chinese-bachelor-commerce-international",
            "http://www.deakin.edu.au/course/bachelor-arts-master-teaching-secondary-international",
            "http://www.deakin.edu.au/course/bachelor-arts-bachelor-commerce-international",
            "http://www.deakin.edu.au/course/bachelor-arts-bachelor-laws-international",
            "http://www.deakin.edu.au/course/bachelor-arts-bachelor-science-international",
            "http://www.deakin.edu.au/course/bachelor-arts-master-arts-international-relations-international",
            "http://www.deakin.edu.au/course/bachelor-biomedical-science-international",
            "http://www.deakin.edu.au/course/bachelor-business-international",
            "http://www.deakin.edu.au/course/bachelor-business-sport-management-international",
            "http://www.deakin.edu.au/course/bachelor-commerce-international",
            "http://www.deakin.edu.au/course/bachelor-commerce-bachelor-information-systems-international",
            "http://www.deakin.edu.au/course/bachelor-commerce-bachelor-laws-international",
            "http://www.deakin.edu.au/course/bachelor-commerce-bachelor-science-international",
            "http://www.deakin.edu.au/course/bachelor-communication-advertising-international",
            "http://www.deakin.edu.au/course/bachelor-communication-digital-media-international",
            "http://www.deakin.edu.au/course/bachelor-communication-honours-international",
            "http://www.deakin.edu.au/course/bachelor-communication-journalism-international",
            "http://www.deakin.edu.au/course/bachelor-communication-public-relations-international",
            "http://www.deakin.edu.au/course/bachelor-computer-science-international",
            "http://www.deakin.edu.au/course/bachelor-construction-management-honours-international",
            "http://www.deakin.edu.au/course/bachelor-creative-arts-drama-international",
            "http://www.deakin.edu.au/course/bachelor-creative-arts-honours-international",
            "http://www.deakin.edu.au/course/bachelor-creative-arts-photography-international",
            "http://www.deakin.edu.au/course/bachelor-creative-arts-visual-arts-international",
            "http://www.deakin.edu.au/course/bachelor-creative-writing-international",
            "http://www.deakin.edu.au/course/bachelor-criminology-international",
            "http://www.deakin.edu.au/course/bachelor-criminology-bachelor-cyber-security-international",
            "http://www.deakin.edu.au/course/bachelor-criminology-bachelor-laws-international",
            "http://www.deakin.edu.au/course/bachelor-criminology-bachelor-psychological-science-international",
            "http://www.deakin.edu.au/course/bachelor-cyber-security-international",
            "http://www.deakin.edu.au/course/bachelor-design-3d-animation-international",
            "http://www.deakin.edu.au/course/bachelor-design-architecture-international",
            "http://www.deakin.edu.au/course/bachelor-design-architecture-bachelor-construction-management-honours-international",
            "http://www.deakin.edu.au/course/bachelor-design-digital-technologies-international",
            "http://www.deakin.edu.au/course/bachelor-design-visual-communication-international",
            "http://www.deakin.edu.au/course/bachelor-education-early-years-international",
            "http://www.deakin.edu.au/course/bachelor-education-primary-international",
            "http://www.deakin.edu.au/course/bachelor-environmental-engineering-honours-international",
            "http://www.deakin.edu.au/course/bachelor-environmental-science-environmental-management-and-sustainability-international",
            "http://www.deakin.edu.au/course/bachelor-environmental-science-honours-international",
            "http://www.deakin.edu.au/course/bachelor-environmental-science-marine-biology-international",
            "http://www.deakin.edu.au/course/bachelor-environmental-science-wildlife-and-conservation-biology-international",
            "http://www.deakin.edu.au/course/bachelor-exercise-and-sport-science-international",
            "http://www.deakin.edu.au/course/bachelor-exercise-and-sport-science-honours-international",
            "http://www.deakin.edu.au/course/bachelor-exercise-and-sport-science-bachelor-business-sport-management-international",
            "http://www.deakin.edu.au/course/bachelor-film-television-and-animation-international",
            "http://www.deakin.edu.au/course/bachelor-food-and-nutrition-sciences-honours-international",
            "http://www.deakin.edu.au/course/bachelor-forensic-science-international",
            "http://www.deakin.edu.au/course/bachelor-forensic-science-honours-international",
            "http://www.deakin.edu.au/course/bachelor-forensic-science-bachelor-criminology-international",
            "http://www.deakin.edu.au/course/bachelor-health-sciences-international",
            "http://www.deakin.edu.au/course/bachelor-health-sciences-honours-international",
            "http://www.deakin.edu.au/course/bachelor-health-sciences-bachelor-arts-international",
            "http://www.deakin.edu.au/course/bachelor-health-and-medical-science-honours-international",
            "http://www.deakin.edu.au/course/bachelor-health-and-physical-education-international",
            "http://www.deakin.edu.au/course/bachelor-information-systems-international",
            "http://www.deakin.edu.au/course/bachelor-information-systems-bachelor-information-technology-international",
            "http://www.deakin.edu.au/course/bachelor-information-technology-international",
            "http://www.deakin.edu.au/course/bachelor-information-technology-honours-international",
            "http://www.deakin.edu.au/course/bachelor-international-studies-international",
            "http://www.deakin.edu.au/course/bachelor-international-studies-global-scholar-international",
            "http://www.deakin.edu.au/course/bachelor-international-studies-bachelor-commerce-international",
            "http://www.deakin.edu.au/course/bachelor-laws-international",
            "http://www.deakin.edu.au/course/bachelor-laws-bachelor-international-studies-international",
            "http://www.deakin.edu.au/course/bachelor-nursing-international",
            "http://www.deakin.edu.au/course/bachelor-nursing-honours-international",
            "http://www.deakin.edu.au/course/bachelor-nursing-bachelor-midwifery-international",
            "http://www.deakin.edu.au/course/bachelor-nursing-bachelor-psychological-science-international",
            "http://www.deakin.edu.au/course/bachelor-nursing-bachelor-public-health-and-health-promotion-international",
            "http://www.deakin.edu.au/course/bachelor-nutrition-science-international",
            "http://www.deakin.edu.au/course/bachelor-nutrition-science-bachelor-commerce-international",
            "http://www.deakin.edu.au/course/bachelor-occupational-therapy-international",
            "http://www.deakin.edu.au/course/bachelor-property-and-real-estate-international",
            "http://www.deakin.edu.au/course/bachelor-property-and-real-estate-bachelor-commerce-international",
            "http://www.deakin.edu.au/course/bachelor-property-and-real-estate-bachelor-laws-international",
            "http://www.deakin.edu.au/course/bachelor-psychological-science-international",
            "http://www.deakin.edu.au/course/bachelor-psychological-science-honours-international",
            "http://www.deakin.edu.au/course/bachelor-public-health-and-health-promotion-international",
            "http://www.deakin.edu.au/course/bachelor-public-health-and-health-promotion-honours-international",
            "http://www.deakin.edu.au/course/bachelor-public-health-and-health-promotion-bachelor-commerce-international",
            "http://www.deakin.edu.au/course/bachelor-science-international",
            "http://www.deakin.edu.au/course/bachelor-science-honours-international",
            "http://www.deakin.edu.au/course/bachelor-science-master-teaching-secondary-international",
            "http://www.deakin.edu.au/course/bachelor-science-bachelor-laws-international",
            "http://www.deakin.edu.au/course/bachelor-social-work-international",
            "http://www.deakin.edu.au/course/bachelor-software-engineering-honours-international",
            "http://www.deakin.edu.au/course/bachelor-sport-development-international",
            "http://www.deakin.edu.au/course/bachelor-vision-science-master-optometry-international",
            "http://www.deakin.edu.au/course/bachelor-zoology-and-animal-science-international",
            "http://www.deakin.edu.au/course/bachelor-zoology-and-animal-science-honours-international",
        ]
        programme_list = [
            "Bachelor of Arts",
            "Bachelor of Arts (Honours)",
            "Bachelor of Arts (Psychology)",
            "Bachelor of Arts (Psychology) (Honours)",
            "Bachelor of Arts - Advanced (Honours)",
            "Bachelor of Arts - Chinese/Bachelor of Commerce",
            "Bachelor of Arts / Master of Teaching (Secondary)",
            "Bachelor of Arts/Bachelor of Commerce",
            "Bachelor of Arts/Bachelor of Laws",
            "Bachelor of Arts/Bachelor of Science",
            "Bachelor of Arts/Master of Arts (International Relations)",
            "Bachelor of Biomedical Science",
            "Bachelor of Business",
            "Bachelor of Business (Sport Management)",
            "Bachelor of Commerce",
            "Bachelor of Commerce/Bachelor of Information Systems",
            "Bachelor of Commerce/Bachelor of Laws",
            "Bachelor of Commerce/Bachelor of Science",
            "Bachelor of Communication (Advertising)",
            "Bachelor of Communication (Digital Media)",
            "Bachelor of Communication (Honours)",
            "Bachelor of Communication (Journalism)",
            "Bachelor of Communication (Public Relations)",
            "Bachelor of Computer Science",
            "Bachelor of Construction Management (Honours)",
            "Bachelor of Creative Arts (Drama)",
            "Bachelor of Creative Arts (Honours)",
            "Bachelor of Creative Arts (Photography)",
            "Bachelor of Creative Arts (Visual Arts)",
            "Bachelor of Creative Writing",
            "Bachelor of Criminology",
            "Bachelor of Criminology/Bachelor of Cyber Security",
            "Bachelor of Criminology/Bachelor of Laws",
            "Bachelor of Criminology/Bachelor of Psychological Science",
            "Bachelor of Cyber Security",
            "Bachelor of Design (3D Animation)",
            "Bachelor of Design (Architecture)",
            "Bachelor of Design (Architecture)/Bachelor of Construction Management (Honours)",
            "Bachelor of Design (Digital Technologies)",
            "Bachelor of Design (Visual Communication)",
            "Bachelor of Education (Early Years)",
            "Bachelor of Education (Primary)",
            "Bachelor of Environmental Engineering (Honours)",
            "Bachelor of Environmental Science (Environmental Management and Sustainability)",
            "Bachelor of Environmental Science (Honours)",
            "Bachelor of Environmental Science (Marine Biology)",
            "Bachelor of Environmental Science (Wildlife and Conservation Biology)",
            "Bachelor of Exercise and Sport Science",
            "Bachelor of Exercise and Sport Science (Honours)",
            "Bachelor of Exercise and Sport Science/Bachelor of Business (Sport Management)",
            "Bachelor of Film, Television and Animation",
            "Bachelor of Food and Nutrition Sciences (Honours)",
            "Bachelor of Forensic Science",
            "Bachelor of Forensic Science (Honours)",
            "Bachelor of Forensic Science/Bachelor of Criminology",
            "Bachelor of Health Sciences",
            "Bachelor of Health Sciences (Honours)",
            "Bachelor of Health Sciences/Bachelor of Arts",
            "Bachelor of Health and Medical Science (Honours)",
            "Bachelor of Health and Physical Education",
            "Bachelor of Information Systems",
            "Bachelor of Information Systems/Bachelor of Information Technology",
            "Bachelor of Information Technology",
            "Bachelor of Information Technology (Honours)",
            "Bachelor of International Studies",
            "Bachelor of International Studies (Global Scholar)",
            "Bachelor of International Studies/Bachelor of Commerce",
            "Bachelor of Laws",
            "Bachelor of Laws/Bachelor of International Studies",
            "Bachelor of Nursing",
            "Bachelor of Nursing (Honours)",
            "Bachelor of Nursing/Bachelor of Midwifery",
            "Bachelor of Nursing/Bachelor of Psychological Science",
            "Bachelor of Nursing/Bachelor of Public Health and Health Promotion",
            "Bachelor of Nutrition Science",
            "Bachelor of Nutrition Science/Bachelor of Commerce",
            "Bachelor of Occupational Therapy",
            "Bachelor of Property and Real Estate",
            "Bachelor of Property and Real Estate/Bachelor of Commerce",
            "Bachelor of Property and Real Estate/Bachelor of Laws",
            "Bachelor of Psychological Science",
            "Bachelor of Psychological Science (Honours)",
            "Bachelor of Public Health and Health Promotion",
            "Bachelor of Public Health and Health Promotion (Honours)",
            "Bachelor of Public Health and Health Promotion/Bachelor of Commerce",
            "Bachelor of Science",
            "Bachelor of Science (Honours)",
            "Bachelor of Science / Master of Teaching (Secondary)",
            "Bachelor of Science/Bachelor of Laws",
            "Bachelor of Social Work",
            "Bachelor of Software Engineering (Honours)",
            "Bachelor of Sport Development",
            "Bachelor of Vision Science/Master of Optometry",
            "Bachelor of Zoology and Animal Science",
            "Bachelor of Zoology and Animal Science (Honours)",
        ]
        programme_dict = dict(zip(links, programme_list))
        # None when the current URL is not one of the known course links.
        item['major_type1'] = programme_dict.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            programme = response.xpath(
                "//div[@class='module__banner-title']/h1//text()").extract()
            # clear_space's return value is never used, so it presumably
            # cleans the list in place -- TODO confirm.
            clear_space(programme)
            item['degree_name'] = ''.join(programme).strip()
            print("item['degree_name']: ", item['degree_name'])

            # Titles naming two or more Bachelor degrees are not scraped by
            # this callback.
            if item['degree_name'].count("Bachelor") >= 2:
                return

            # Programme name: the bracketed qualifier if there is one
            # (ignoring "(Honours)"), else the title without its degree
            # prefix.
            without_honours = item['degree_name'].replace("(Honours)", "")
            programme_re = re.findall(r"\(.+\)", without_honours)
            if programme_re:
                programme_en = ''.join(programme_re).replace(
                    "(", "").replace(")", "")
            else:
                programme_en = without_honours.replace(
                    "Master of ", "").replace("Bachelor of", "")
            item['programme_en'] = programme_en.strip().replace(
                "  ", " ").strip()
            print("item['programme_en']: ", item['programme_en'])

            ielts = response.xpath(
                "//h3[contains(text(),'English language requirements')]/../following-sibling::*[1]//text()"
            ).extract()
            clear_space(ielts)
            item['ielts_desc'] = ''.join(ielts).strip()
            print("item['ielts_desc']: ", item['ielts_desc'])

            ielts_d = get_ielts(item['ielts_desc'])
            item["ielts"] = ielts_d.get('IELTS')
            item["ielts_l"] = ielts_d.get('IELTS_L')
            item["ielts_s"] = ielts_d.get('IELTS_S')
            item["ielts_r"] = ielts_d.get('IELTS_R')
            item["ielts_w"] = ielts_d.get('IELTS_W')

            duration = response.xpath(
                "//h3[contains(text(),'Duration')]/../following-sibling::div//text()"
            ).extract()
            clear_space(duration)
            # Keep only the text up to and including "full time"/"full-time".
            duration_re = re.findall(r".*full[\s\-]time",
                                     ''.join(duration).strip())
            item['duration'] = ''.join(duration_re).strip()

            location = response.xpath(
                "//div[@class='module__summary--icon-wrapper']//h3[@class='course__subheading'][contains(text(),'Campuses')]/../following-sibling::div//text()"
            ).extract()
            clear_space(location)
            item['location'] = ' '.join(location).strip()
            # Remembered as the fallback for major pages with no campus text.
            location_tmp = item['location']

            overview = response.xpath(
                "//h2[contains(text(),'Course information')]/../.."
            ).extract()
            item['degree_overview_en'] = remove_class(
                clear_lianxu_space(overview))

            modules = response.xpath(
                "//div[@id='module__course-structure']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))

            start_date = response.xpath(
                "//li[contains(text(),'Start date:')]//text()").extract()
            clear_space(start_date)
            item['start_date'] = getStartDateMonth(
                ' '.join(start_date).strip())

            entry_requirements = response.xpath(
                "//div[@data-section='entry requirements']").extract()
            item['rntry_requirements_en'] = remove_class(
                clear_lianxu_space(entry_requirements))

            tuition_fee = response.xpath(
                "//div[@class='module__content-panel']//div[@class='module__key-information--item-content']/text()"
            ).extract()
            clear_space(tuition_fee)
            fee = getTuition_fee(''.join(tuition_fee))
            # A parsed fee of 0 is stored as None.
            item['tuition_fee'] = None if fee == 0 else fee

            career = response.xpath(
                "//div[@data-section='graduate outcomes']|//div[@data-section='graduate outcomes']/following-sibling::div[1]|"
                "//h3[contains(text(),'Career outcomes')]/../..").extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))

            how_to_apply = response.xpath(
                "//h3[contains(text(),'How to apply')]/../..").extract()
            item['apply_desc_en'] = remove_class(
                clear_lianxu_space(how_to_apply))

            major_list_url = response.xpath(
                "//h3[contains(text(), 'Major Sequences')]/..//a/@href|"
                "//h3[contains(text(), 'Major sequences')]/..//a/@href|"
                "//h3[contains(text(), 'Major sequences')]/following-sibling::ul[1]//a/@href|"
                "//td[contains(text(),'Major')]/preceding-sibling::td//a/@href"
            ).extract()
            clear_space(major_list_url)
            print("major_list_url: ", major_list_url)
            print(len(major_list_url))

            # Only hrefs that contain "major" are treated as major pages.
            major_url_l = [href for href in major_list_url if "major" in href]
            print("major_url_l: ", major_url_l)
            print(len(major_url_l))

            if not major_url_l:
                # No major pages: the course page itself is the item.
                item['url'] = response.url
                print("item['url']2: ", item['url'])
                yield item
                return

            headers_base = {
                'User-Agent':
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
            }
            # requests never times out by default; bound the wait so one
            # stalled server cannot hang this callback forever. A timeout is
            # raised as an exception and logged by the handler below.
            request_timeout = 30
            # NOTE(review): the same item object is mutated and yielded again
            # for every major -- confirm downstream consumers do not keep a
            # reference to a previously yielded item.
            for major_url in major_url_l:
                data = requests.get(major_url,
                                    headers=headers_base,
                                    timeout=request_timeout)
                response_major = etree.HTML(data.text)
                item['url'] = major_url
                print("item['url']_major: ", item['url'])

                programme_major = response_major.xpath(
                    "//div[@class='module__banner-title']/h1//text()")
                item['programme_en'] = ''.join(programme_major).strip()
                print("item['programme_en']_major: ",
                      item['programme_en'])

                location_major = response_major.xpath(
                    "//*[contains(text(),'Campuses')]/../following-sibling::div[1]//text()"
                )
                item['location'] = ''.join(location_major).strip()
                if item['location'] == "":
                    item['location'] = location_tmp

                # Serialise each matched element back to HTML and join.
                overview_en = response_major.xpath(
                    "//h2[contains(text(),'Overview')]/../..")
                overview_en_str = "".join(
                    etree.tostring(node, encoding='unicode', method='html')
                    for node in overview_en)
                item['overview_en'] = remove_class(
                    clear_lianxu_space([overview_en_str]))

                modules_en = response_major.xpath(
                    "//h2[contains(text(),'Explore units')]/../..")
                modules_en_str = "".join(
                    etree.tostring(node, encoding='unicode', method='html')
                    for node in modules_en)
                item['modules_en'] = remove_class(
                    clear_lianxu_space([modules_en_str]))
                yield item
        except Exception as e:
            # Record the failure and keep the crawl going.
            with open("scrapySchool_Australian_ben/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 13

0

Exibir arquivo

    def parse_data(self, response):
        """Parse a University of Melbourne course page and yield course items.

        Fills the degree name, programme, duration, location, start date,
        degree overview and a fixed set of IELTS/TOEFL thresholds, and
        delegates to ``self.parse_career``, ``self.parse_entry``,
        ``self.parse_fee`` and ``self.parse_modules`` for the linked
        sub-pages.

        Nothing is yielded when the location is "online" (compared
        case-insensitively), or when ``self.parse_modules`` returns a
        non-empty major list whose length differs from the major-overview
        list. When majors are present, the same item object is yielded once
        per major with ``programme_en`` and ``overview_en`` overwritten each
        time; otherwise the item is yielded once.

        Any exception raised while parsing is appended to a per-university
        error file and printed; it is not re-raised.
        """
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "The University of Melbourne"
        print("================================================")
        print(response.url)
        item['url'] = response.url
        item['degree_type'] = 1
        item['department'] = response.meta.get('department')
        print("item['department']: ", item['department'])
        try:
            title = ''.join(response.xpath(
                "//div[@class='headline']/h1/text()|"
                "//h1[@id='page-header']//text()").extract()).strip()
            item['degree_name'] = title
            print("item['degree_name']: ", item['degree_name'])

            # A "(...)" group or a "-..." tail in the title is treated as the
            # programme name and removed from the degree name.
            title_suffixes = re.findall(r"\(.*\)|\-.*", title)
            print(title_suffixes)
            if title_suffixes:
                suffix = ''.join(title_suffixes)
                item['degree_name'] = title.replace(suffix, '').strip()
                item['programme_en'] = suffix.translate(
                    str.maketrans('', '', "()-")).strip()
            else:
                item['programme_en'] = title.replace("Master of", "").strip()
            print("item['degree_name']=: ", item['degree_name'])
            print("item['programme_en']: ", item['programme_en'])

            duration_fragments = response.xpath(
                "//div[@class='course-length icn icn-duration']/text()|"
                "//li[contains(text(),'full time')]//text()").extract()
            clear_space(duration_fragments)
            duration_parts = getIntDuration(''.join(duration_fragments))
            # Only a two-element result is used: first element is the
            # amount, last element is the unit.
            if len(duration_parts) == 2:
                item['duration'] = duration_parts[0]
                item['duration_per'] = duration_parts[-1]

            campus_text = ''.join(response.xpath(
                "//li[@id='course-overview-campus']//text()|"
                "//li[contains(text(),'Campus')]//text()").extract())
            item['location'] = campus_text.replace("On Campus", "").translate(
                str.maketrans('', '', "()")).strip()
            print("item['location']: ", item['location'])

            # Online courses produce no item at all.
            if item['location'].lower() == "online":
                return

            entry_periods = response.xpath(
                "//li[@id='course-overview-entryPeriods']//text()").extract()
            item['start_date'] = getStartDateMonth(''.join(entry_periods))

            course_content = response.xpath(
                "//div[@class='course-content']").extract()
            item['degree_overview_en'] = remove_class(
                clear_lianxu_space(course_content))

            career_href = response.xpath(
                "//a[contains(text(),'Where will this take me?')]/@href"
            ).extract_first()
            if career_href:
                item['career_en'] = self.parse_career(
                    parse.urljoin(response.url, career_href))

            entry_href = response.xpath(
                "//a[contains(text(),'Entry requirements')]/@href"
            ).extract_first()
            if entry_href:
                item['rntry_requirements_en'] = self.parse_entry(
                    parse.urljoin(response.url, entry_href))
            print("item['rntry_requirements_en']: ",
                  item['rntry_requirements_en'])

            fee_href = response.xpath(
                "//a[contains(text(),'Fees & scholarships')]/@href"
            ).extract_first()
            if fee_href:
                item['tuition_fee'] = self.parse_fee(
                    parse.urljoin(response.url, fee_href))
            print("item['tuition_fee']: ", item['tuition_fee'])

            # Fixed English-language thresholds, see
            # https://study.unimelb.edu.au/how-to-apply/english-language-requirements/undergraduate-english-language-requirements
            language_fields = (
                ('ielts_desc',
                 " you need a score of 6.5 or more in the Academic International English Language Testing System (IELTS), with no bands less than 6.0."),
                ("ielts", '6.5'),
                ("ielts_l", '6.0'),
                ("ielts_s", '6.0'),
                ("ielts_r", '6.0'),
                ("ielts_w", '6.0'),
                ('toefl_desc',
                 "a score of 79 and scores of 21 for writing, 18 for speaking, 13 for reading, 13 for listening for an internet-based test. To submit your scores when you apply, use our TOEFL Institution Code: 0974."),
                ("toefl", '79'),
                ("toefl_l", '13'),
                ("toefl_s", '18'),
                ("toefl_r", '13'),
                ("toefl_w", '21'),
            )
            for field_name, field_value in language_fields:
                item[field_name] = field_value

            print("ielts: ", item['ielts'], ' - ', item['ielts_l'], ' - ',
                  item['ielts_s'], ' - ', item['ielts_r'], ' - ',
                  item['ielts_w'])
            print("toefl: ", item['toefl'], ' - ', item['toefl_l'], ' - ',
                  item['toefl_s'], ' - ', item['toefl_r'], ' - ',
                  item['toefl_w'])

            # Follow the "What will I study?" link to obtain the modules and
            # the per-major data.
            study_href = response.xpath(
                "//a[contains(text(),'What will I study?')]/@href"
            ).extract_first()
            major_names = []
            major_overviews = []
            if study_href:
                study_data = self.parse_modules(
                    parse.urljoin(response.url, study_href))
                print("modules: ", study_data)
                item['modules_en'] = study_data[0]
                major_names = study_data[1]
                major_overviews = study_data[2]
                print(len(major_names), "=====", len(major_overviews))
                print(major_names)
            print("item['modules_en']: ", item['modules_en'])

            # No majors: emit the course once. Several majors: emit one item
            # per major, but only when every major has a matching overview.
            if len(major_names) == 0:
                yield item
            elif len(major_names) == len(major_overviews):
                for major_name, overview_nodes in zip(major_names,
                                                      major_overviews):
                    item['programme_en'] = major_name
                    overview_html = ''.join(
                        etree.tostring(
                            node,
                            encoding='unicode',
                            pretty_print=False,
                            method='html') for node in overview_nodes)
                    item['overview_en'] = remove_class(overview_html)
                    print("item['overview']==: ", item['overview_en'])
                    yield item

        except Exception as e:
            # Best-effort error log: append the failure and carry on.
            error_path = ("scrapySchool_Australian_ben/error/" +
                          item['university'] + str(item['degree_type']) +
                          ".txt")
            with open(error_path, 'a', encoding="utf-8") as log_file:
                log_file.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 14

0

Exibir arquivo

    def parse(self, response):
        """Turn one Federation University programme JSON response into an item.

        The response body is decoded as JSON. The item URL is the response
        URL with the API prefix swapped for the public course-page prefix.
        Titles in which "Bachelor" occurs two or more times are skipped and
        yield nothing; otherwise a single item is yielded.

        Any exception raised while parsing is appended to a per-university
        error file and printed; it is not re-raised.
        """
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "Federation University Australia"
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        course_page_url = response.url.replace(
            "https://study.federation.edu.au/api/programs_plan_code",
            "https://study.federation.edu.au/#/course")
        print("------------", course_page_url)
        item['url'] = course_page_url

        def replace_entities(markup, filler):
            # Swap every "&name;" sequence found in markup for filler.
            for entity in re.findall(r"&\w+;", markup):
                markup = markup.replace(entity, filler)
            return markup

        def tidy(markup):
            # Shared clean-up pipeline applied to HTML snippets.
            return remove_class(clear_lianxu_space([markup]))

        try:
            payload = json.loads(response.body)
            print(payload)

            intl = payload.get("international_details")

            degree_name = payload.get("title")
            item['degree_name'] = degree_name
            print("item['degree_name']: ", item['degree_name'])

            # Titles mentioning "Bachelor" more than once are not processed.
            if len(re.findall(r"Bachelor", degree_name)) >= 2:
                return

            # Programme name: the bracketed part of the title, unless that
            # part is missing or is just "(Honours)".
            bracketed = ''.join(re.findall(r"\(.+\)", degree_name))
            if not bracketed:
                programme_name = degree_name.replace("Bachelor of", "")
            elif bracketed == "(Honours)":
                programme_name = degree_name.replace(
                    "Bachelor of", "").replace("(Honours)", "")
            else:
                programme_name = bracketed.replace("(", "").replace(")", "")
            item['programme_en'] = programme_name.strip()
            print("item['programme_en']: ", item['programme_en'])

            item['location'] = intl.get("teaching_location")
            item['department'] = payload.get("school_dept")

            # Entities are dropped outright in the outline (no filler).
            outline_html = replace_entities(payload.get("outline"), "")
            item['degree_overview_en'] = tidy(outline_html)

            item['duration'] = intl.get("duration")
            item['start_date'] = payload.get("commences")

            # Careers: named entries wrapped in <p>, then the free-text blurb.
            listed_careers = payload.get("careers")
            listed_careers_html = ""
            if len(listed_careers) > 0:
                listed_careers_html = ''.join(
                    "<p>" + entry.get("name") + "</p>"
                    for entry in listed_careers)
            career_blurb = payload.get("career_opportunities")
            if "<p>" in career_blurb:
                career_blurb = replace_entities(career_blurb, " ").replace(
                    "<br>", " ")
                career_blurb = tidy(career_blurb)
            item['career_en'] = listed_careers_html + career_blurb

            item['tuition_fee'] = intl.get("annual_fee_int")

            academic_reqs = intl.get("academic_entry_requirements")
            extra_reqs = intl.get("extra_requirements")
            academic_reqs = replace_entities(academic_reqs, " ")
            item['rntry_requirements_en'] = tidy(academic_reqs) + tidy(
                extra_reqs)

            english_req = replace_entities(
                intl.get("english_language_requirement"), " ")
            english_doc = etree.fromstring('<!DOCTYPE html><html><body>' +
                                           english_req + '</body></html>')
            item['ielts_desc'] = ''.join(english_doc.xpath("//p//text()"))

            # Map the numbers found in the description onto the band fields;
            # each tuple gives the index used for overall, L, S, R, W.
            scores = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
            band_layouts = {
                1: (0, 0, 0, 0, 0),
                2: (0, 1, 1, 1, 1),
                3: (0, 1, 1, 2, 2),
            }
            layout = band_layouts.get(len(scores))
            if layout is not None:
                band_fields = ("ielts", "ielts_l", "ielts_s", "ielts_r",
                               "ielts_w")
                for band_field, position in zip(band_fields, layout):
                    item[band_field] = scores[position]

            # Modules: stringify the first major's year levels (or the empty
            # majors list itself) and strip list/dict punctuation.
            year_levels = payload.get("domestic_details").get(
                "program_structures").get("majors")
            if len(year_levels) != 0:
                year_levels = year_levels[0].get("year_levels")
            flattened = ''.join(str(level) for level in year_levels)
            item['modules_en'] = "<div>" + flattened.translate(
                str.maketrans("", "", "[]{}")) + "</div>"
            print("item['modules_en']: ", item['modules_en'])

            item['apply_proces_en'] = payload.get("apply_link")
            yield item
        except Exception as e:
            # Best-effort error log: append the failure and carry on.
            error_path = ("scrapySchool_Australian_ben/error/" +
                          item['university'] + str(item['degree_type']) +
                          ".txt")
            with open(error_path, 'a', encoding="utf-8") as log_file:
                log_file.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 15

0

Exibir arquivo

Arquivo: TheUniversityOfMelbourne_U1.py Projeto: histudent/python_spider

    def parse_data(self, response):
        """Build one item for a University of Melbourne major page.

        Degree-level fields are copied from ``response.meta``; the keys
        ``degree_type``, ``department``, ``duration``, ``location``,
        ``degree_description`` and ``tuition_fee`` are all required, and a
        missing key raises ``KeyError`` before the guarded section starts.
        The programme name, overview, careers text and subject list are
        read from the page itself.

        The description text is split at the first fragment equal to
        "Careers": fragments before it become ``overview_en``, that fragment
        and everything after it become ``career_en``. When there is no such
        fragment the whole description is the overview and ``career_en`` is
        empty.

        Entry-requirement, IELTS and TOEFL fields are not populated by this
        callback.

        Any exception raised while scraping the page is appended to a
        per-university error file and printed; it is not re-raised.
        """
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "The University of Melbourne"
        item['url'] = response.url
        item['degree_type'] = 1
        print("===========================")
        print(response.url)

        # Values passed along by the request that scheduled this callback.
        item['degree_name'] = response.meta['degree_type']
        print("item['degree_name']: ", item['degree_name'])

        item['department'] = response.meta['department']
        print("item['department']: ", item['department'])

        item['duration'] = response.meta['duration']
        print("item['duration']: ", item['duration'])

        item['location'] = response.meta['location']
        print("item['location']: ", item['location'])

        item['degree_overview_en'] = response.meta['degree_description']
        print("item['degree_overview_en']: ", item['degree_overview_en'])

        item['tuition_fee'] = response.meta['tuition_fee']
        print("item['tuition_fee']: ", item['tuition_fee'])
        try:
            programme = response.xpath(
                "//div[@class='headline']/h1/text()").extract()
            # NOTE(review): clear_space is used for its effect on the list it
            # is given; presumably it cleans the fragments in place - confirm.
            clear_space(programme)
            item['programme_en'] = ''.join(programme)
            print("item['programme_en']: ", item['programme_en'])

            description = response.xpath(
                "//div[@class='description']//text()").extract()
            clear_space(description)
            # Split at the "Careers" fragment. Without one, split at the end
            # so the overview keeps every fragment and the careers text is
            # empty (a -1 default would move the last fragment into careers).
            try:
                career_start = description.index("Careers")
            except ValueError:
                career_start = len(description)
            item['overview_en'] = ''.join(description[:career_start])
            item['career_en'] = ''.join(description[career_start:])

            subjects = ''.join(
                response.xpath("//html//li/div[1]/a/text()").extract())
            item['modules_en'] = (
                "Subjects you could take in this major: " + subjects)

            yield item
        except Exception as e:
            # Append rather than overwrite so earlier failures are kept, and
            # end each record with a newline so entries do not run together.
            with open("scrapySchool_Australian_ben/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 16

0

Exibir arquivo

Arquivo: EdithCowanUniversity_U.py Projeto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "Edith Cowan University"
        # item['country'] = 'Australia'
        # item['website'] = 'https://www.uts.edu.au'
        item['url'] = response.url
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        # 组合字典
        links = [
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-south-west",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-psychology-and-addiction-studies",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-psychology-and-counselling",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-psychology",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-psychology-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-psychology-criminology-and-justice",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-bachelor-of-commerce",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-bachelor-of-media-and-communication",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-bachelor-of-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-commerce-bachelor-of-arts-psychology",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-contemporary-arts",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-counselling",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-criminology-and-justice",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-criminology-and-justice-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-design",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-laws-bachelor-of-arts",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-laws-bachelor-of-criminology-and-justice",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-laws-bachelor-of-psychological-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-media-and-communication",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-psychological-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-psychology",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-psychology-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-social-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-social-science-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-social-work",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-social-work-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-youth-work",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-bachelor-of-commerce",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-commerce",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-commerce-professional",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-commerce-bachelor-of-arts-psychology",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-honours-bachelor-of-commerce",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-honours-bachelor-of-laws",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-hospitality-and-tourism-management",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-international-hotel-and-resort-management",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-laws-graduate-entry",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-laws",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-laws-bachelor-of-arts",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-laws-bachelor-of-commerce",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-laws-bachelor-of-criminology-and-justice",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-laws-bachelor-of-psychological-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-marketing-advertising-and-public-relations",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-exercise-and-sports-science-bachelor-of-commerce-sport-business",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-bachelor-of-commerce",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-sport-recreation-and-event-management",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-aviation",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-chemical-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-civil-and-environmental-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-civil-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-computer-systems-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-computer-systems-honours-bachelor-of-computer-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-electrical-and-renewable-energy-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-electrical-power-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-electronics-and-communications-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-instrumentation-control-and-automation-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-marine-and-offshore-engineering-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-mechanical-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-mechatronics-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-mechatronics-honours-bachelor-of-technology-motorsports",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-naval-architecture-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-ocean-engineering-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-petroleum-engineering-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-honours-bachelor-of-commerce",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-honours-bachelor-of-laws",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-honours-bachelor-of-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-bachelor-of-commerce",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-technology-aeronautical",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-technology-electronic-and-computer-systems",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-technology-engineering",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-technology-motorsports",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-health-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-health-science-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-medical-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-biomedical-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-exercise-and-sports-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-exercise-and-sports-science-bachelor-of-commerce-sport-business",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-exercise-science-and-rehabilitation",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-medical-science-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-occupational-therapy",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-occupational-therapy-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-paramedical-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-sports-science-and-football",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-sports-science-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-bachelor-of-commerce",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-speech-pathology",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-speech-pathology-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-nursing-studies",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-nursing",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-nursing-bachelor-of-science-midwifery",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-computer-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-computer-science-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-counter-terrorism-security-and-intelligence",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-computer-systems-honours-bachelor-of-computer-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-engineering-honours-bachelor-of-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-information-technology",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-information-technology-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-biological-sciences",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-conservation-and-wildlife-biology",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-cyber-security",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-environmental-management",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-marine-and-freshwater-biology",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-mathematics-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-physics-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-security",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-security-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-science-bachelor-of-commerce",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-sustainability",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-education-early-childhood-studies",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-education-primary",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-education-secondary",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-acting",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-arts-management",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-arts-management-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-dance",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-dance-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-arts-music-theatre",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-education-secondary",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-music",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-music-honours",
            "http://www.ecu.edu.au/degrees/courses/bachelor-of-performing-arts",
        ]
        programme_dict = {}
        programme_list = [
            "Bachelor of Arts",
            "Bachelor of Arts (South West)",
            "Bachelor of Arts (Psychology and Addiction Studies)",
            "Bachelor of Arts (Psychology and Counselling)",
            "Bachelor of Arts (Psychology)",
            "Bachelor of Arts (Psychology) Honours",
            "Bachelor of Arts (Psychology, Criminology and Justice)",
            "Bachelor of Arts Honours",
            "Bachelor of Arts/Bachelor of Commerce",
            "Bachelor of Arts/Bachelor of Media and Communication",
            "Bachelor of Arts/Bachelor of Science",
            "Bachelor of Commerce/Bachelor of Arts (Psychology)",
            "Bachelor of Contemporary Arts",
            "Bachelor of Counselling",
            "Bachelor of Criminology and Justice",
            "Bachelor of Criminology and Justice Honours",
            "Bachelor of Design",
            "Bachelor of Laws/Bachelor of Arts",
            "Bachelor of Laws/Bachelor of Criminology and Justice",
            "Bachelor of Laws/Bachelor of Psychological Science",
            "Bachelor of Media and Communication",
            "Bachelor of Psychological Science",
            "Bachelor of Science (Psychology)",
            "Bachelor of Science (Psychology) Honours",
            "Bachelor of Social Science",
            "Bachelor of Social Science Honours",
            "Bachelor of Social Work",
            "Bachelor of Social Work Honours",
            "Bachelor of Youth Work",
            "Bachelor of Arts/Bachelor of Commerce",
            "Bachelor of Commerce",
            "Bachelor of Commerce Professional",
            "Bachelor of Commerce/Bachelor of Arts (Psychology)",
            "Bachelor of Engineering Honours/Bachelor of Commerce",
            "Bachelor of Engineering Honours/Bachelor of Laws",
            "Bachelor of Hospitality and Tourism Management",
            "Bachelor of International Hotel and Resort Management",
            "Bachelor of Laws (Graduate Entry)",
            "Bachelor of Laws",
            "Bachelor of Laws/Bachelor of Arts",
            "Bachelor of Laws/Bachelor of Commerce",
            "Bachelor of Laws/Bachelor of Criminology and Justice",
            "Bachelor of Laws/Bachelor of Psychological Science",
            "Bachelor of Marketing, Advertising and Public Relations",
            "Bachelor of Science (Exercise and Sports Science)/Bachelor of Commerce (Sport Business)",
            "Bachelor of Science/Bachelor of Commerce",
            "Bachelor of Sport, Recreation and Event Management",
            "Bachelor of Aviation",
            "Bachelor of Engineering (Chemical) Honours",
            "Bachelor of Engineering (Civil and Environmental) Honours",
            "Bachelor of Engineering (Civil) Honours",
            "Bachelor of Engineering (Computer Systems) Honours",
            "Bachelor of Engineering (Computer Systems) Honours/Bachelor of Computer Science",
            "Bachelor of Engineering (Electrical and Renewable Energy) Honours",
            "Bachelor of Engineering (Electrical Power) Honours",
            "Bachelor of Engineering (Electronics and Communications) Honours",
            "Bachelor of Engineering (Instrumentation Control and Automation) Honours",
            "Bachelor of Engineering (Marine and Offshore Engineering) Honours",
            "Bachelor of Engineering (Mechanical) Honours",
            "Bachelor of Engineering (Mechatronics) Honours",
            "Bachelor of Engineering (Mechatronics) Honours/Bachelor of Technology (Motorsports)",
            "Bachelor of Engineering (Naval Architecture) Honours",
            "Bachelor of Engineering (Ocean Engineering) Honours",
            "Bachelor of Engineering (Petroleum Engineering) Honours",
            "Bachelor of Engineering Honours/Bachelor of Commerce",
            "Bachelor of Engineering Honours/Bachelor of Laws",
            "Bachelor of Engineering Honours/Bachelor of Science",
            "Bachelor of Engineering Science",
            "Bachelor of Science/Bachelor of Commerce",
            "Bachelor of Technology (Aeronautical)",
            "Bachelor of Technology (Electronic and Computer Systems)",
            "Bachelor of Technology (Engineering)",
            "Bachelor of Technology (Motorsports)",
            "Bachelor of Health Science",
            "Bachelor of Health Science Honours",
            "Bachelor of Medical Science",
            "Bachelor of Science (Biomedical Science)",
            "Bachelor of Science (Exercise and Sports Science)",
            "Bachelor of Science (Exercise and Sports Science)/Bachelor of Commerce (Sport Business)",
            "Bachelor of Science (Exercise Science and Rehabilitation)",
            "Bachelor of Science (Medical Science) Honours",
            "Bachelor of Science (Occupational Therapy)",
            "Bachelor of Science (Occupational Therapy) Honours",
            "Bachelor of Science (Paramedical Science)",
            "Bachelor of Science (Sports Science and Football)",
            "Bachelor of Science (Sports Science) Honours",
            "Bachelor of Science/Bachelor of Commerce",
            "Bachelor of Speech Pathology",
            "Bachelor of Speech Pathology Honours",
            "Bachelor of Science (Nursing Studies)",
            "Bachelor of Science (Nursing)",
            "Bachelor of Science (Nursing)/Bachelor of Science (Midwifery)",
            "Bachelor of Computer Science",
            "Bachelor of Computer Science Honours",
            "Bachelor of Counter Terrorism Security and Intelligence",
            "Bachelor of Engineering (Computer Systems) Honours/Bachelor of Computer Science",
            "Bachelor of Engineering Honours/Bachelor of Science",
            "Bachelor of Information Technology",
            "Bachelor of Information Technology Honours",
            "Bachelor of Science",
            "Bachelor of Science (Biological Sciences)",
            "Bachelor of Science (Conservation and Wildlife Biology)",
            "Bachelor of Science (Cyber Security)",
            "Bachelor of Science (Environmental Management)",
            "Bachelor of Science (Marine and Freshwater Biology)",
            "Bachelor of Science (Mathematics) Honours",
            "Bachelor of Science (Physics) Honours",
            "Bachelor of Science (Security)",
            "Bachelor of Science (Security) Honours",
            "Bachelor of Science Honours",
            "Bachelor of Science/Bachelor of Commerce",
            "Bachelor of Sustainability",
            "Bachelor of Education (Early Childhood Studies)",
            "Bachelor of Education (Primary)",
            "Bachelor of Education (Secondary)",
            "Bachelor of Arts (Acting)",
            "Bachelor of Arts (Arts Management)",
            "Bachelor of Arts (Arts Management) Honours",
            "Bachelor of Arts (Dance)",
            "Bachelor of Arts (Dance) Honours",
            "Bachelor of Arts (Music Theatre)",
            "Bachelor of Education (Secondary)",
            "Bachelor of Music",
            "Bachelor of Music Honours",
            "Bachelor of Performing Arts",
        ]
        for link in range(len(links)):
            url = links[link]
            programme_dict[url] = programme_list[link]
        item['major_type1'] = programme_dict.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            programme = response.xpath(
                "//h2[contains(text(), 'Bachelor of')]//text()").extract()
            clear_space(programme)
            programme = ''.join(programme).strip()
            item['degree_name'] = programme
            print("item['degree_name']: ", item['degree_name'])

            pro_re = re.findall(r"Bachelor", item['degree_name'])
            # print("pre_re: ", pro_re)
            if len(pro_re) == 1:
                programme_re = re.findall(
                    r"\(.+\)",
                    item['degree_name'].replace("Honours",
                                                "").replace("(Advanced)", ""))
                if len(programme_re) > 0:
                    item['programme_en'] = ''.join(programme_re).replace(
                        "(", "").replace(")", "").strip()
                else:
                    item['programme_en'] = item['degree_name'].replace(
                        "Bachelor of", "").strip()
                print("item['programme_en']: ", item['programme_en'])

                overview = response.xpath(
                    "//span[@id='overview']/..").extract()
                item['degree_overview_en'] = remove_class(
                    clear_lianxu_space(overview))
                # print("item['degree_overview_en']: ", item['degree_overview_en'])

                entry_requirements = response.xpath(
                    "//div[@id='before-you-start']").extract()
                entry_requirements_str = ''.join(entry_requirements).strip()
                item['rntry_requirements_en'] = remove_class(
                    clear_lianxu_space(entry_requirements))
                # print("item['rntry_requirements_en']: ", item['rntry_requirements_en'])

                modules = response.xpath(
                    "//h4[contains(text(),'Course structure')]|//div[@class='structure-heading']"
                ).extract()
                item['modules_en'] = remove_class(clear_lianxu_space(modules))
                # print("item['modules_en']: ", item['modules_en'])

                career = response.xpath(
                    "//h4[contains(text(),'Employment opportunities')]|//h4[contains(text(),'Employment opportunities')]/following-sibling::*[1]|"
                    "//h4[contains(text(),'Possible future job titles')]|//h4[contains(text(),'Possible future job titles')]/following-sibling::*[1]"
                ).extract()
                item['career_en'] = remove_class(clear_lianxu_space(career))
                if item['career_en'] == "":
                    print("***career_en 为空")
                print("item['career_en']: ", item['career_en'])

                location = response.xpath(
                    "//div[@class='courseOverview__info courseOverview__info--international courseOverview__info--noOnline']//div[@class='studyCampus__location studyCampus__location--joondalup studyCampus__location--active']/h4//text()|"
                    "//div[@class='courseOverview__info courseOverview__info--international courseOverview__info--noOnline']//div[@class='studyCampus__location studyCampus__location--mtLawley studyCampus__location--active']/h4//text()|"
                    "//div[@class='courseOverview__info courseOverview__info--international courseOverview__info--noOnline']//div[@class='studyCampus__location studyCampus__location--bunbury studyCampus__location--active']/h4//text()"
                ).extract()
                clear_space(location)
                location = ','.join(location).strip().strip().strip(
                    ',').strip()
                item['location'] = location
                location_tmp = item['location']
                print("item['location']: ", item['location'])

                duration = response.xpath(
                    "//div[@class='courseOverview__info courseOverview__info--international courseOverview__info--noOnline']//p[contains(text(),'year')]//text()|"
                    "//div[@class='courseOverview__info courseOverview__info--international']//p[contains(text(),'year')]//text()"
                ).extract()
                clear_space(duration)
                print("duration: ", duration)
                duration_re = re.findall(r"Start\sSemester.*",
                                         ''.join(duration).strip())
                print(duration_re, "===")
                item['start_date'] = ','.join(duration_re)
                item['duration'] = ''.join(duration).replace(
                    ''.join(duration_re), "").strip()
                print("item['duration']: ", item['duration'])

                other = response.xpath(
                    "//span[@class='courseOverview__subHeader alert-warning alert']//text()"
                ).extract()
                item['other'] = ''.join(other)
                print("item['other']: ", item['other'])

                # https://www.ecu.edu.au/future-students/course-entry/english-competency
                item[
                    'ielts_desc'] = "An overall band score of 6.0, with no individual band less than 6.0"
                item['ielts'] = "6.0"
                item['ielts_l'] = "6.0"
                item['ielts_s'] = "6.0"
                item['ielts_r'] = "6.0"
                item['ielts_w'] = "6.0"
                item[
                    'toefl_desc'] = "Minimum score of 70, with no individual score less than 17"
                item['toefl'] = "70"
                item['toefl_l'] = "17"
                item['toefl_s'] = "17"
                item['toefl_r'] = "17"
                item['toefl_w'] = "17"

                if "This course is not offered for study on-campus to international students with a student visa" not in item[
                        'other']:
                    major_list_url = response.xpath(
                        "//div[@class='section']//ul[@class='core-units']//a/@href"
                    ).extract()
                    clear_space(major_list_url)
                    print("major_list_url: ", major_list_url)
                    print(len(major_list_url))

                    if len(major_list_url) == 0:
                        item['url'] = response.url
                        print("item['url']2: ", item['url'])
                        yield item
                    else:
                        for major_url in major_list_url:
                            headers_base = {
                                'User-Agent':
                                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
                            }
                            data = requests.get(major_url,
                                                headers=headers_base)
                            response_major = etree.HTML(data.text)
                            item['url'] = major_url
                            print("item['url']_major: ", item['url'])

                            programme_major = response_major.xpath(
                                "//span[@id='overview']/following-sibling::h2//text()"
                            )
                            item['programme_en'] = ''.join(
                                programme_major).strip()
                            print("item['programme_en']_major: ",
                                  item['programme_en'])

                            location_major = response_major.xpath(
                                "//div[@class='studyCampus__location studyCampus__location--active']/h4//text()"
                            )
                            item['location'] = ','.join(
                                location_major).strip().strip(',').strip()
                            if item['location'] == "":
                                item['location'] = location_tmp
                            print("item['location']_major: ", item['location'])

                            overview_en = response_major.xpath(
                                "//span[@id='overview']/..")
                            overview_en_str = ""
                            if len(overview_en) > 0:
                                for o in overview_en:
                                    overview_en_str += etree.tostring(
                                        o, encoding='unicode', method='html')
                            item['overview_en'] = remove_class(
                                clear_lianxu_space([overview_en_str]))
                            print("item['overview_en']_major: ",
                                  item['overview_en'])

                            modules_en = response_major.xpath(
                                "//h4[contains(text(),'Structure')]|//h4[contains(text(),'Course structure')]|//div[@class='structure-heading']"
                            )
                            modules_en_str = ""
                            if len(modules_en) > 0:
                                for o in modules_en:
                                    modules_en_str += etree.tostring(
                                        o, encoding='unicode', method='html')
                            item['modules_en'] = remove_class(
                                clear_lianxu_space([modules_en_str]))
                            print("item['modules_en']_major: ",
                                  item['modules_en'])
                            yield item
        except Exception as e:
            with open("scrapySchool_Australian_yan/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 17

0

Exibir arquivo

Arquivo: UniversityofTechnologySydney_U.py Projeto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "University of Technology Sydney"
        # item['country'] = 'Australia'
        # item['website'] = 'https://www.uts.edu.au'
        item['url'] = response.url
        item['degree_type'] = 1
        print("===========================")
        print(response.url)
        item['major_type1'] = response.meta.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            # 2019/03/21 div的class值多了个空格，使用contains
            programme = response.xpath(
                '//div[@class="field-item"]/div[contains(@class,"page-title")]/h1//text()'
            ).extract()
            clear_space(programme)
            programme = ''.join(programme).strip()
            item['degree_name'] = programme
            print("item['degree_name']: ", item['degree_name'])

            de_p = re.findall(r"\(.+\)", item['degree_name'])
            if len(de_p) > 0:
                de_s = ''.join(de_p).strip()
                if de_s != "(Honours)":
                    item['programme_en'] = de_s.replace("(", "").replace(
                        ")", "").strip()
                else:
                    item['programme_en'] = item['degree_name'].replace(
                        "Bachelor of", "").replace("(Honours)", "").strip()
            else:
                item['programme_en'] = item['degree_name'].replace(
                    "Bachelor of", "").replace("(Honours)", "").strip()
            pro_re = re.findall(r"in\s.*", item['degree_name'])
            if len(pro_re) > 0:
                de_s = ''.join(pro_re).replace("in", "").strip()
                item['programme_en'] = de_s
            print("item['programme_en']: ", item['programme_en'])

            start_date = response.xpath(
                "//dt[contains(text(),'UAC')]/following-sibling::dd/span//text()"
            ).extract()
            clear_space(start_date)
            print(start_date)
            if len(start_date) > 0:
                start_date_re = re.findall(r"\w+\ssession",
                                           ' '.join(start_date))
                start_date_re = list(set(start_date_re))
                print("start_date_re: ", start_date_re)
                item['start_date'] = ','.join(start_date_re).replace(
                    "(", "").replace(")", "").replace(" session", "").strip()
            print("item['start_date']: ", item['start_date'])

            overview = response.xpath(
                '//div[@class="field field-dddd-view-modeluts-course-course__overview field-type-ds field-label-hidden"]'
            ).extract()
            item['degree_overview_en'] = remove_class(
                clear_lianxu_space(overview))
            # print("item['degree_overview_en']: ", item['degree_overview_en'])

            career = response.xpath(
                '//div[@class="field field-dddd-view-modeluts-course-course__careers field-type-ds field-label-hidden"]'
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            modules = response.xpath(
                "//div[@class='course__structure']").extract()
            if len(modules) == 0:
                print("*********")
                # modules = response.xpath("//div[@class='course__structure']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            print("item['modules_en']: ", item['modules_en'])

            location = response.xpath(
                '//div[@class="field field-dddd-view-modeluts-course-course__location field-type-ds field-label-hidden"]//p//text()'
            ).extract()
            clear_space(location)
            location = ''.join(location).strip()
            item['location'] = location
            # print("item['location']: ", item['location'])

            duration = response.xpath(
                '//div[@class="field field-dddd-view-modeluts-course-course__duration field-type-ds field-label-hidden"]//p//text()'
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            if len(duration) > 0:
                duration = duration[0]
            # print(duration)
            if "or" in duration:
                duration = duration.strip("or").strip()
            mode = re.findall("\w+\stime$", duration)
            # print(mode)
            mode = ''.join(mode)
            # item['mode'] = mode
            # print("item['mode']: ", item['mode'])
            item['duration'] = ''.join(duration.replace(mode, "").strip())
            # print("item['duration']: ", item['duration'])

            # feeDict = {'C04006v6': '15000', 'C04007v7': '15000', 'C04008v6': '15000', 'C04018v6': '19015', 'C04037v6': '17570', 'C04038v8': '18650', 'C04048v7': '18650', 'C04052v4': '19770', 'C04055v4': '15585', 'C04067v7': '18650', 'C04090v5': '17930', 'C04094v5': '17570', 'C04097v2': '17930', 'C04098v3': '17570', 'C04106v5': '16005', 'C04108v3': '14790', 'C04109v7': '14790', 'C04140v11': '16005', 'C04143v8': '20575', 'C04145v4': '20575', 'C04147v5': '22280', 'C04149v4': '21415', 'C04157v8': '19770', 'C04158v4': '19015', 'C04160v7': '20985', 'C04203v4': '14790', 'C04210v1': '16895', 'C04218v5': '19770', 'C04222v1': '19770', 'C04224v4': '20985', 'C04226v4': '17570', 'C04227v3': '17570', 'C04228v2': '16005', 'C04229v3': '17570', 'C04231v2': '15145', 'C04232v3': '15145', 'C04234v1': '19770', 'C04235v2': '17570', 'C04236v3': '22280', 'C04237v3': '18650', 'C04238v3': '18650', 'C04239v2': '14150', 'C04241v2': '18280', 'C04242v1': '20575', 'C04243v3': '17270', 'C04244v1': '13520', 'C04245v1': '14790', 'C04246v2': '16005', 'C04248v1': '16280', 'C04250v2': '22280', 'C04251v1': '20575', 'C04252v2': '19015', 'C04253v2': '19015', 'C04254v1': '14790', 'C04255v1': '12300', 'C04257v1': '11145', 'C04258v3': '18650', 'C04259v2': '18650', 'C04260v2': '18650', 'C04261v2': '18650', 'C04262v1': '14790', 'C04264v1': '22280', 'C04265v2': '18280', 'C04266v1': '17270', 'C04267v1': '18280', 'C04268v1': '13340', 'C04269v2': '13340', 'C04270v1': '17570', 'C04271v2': '17930', 'C04272v2': '17570', 'C04273v2': '17930', 'C04274v1': '17570', 'C04275v1': '17570', 'C04277v2': '17930', 'C04278v2': '17570', 'C04279v2': '16005', 'C04281v2': '18650', 'C04284v2': '14150', 'C04285v1': '15000', 'C04286v1': '18650', 'C04287v1': '18650', 'C04288v1': '15000', 'C04289v1': '18650', 'C04290v1': '15000', 'C04291v1': '14150', 'C04292v1': '16005', 'C04293v2': '17930', 'C04294v1': '15585', 'C04295v2': '19770', 'C04296v2': '19015', 'C04297v2': '19770', 'C04298v1': '14790', 'C04299v1': '18280', 'C04300v1': '18650', 
'C04301v1': '15000', 'C04302v1': '16005', 'C04303v1': '16005', 'C04304v3': '19015', 'C04305v1': '14415', 'C04306v1': '25070', 'C04307v1': '14415', 'C04309v2': '17930', 'C04314v1': '18650', 'C04315v1': '15585', 'C04316v2': '15000', 'C04317v1': '15000', 'C04319v1': '15000', 'C04320v1': '22280', 'C04321v1': '16565', 'C04322v1': '16005', 'C04323v1': '15585', 'C04324v2': '18650', 'C04325v2': '18650', 'C04368v1': '16895', 'C04369v1': '16895', 'C04371v1': '16895', 'C04372v1': '17930', 'C04373v1': '18650', 'C06006v5': '15000', 'C06009v8': '19015', 'C06017v7': '15000', 'C06033v4': '11145', 'C06037v4': '16005', 'C06041v6': '14790', 'C06058v7': '19770', 'C06096v3': '14415', 'C06097v1': '16895', 'C06099v1': '20575', 'C06100v2': '19015', 'C06101v1': '14790', 'C06102v1': '14790', 'C06103v1': '14790', 'C06104v1': '16565', 'C06105v1': '14790', 'C06106v1': '14790', 'C06107v1': '13340', 'C06108v1': '17930', 'C06109v1': '17570', 'C06110v1': '17570', 'C06113v1': '19770', 'C06114v2': '17930', 'C06115v2': '15000', 'C06116v1': '14415', 'C06118v2': '25070', 'C06119v1': '15585', 'C06121v1': '15585', 'C06122v1': '19015', 'C06123v1': '19770', 'C06124v1': '17930', 'C07002v7': '15000', 'C07004v4': '15000', 'C07012v7': '18650', 'C07018v5': '18650', 'C07019v6': '15000', 'C07021v8': '18650', 'C07027v8': '14150', 'C07028v9': '14150', 'C07029v7': '15000', 'C07044v4': '16005', 'C07048v7': '16005', 'C07073v5': '22280', 'C07074v5': '22280', 'C07075v4': '18280', 'C07078v3': '19015', 'C07080v7': '20985', 'C07107v3': '13520', 'C07112v4': '18650', 'C07113v3': '18650', 'C07118v1': '14790', 'C07119v1': '17270', 'C07120v2': '16895', 'C07122v1': '22280', 'C07124v1': '16005', 'C07125v1': '14790', 'C07126v1': '16005', 'C07128v1': '18650', 'C07129v1': '18650', 'C07132v1': '18650', 'C11001v5': '15000', 'C11005v5': '15000', 'C11008v7': '19015', 'C11015v8': '18650', 'C11017v5': '17570', 'C11021v5': '18650', 'C11027v5': '18650', 'C11039v4': '18650', 'C11048v3': '17930', 'C11051v3': '17570', 'C11054v2': '17570', 
'C11125v4': '20575', 'C11128v3': '16005', 'C11130v4': '20575', 'C11142v7': '19770', 'C11145v7': '20985', 'C11198v3': '18650', 'C11199v4': '18650', 'C11206v3': '18650', 'C11210v2': '17270', 'C11211v2': '22280', 'C11215v4': '11145', 'C11216v1': '18280', 'C11217v1': '20575', 'C11223v1': '14790', 'C11225v1': '17270', 'C11227v1': '16895', 'C11229v1': '20575', 'C11230v2': '19015', 'C11232v1': '18280', 'C11234v1': '17270', 'C11235v1': '13340', 'C11236v1': '17930', 'C11237v1': '17570', 'C11238v1': '17930', 'C11239v1': '17570', 'C11242v1': '16005', 'C11245v1': '15000', 'C11247v1': '19770', 'C11248v1': '17930', 'C11249v2': '15000', 'C11254v1': '14415', 'C11257v1': '15000', 'C11260v2': '25070', 'C11262v1': '16005', 'C11264v1': '22280', 'C11265v1': '20575', 'C11270v1': '15000', 'C11271v1': '15000', 'C11274v1': '17930', 'C01001v2': '12810', 'C01002v2': '12810', 'C01003v2': '12810', 'C01004v2': '12810', 'C01005v2': '12810', 'C02001v2': '13850', 'C02018v5': '17570', 'C02019v3': '12810', 'C02020v2': '12810', 'C02024v4': '16005', 'C02025v5': '13520', 'C02026v4': '13520', 'C02028v6': '15000', 'C02029v4': '16280', 'C02030v3': '18280', 'C02031v3': '18280', 'C02037v4': '12810', 'C02039v3': '13340', 'C02041v4': '12810', 'C02047v1': '16280', 'C02048v4': '16005', 'C02050v1': '12810', 'C02056v1': '15000', 'C02057v1': '16005', 'C02058v2': '16005', 'C02059v1': '15000', 'C02060v1': '15000', 'C02061v1': '16005', 'C02062v1': '16005', 'C02063v1': '15000', 'C03001v4': '13850', 'C03002v5': '13850', 'C03012v4': '13850', 'C03017v5': '17570', 'C03018v3': '12810', 'C03024v7': '15000', 'C03025v4': '16280', 'C03026v6': '18280', 'C03029v4': '18280', 'C03032v4': '12810', 'C03034v3': '13340', 'C03044v2': '12810', 'C03046v3': '16005', 'C03047v2': '12810', 'C03048v3': '16005', 'C03049v3': '16005', 'C03050v3': '16005', 'C03051v1': '16280', 'C03053v1': '15000', 'C03054v1': '15000', 'C03055v1': '16005', 'C03056v1': '15000', 'C03057v1': '15000', 'C03058v1': '16005', 'C03059v1': '15000'}
            feeDict = {}
            cod = [
                "C09004v6",
                "C09018v6",
                "C09019v4",
                "C09020v7",
                "C09022v3",
                "C09023v3",
                "C09026v3",
                "C09029v3",
                "C09031v3",
                "C09035v4",
                "C09046v2",
                "C09047v1",
                "C09048v2",
                "C09049v1",
                "C09050v1",
                "C09052v2",
                "C09055v2",
                "C09056v1",
                "C09057v2",
                "C09059v2",
                "C09060v1",
                "C09061v1",
                "C09063v2",
                "C09064v1",
                "C09066v2",
                "C09067v2",
                "C09068v2",
                "C09069v2",
                "C09070v2",
                "C09071v2",
                "C09072v2",
                "C09073v2",
                "C09074v2",
                "C09075v2",
                "C09076v2",
                "C09077v1",
                "C09078v1",
                "C09079v3",
                "C09081v1",
                "C09082v1",
                "C09083v2",
                "C09084v1",
                "C09085v2",
                "C09086v1",
                "C09087v2",
                "C09088v1",
                "C09089v2",
                "C09091v2",
                "C09093v2",
                "C09094v2",
                "C09095v2",
                "C09096v2",
                "C09097v1",
                "C09098v2",
                "C09099v1",
                "C09101v1",
                "C09119v1",
                "C09120v1",
                "C09121v1",
                "C10004v6",
                "C10007v8",
                "C10011v5",
                "C10020v4",
                "C10021v4",
                "C10026v4",
                "C10027v4",
                "C10039v10",
                "C10040v8",
                "C10044v7",
                "C10045v9",
                "C10054v5",
                "C10055v8",
                "C10056v5",
                "C10059v8",
                "C10061v6",
                "C10062v5",
                "C10063v6",
                "C10065v10",
                "C10066v7",
                "C10067v7",
                "C10068v9",
                "C10073v7",
                "C10074v6",
                "C10075v7",
                "C10076v7",
                "C10078v7",
                "C10079v6",
                "C10098v2",
                "C10115v9",
                "C10122v11",
                "C10123v7",
                "C10124v8",
                "C10125v9",
                "C10126v8",
                "C10129v6",
                "C10131v6",
                "C10132v4",
                "C10136v9",
                "C10137v4",
                "C10148v4",
                "C10152v4",
                "C10155v9",
                "C10157v6",
                "C10158v5",
                "C10162v6",
                "C10163v4",
                "C10164v7",
                "C10167v4",
                "C10169v5",
                "C10172v7",
                "C10174v5",
                "C10184v6",
                "C10186v9",
                "C10206v6",
                "C10208v5",
                "C10209v7",
                "C10214v3",
                "C10215v3",
                "C10219v4",
                "C10223v2",
                "C10224v3",
                "C10226v6",
                "C10227v4",
                "C10228v5",
                "C10229v4",
                "C10239v1",
                "C10242v2",
                "C10243v2",
                "C10244v2",
                "C10245v3",
                "C10246v1",
                "C10247v1",
                "C10248v1",
                "C10250v1",
                "C10251v1",
                "C10252v2",
                "C10253v2",
                "C10254v2",
                "C10255v1",
                "C10256v2",
                "C10257v2",
                "C10258v3",
                "C10259v3",
                "C10260v3",
                "C10261v3",
                "C10262v2",
                "C10263v3",
                "C10264v2",
                "C10265v4",
                "C10266v4",
                "C10269v2",
                "C10270v2",
                "C10271v4",
                "C10272v3",
                "C10273v2",
                "C10274v2",
                "C10275v1",
                "C10276v1",
                "C10277v1",
                "C10300v2",
                "C10301v2",
                "C10302v2",
                "C10303v2",
                "C10304v2",
                "C10305v2",
                "C10306v1",
                "C10307v1",
                "C10308v1",
                "C10309v1",
                "C10310v1",
                "C10311v1",
                "C10312v1",
                "C10313v1",
                "C10314v1",
                "C10315v1",
                "C10316v1",
                "C10317v1",
                "C10318v1",
                "C10319v1",
                "C10320v1",
                "C10321v2",
                "C10322v3",
                "C10323v2",
                "C10324v2",
                "C10325v3",
                "C10326v2",
                "C10327v2",
                "C10328v2",
                "C10330v2",
                "C10332v1",
                "C10333v1",
                "C10334v1",
                "C10335v1",
                "C10336v1",
                "C10337v1",
                "C10338v2",
                "C10339v1",
                "C10341v1",
                "C10342v2",
                "C10343v1",
                "C10345v1",
                "C10346v1",
                "C10347v2",
                "C10348v1",
                "C10349v3",
                "C10350v3",
                "C10351v2",
                "C10352v3",
                "C10353v2",
                "C10354v2",
                "C10355v3",
                "C10356v2",
                "C10359v2",
                "C10360v1",
                "C10361v1",
                "C10362v1",
                "C10363v1",
                "C10364v1",
                "C10365v1",
                "C10366v1",
                "C10367v1",
                "C10368v1",
                "C10369v1",
                "C10370v1",
                "C10371v1",
                "C10372v1",
                "C10373v2",
                "C10374v2",
                "C10375v2",
                "C10376v2",
                "C10377v2",
                "C10378v1",
                "C10379v1",
                "C10380v1",
                "C10381v1",
                "C10382v1",
                "C10383v1",
                "C10384v1",
                "C10385v1",
                "C10386v1",
                "C10387v1",
                "C10388v1",
                "C10389v2",
                "C10390v1",
                "C10391v1",
                "C10392v1",
                "C20049v1",
                "C20056v1",
                "C20059v1",
                "C20060v1 ",
            ]
            fee = [
                "18130",
                "18130",
                "20340",
                "18825",
                "18825",
                "18825",
                "18825",
                "18825",
                "18825",
                "18825",
                "18825",
                "16800",
                "18445",
                "15525",
                "18825",
                "18130",
                "18130",
                "18130",
                "15525",
                "18130",
                "18130",
                "18130",
                "15750",
                "18130",
                "21180",
                "21180",
                "19960",
                "19960",
                "19960",
                "19960",
                "19960",
                "19960",
                "19960",
                "19960",
                "19960",
                "18825",
                "18825",
                "18445",
                "17390",
                "15900",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "17735",
                "18130",
                "20340",
                "21180",
                "21180",
                "18445",
                "15750",
                "15525",
                "18130",
                "15525",
                "18130",
                "17390",
                "16800",
                "16800",
                "15525",
                "15525",
                "17090",
                "18130",
                "15750",
                "18130",
                "21180",
                "16800",
                "16800",
                "19960",
                "19960",
                "21180",
                "19960",
                "19960",
                "16800",
                "16800",
                "19960",
                "16800",
                "16800",
                "14195",
                "18825",
                "18130",
                "18130",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "17735",
                "21180",
                "17735",
                "20340",
                "20340",
                "17735",
                "17735",
                "17735",
                "18825",
                "18825",
                "17735",
                "18825",
                "18825",
                "18825",
                "18825",
                "18825",
                "17735",
                "15900",
                "15900",
                "15900",
                "15750",
                "15750",
                "20340",
                "18825",
                "17735",
                "18130",
                "18825",
                "18825",
                "20340",
                "20340",
                "18825",
                "18825",
                "18825",
                "21180",
                "17390",
                "17390",
                "15135",
                "15135",
                "15135",
                "17390",
                "17390",
                "15135",
                "14195",
                "15135",
                "15135",
                "20340",
                "20340",
                "20340",
                "20340",
                "17735",
                "20340",
                "16085",
                "18130",
                "18130",
                "18445",
                "18445",
                "18130",
                "18130",
                "18130",
                "18130",
                "18825",
                "19190",
                "19190",
                "15525",
                "15525",
                "15525",
                "15525",
                "18130",
                "18130",
                "18130",
                "18130",
                "18130",
                "18130",
                "15750",
                "15135",
                "15135",
                "20340",
                "15135",
                "15135",
                "20340",
                "15135",
                "15135",
                "20340",
                "15750",
                "18130",
                "18130",
                "18130",
                "18130",
                "18445",
                "18130",
                "20340",
                "15525",
                "18825",
                "17390",
                "17390",
                "17390",
                "17390",
                "17390",
                "17390",
                "21180",
                "16800",
                "18445",
                "17390",
                "17390",
                "20340",
                "18825",
                "19190",
                "18130",
                "15900",
                "15900",
                "18130",
                "19190",
                "18825",
                "18825",
                "17390",
                "18130",
                "16800",
                "15525",
                "19190",
                "19190",
                "16800",
                "16800",
                "19190",
                "19190",
                "16800",
                "16800",
                "16800",
                "16800",
                "16800",
                "16800",
                "19190",
                "16800",
                "16800",
                "19190",
                "16800",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "21180",
                "17735",
                "17735",
                "21180",
                "18825",
                "18825",
                "18825",
                "16085",
                "21180",
                "15900",
                "20755",
                "20755",
                "14855",
                "17090",
            ]
            for i in range(len(cod)):
                feeDict[cod[i]] = fee[i]
            # //div[@class='sidebar__info sidebar--info-codes']//dl/dd[1]
            feeIndex = response.xpath(
                "//div[@class='sidebar__info sidebar--info-codes']//dl/dd[1]//text()"
            ).extract()
            clear_space(feeIndex)
            print("---", feeIndex)
            v_re = re.findall(r"version\s\d", ''.join(feeIndex))
            print(v_re, "***")
            if feeIndex:
                feeIndexe = feeIndex[1] + ''.join(v_re).replace(
                    "version ", "v").strip()
                print('===', feeIndexe)
                item['tuition_fee'] = feeDict.get(feeIndexe)
            # feeIndex = ''.join(feeIndex)
            # print(feeIndex)
            # item['tuition_fee'] = feeDict.get(feeIndex)
            print("item['tuition_fee']: ", item['tuition_fee'])

            # //h4[@class='collapsible__title'][contains(text(),'Admission requirements')]/following-sibling::div[1]
            entry_requirements = response.xpath(
                "//h4[@class='collapsible__title'][contains(text(),'Admission requirements')]/following-sibling::div[1]"
            ).extract()
            entry_requirements_str = ''.join(entry_requirements).strip()
            item['rntry_requirements_en'] = remove_class(
                clear_lianxu_space(entry_requirements))
            print("item['rntry_requirements_en']: ",
                  item['rntry_requirements_en'])

            ieltsRe = re.findall(r"IELTS[0-9a-zA-Z:\.,\s]*;",
                                 entry_requirements_str)
            # print("ieltsRe: ", ieltsRe)
            toeflRe = re.findall(r"internet\sbased[0-9a-zA-Z:\.,\s-]*;",
                                 entry_requirements_str)
            # print("toeflRe: ", toeflRe)
            item['ielts_desc'] = ''.join(ieltsRe).strip()
            print("item['ielts_desc']: ", item['ielts_desc'])

            item['toefl_desc'] = ''.join(toeflRe).strip()
            print("item['toefl_desc']: ", item['toefl_desc'])

            # ieltsDict = {"Bachelor of Arts ": "7.5 overall,speaking 8.0,listening 8.0,reading 7.0,writing 7.0",
            #              "Bachelor of Education": "7.5 overall,speaking 8.0,listening 8.0,reading 7.0,writing 7.0",
            #              "Bachelor of Arts ": "7.5 overall,speaking 8.0,listening 8.0,reading 7.0,writing 7.0",
            #              "Bachelor of Education (Honours)": "7.5 overall,speaking 8.0,listening 8.0,reading 7.0,writing 7.0",
            #              "Bachelor of Education ": "7.5 overall,speaking 8.0,listening 8.0,reading 7.0,writing 7.0",
            #              "Bachelor of Arts in International Studies": "7.5 overall,speaking 8.0,listening 8.0,reading 7.0,writing 7.0",
            #              "Bachelor of Design (Honours) in Animation": "7.0 overall,writing 7.0",
            #              "Bachelor of Communication (Honours)": "7.0 overall,writing 7.0",
            #              "Bachelor of Education (Honours) in Primary Education": "7.0 overall,writing 7.0",
            #              "Bachelor of Nursing": "6.5 overall, writing 6.0",
            #              "Bachelor of Nursing ": "6.5 overall, writing 6.0",
            #              "Bachelor of Arts in International Studies": "6.5 overall, writing 6.0", }
            # if item['ielts_desc'] == "":
            #     item['ielts_desc'] = ieltsDict.get(item['degree_name'])
            #     if item['ielts_desc'] is None:
            #         item['ielts_desc'] = "6.5 overall, writing 6.0"
            # # print("item['ielts_desc']: ", item['ielts_desc'])
            #
            # toeflDict = {
            #     "Bachelor of Arts ": "102-109 overall, speaking 23-27, listening 23-27, reading 23-27, writing 24",
            #     "Bachelor of Education": "102-109 overall, speaking 23-27, listening 23-27, reading 23-27, writing 24 ",
            #     "Bachelor of Arts ": "102-109 overall, speaking 23-27, listening 23-27, reading 23-27, writing 24 ",
            #     "Bachelor of Education (Honours)": "102-109 overall, speaking 23-27, listening 23-27, reading 23-27, writing 24 ",
            #     "Bachelor of Education ": "102-109 overall, speaking 23-27, listening 23-27, reading 23-27, writing 24 ",
            #     "Bachelor of Arts in International Studies": "102-109 overall, speaking 23-27, listening 23-27, reading 23-27, writing 24 ",
            #     "Bachelor of Design (Honours) in Animation": "94-101 overall, writing 23  ",
            #     "Bachelor of Communication (Honours)": "94-101 overall, writing 23  ",
            #     "Bachelor of Education (Honours) in Primary Education": "94-101 overall, writing 23 ",
            #     "Bachelor of Nursing": "79-93 overall,writing 21 ",
            #     "Bachelor of Nursing ": "79-93 overall,writing 21 ",
            #     "Bachelor of Arts in International Studies": "79-93 overall,writing 21 ", }
            # if item['toefl_desc'] == "":
            #     item['toefl_desc'] = toeflDict.get(item['degree_name'])
            #     if item['toefl_desc'] is None:
            #         item['toefl_desc'] = "79-93 overall, writing 21"
            # # print("item['toefl_desc']: ", item['toefl_desc'])

            ielts_d = get_ielts(item['ielts_desc'])
            item["ielts"] = ielts_d.get('IELTS')
            item["ielts_l"] = ielts_d.get('IELTS_L')
            item["ielts_s"] = ielts_d.get('IELTS_S')
            item["ielts_r"] = ielts_d.get('IELTS_R')
            item["ielts_w"] = ielts_d.get('IELTS_W')
            print(
                "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                % (item['ielts'], item['ielts_l'], item['ielts_s'],
                   item['ielts_r'], item['ielts_w']))

            department = response.xpath(
                "//div[@class='field field-dddd-view-modeluts-course-course__part-of field-type-ds field-label-hidden']//div[@class='field-item']//p/a/text()"
            ).extract()
            clear_space(department)
            department = ''.join(department).replace("UTS:", "").strip()
            item['department'] = department
            print("item['department']: ", item['department'])

            apply_procces = response.xpath(
                "//h4[contains(text(),'International students')]/..").extract(
                )
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space(apply_procces))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            yield item
        except Exception as e:
            with open("scrapySchool_Australian_yan/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 18

0

Exibir arquivo

Arquivo: BondUniversity_U.py Projeto: histudent/python_spider

    def parse_data(self, response):
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "Bond University"
        # item['country'] = 'Australia'
        # item['website'] = 'https://bond.edu.au'
        item['url'] = response.url
        item['degree_type'] = 1
        item['major_type1'] = response.meta.get(response.url)
        print("===========================")
        print(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            degree_type = response.xpath(
                "//h1[@class='page-title']//text()").extract()
            clear_space(degree_type)
            degree_type = ''.join(degree_type)
            item['degree_name'] = degree_type
            print("item['degree_name']: ", item['degree_name'])
            programme = degree_type
            if "(Business)" in degree_type:
                item['programme_en'] = "Business"
            else:
                item['programme_en'] = degree_type.replace("Bachelor of",
                                                           "").strip()
            print("item['programme_en']: ", item['programme_en'])

            other = response.xpath(
                "//html//article/blockquote[1]//text()").extract()
            item['other'] = clear_lianxu_space(other)
            # print("item['other']: ", item['other'])

            overview = response.xpath(
                "//html//article/section[@class='section'][1]").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # if item['overview_en'] == "":
            #     print("***overview_en为空")
            print("item['overview_en']: ", item['overview_en'])

            degree_description = response.xpath(
                "//div[@id='show-less-0']|//section[@id='accordion-program']/p"
            ).extract()
            item['degree_overview_en'] = remove_class(
                clear_lianxu_space(degree_description))
            # if item['degree_overview_en'] == "":
            #     print("***degree_overview_en为空")
            # print("item['degree_overview_en']: ", item['degree_overview_en'])

            # //html//section[@id='accordion-program']/div[@class='table-responsive']//tr[2]/td[2]
            duration = response.xpath(
                "//strong[contains(text(),'Duration')]/../following-sibling::td[1]//text()"
            ).extract()
            clear_space(duration)
            # print("duration: ", duration)
            duration = ', '.join(duration)
            duration_re = re.findall(r"\d\ssemesters|\d\ssemester", duration)
            if len(duration_re) > 0:
                for d in duration_re:
                    item['duration'] = duration.replace(d, "").replace(
                        "(", "").replace(")", "").strip()
            else:
                item['duration'] = duration.replace("(",
                                                    "").replace(")",
                                                                "").strip()
            print("item['duration']: ", item['duration'])

            start_date = response.xpath(
                "//strong[contains(text(),'Starting semesters')]/../following-sibling::td[1]//text()"
            ).extract()
            clear_space(start_date)
            # print("start_date: ", start_date)
            monthDict = {
                "january": "01",
                "february": "02",
                "march": "03",
                "april": "04",
                "may": "05",
                "june": "06",
                "july": "07",
                "august": "08",
                "september": "09",
                "october": "10",
                "november": "11",
                "december": "12",
                "jan": "01",
                "feb": "02",
                "mar": "03",
                "apr": "04",
                "may": "05",
                "jun": "06",
                "jul": "07",
                "aug": "08",
                "sep": "09",
                "oct": "10",
                "nov": "11",
                "dec": "12",
                "sept": "09",
            }
            std = []
            start_date_re = re.findall(
                r"january|february|march|april|may|june|july|august|september|october|november|december",
                ','.join(start_date), re.I)
            # print(start_date_re)
            if len(start_date_re) > 0:
                for s in start_date_re:
                    std_tmp = monthDict.get(s.lower())
                    if std_tmp is not None:
                        std.append(std_tmp)
            std = list(set(std))
            item['start_date'] = ','.join(std).replace(
                "0", "").strip().strip(",").strip()
            print("item['start_date']: ", item['start_date'])

            career = response.xpath(
                "//div[@id='collapse-field_pgm_prof_out']|//a[@class='collapsed'][contains(text(),'Professional outcomes')]/../../.."
            ).extract()
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # if item['career_en'] == "":
            #     print("***career_en为空")
            print("item['career_en']: ", item['career_en'])

            modules = response.xpath(
                "//div[@id='collapse-field_pgm_str_sub']|//a[@class='collapsed'][contains(text(),'Structure and subjects')]/../../.."
            ).extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            # if item['modules_en'] == "":
            #     print("***modules_en为空")
            # print("item['modules_en']: ", item['modules_en'])

            tuition_fee = response.xpath(
                "//span[contains(@data-prefix,'Program fees 2019:')]//text()|//strong[contains(text(),'Program fees 2019')]/../text()|"
                "//strong[contains(text(),'2019 fees:')]/../text()").extract(
                )  # 2019.03.20 星期三
            clear_space(tuition_fee)
            # print("tuition_fee: ", tuition_fee)
            tuition_fee_re = re.findall(r"\d+,\d+", ''.join(tuition_fee))
            if len(tuition_fee_re) > 0:
                item['tuition_fee'] = tuition_fee_re[0].replace(",",
                                                                "").strip()
            print("item['tuition_fee']: ", item['tuition_fee'])

            entry_requirements = response.xpath(
                "//div[@id='collapse-field_pgm_ent_req']|//a[@data-toggle='collapse'][contains(text(),'Entry requirements')]/../../..|"
                "//h4[contains(text(),'Academic requirements')]/..").extract(
                )  # 2019.03.20 星期三
            item['rntry_requirements_en'] = remove_class(
                clear_lianxu_space(entry_requirements))
            if item['rntry_requirements_en'] == "":
                print("***rntry_requirements_en为空")
            # print("item['rntry_requirements_en']: ", item['rntry_requirements_en'])

            # "https://bond.edu.au/intl/future-students/bond-international/information-international-students/international-english-language-testing-requirements"
            ielt_desc_dict = {
                "Bachelor of Business Law":
                "IELTS score 7.0 No sub score less than 6.5",
                "Bachelor of Jurisprudence":
                "IELTS score 7.0 No sub score less than 6.5",
                "Bachelor of Medical Studies (BMedSt) and the Doctor of Medicine (MD)":
                "IELTS score 7.0 No sub score less than 6.5",
                "Bachelor of Laws":
                "IELTS score 7.0 No sub score less than 6.5",
                "Bachelor of Psychological Science":
                "IELTS score 7.0 No sub score less than 6.5",
                "Bachelor of Psychological Science (Honours)":
                "IELTS score 7.0 No sub score less than 6.5",
                "Bachelor of Arts":
                "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                "Bachelor of Communication":
                "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                "Bachelor of Communication (Business)":
                "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                "Bachelor of Film and Television":
                "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                "Bachelor of Film and Television (3 Year Program)":
                "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                "Bachelor of Global Studies (Sustainability)":
                "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                "Bachelor of International Relations":
                "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                "Bachelor of Interactive Media and Design":
                "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                "Bachelor of Journalism":
                "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                "Bachelor of Social Science":
                "IELTS score 6.5 Writing 6.5, Reading 6.0, Listening 6.0, Speaking 6.0",
                "Bachelor of Architectural Studies":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Construction Management and Quantity Surveying":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Sustainable Environments and Planning":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Arts":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Communication":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Communication (Business)":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Film and Television":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Film and Television (3 Year Program)":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Global Studies (Sustainability)":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of International Relations":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Interactive Media and Design":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Journalism":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Jurisprudence":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Laws":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Social Science":
                "IELTS score 6.5 No sub score less than 6.0",
                "Bachelor of Actuarial Science":
                "IELTS score 6.0 No sub score less than 6.0",
            }
            item['ielts_desc'] = ielt_desc_dict.get(item['degree_name'])
            print("item['ielts_desc']: ", item['ielts_desc'])

            if item['ielts_desc'] is not None:
                ieltlsrw = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
                if len(ieltlsrw) == 2:
                    item["ielts"] = ieltlsrw[0]
                    item["ielts_l"] = ieltlsrw[1]
                    item["ielts_s"] = ieltlsrw[1]
                    item["ielts_r"] = ieltlsrw[1]
                    item["ielts_w"] = ieltlsrw[1]
                elif len(ieltlsrw) == 1:
                    item["ielts"] = ieltlsrw[0]
                    item["ielts_l"] = ieltlsrw[0]
                    item["ielts_s"] = ieltlsrw[0]
                    item["ielts_r"] = ieltlsrw[0]
                    item["ielts_w"] = ieltlsrw[0]
                elif len(ieltlsrw) == 5:
                    item["ielts"] = ieltlsrw[0]
                    item["ielts_l"] = ieltlsrw[2]
                    item["ielts_s"] = ieltlsrw[2]
                    item["ielts_r"] = ieltlsrw[2]
                    item["ielts_w"] = ieltlsrw[1]
            print(
                "item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s "
                % (item['ielts'], item['ielts_l'], item['ielts_s'],
                   item['ielts_r'], item['ielts_w']))

            item['apply_desc_en'] = remove_class(
                clear_lianxu_space([
                    """<section class="section" id="section-8551"> <a id="application" name="application" class="anchor" ></a><h2 class="field field-name-field-title field-type-text field-label-hidden"> Application essentials</h2><div class="panel-group" id="accordion-8552" role="tablist" aria-multiselectable="true"><div class="panel panel-default"><div class="panel-heading"><h4 class="panel-title"> <a class="collapsed" data-toggle="collapse" data-parent="#accordion-8552" href="#collapse-8553"> Application process </a></h4></div><div id="collapse-8553" class="panel-collapse collapse"><div class="panel-body"><h3>Australian students</h3><p>Applications for most Bond programs can be lodged at any time directly to the University. With the exception of the <a href="https://bond.edu.au/intl/future-students/study-bond/search-program/medicine-bond">Medical Program</a>, you do not need to go through QTAC and your Bond application will not affect your QTAC application for other university programs. 
Apply direct to Bond University via our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a>.</p><h3>International students - full degree</h3><p>Apply directly to Bond using our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a> or through a representative in your country.</p><h3>English language students</h3><p>Apply to study English at Bond University College, located on the Bond University campus, by selecting your desired program below:</p><ul><li><a href="https://apply.bond.edu.au/">English for Academic Purposes</a></li><li><a href="https://college.bond.edu.au/apply-english">General English</a></li></ul><h3>Diploma, university preparation or foundation program students</h3><p>You can apply for your academic pathway through our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a>.</p><h3>Outbound exchange - Bond students</h3><p>Undergraduate Bond students must have completed two semesters prior to application while postgraduate Bondies can apply from their first semesters. Applications received in first and second semesters will be pending GPA requirements of 65% and above. Find out how to <a href="https://bond.edu.au/intl/future-students/bond-international/semester-abroad-exchange/outbound-bond">apply for exchange</a>.</p><h3>Inbound exchange students</h3><p>If your home institution has a formal exchange agreement with Bond, you will need to apply through them via their outbound exchange student application process. Your university may set certain academic performance standards for you to qualify for the program. Providing you meet their criteria, your home institution will contact Bond to nominate you as an exchange student and we will contact you to advise that your nomination has been successful. 
You will then be able to apply to Bond using our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a>, which must be accompanied by the required documentation. Exchange students pay their regular tuition fees to their home institution – not to Bond University.</p><h3>Study abroad students</h3><p>Firstly obtain approval from your home institution, then apply directly to Bond using our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a>; through a study abroad representative in your country; or through your home university if applicable.</p><ul></ul></div></div></div><div class="panel panel-default"><div class="panel-heading"><h4 class="panel-title"> <a class="collapsed" data-toggle="collapse" data-parent="#accordion-8552" href="#collapse-8554"> When can you start? </a></h4></div><div id="collapse-8554" class="panel-collapse collapse"><div class="panel-body"><p>Bond University runs three full semesters each year with intakes in January (Semester 1), May (Semester 2) and September (Semester 3). Our semesters are scheduled to coordinate with the Northern Hemisphere school/university timetables. (You’ll find that most other Australian universities offer only two semesters a year, meaning that you may have to wait until February or July before you can start your international studies.)</p></div></div></div><div class="panel panel-default"><div class="panel-heading"><h4 class="panel-title"> <a class="collapsed" data-toggle="collapse" data-parent="#accordion-8552" href="#collapse-8555"> Admissions criteria </a></h4></div><div id="collapse-8555" class="panel-collapse collapse"><div class="panel-body"><p>Bond University is committed to open and transparent admission processes, and to providing detailed information about the options and entry criteria that are relevant for you. 
</p><p>Learn more about our <a href="https://bond.edu.au/intl/future-students/study-bond/how-apply/undergraduate-admissions-criteria">undergraduate admissions criteria</a>. If you have further questions or wish to speak to one of our advisors, contact the <a href="https://bond.edu.au/intl/contact#ofs">Office of Future Students</a>.</p><p>For postgraduate study, the entry requirements are unique to each individual program. <a href="https://bond.edu.au/intl/future-students/study-bond/search-program#postgraduate">Search for your program</a> of interest to find out the specific entry requirements. </p></div></div></div><div class="panel panel-default"><div class="panel-heading"><h4 class="panel-title"> <a class="collapsed" data-toggle="collapse" data-parent="#accordion-8552" href="#collapse-8567"> Academic and English language entry requirements </a></h4></div><div id="collapse-8567" class="panel-collapse collapse"><div class="panel-body"><p>In addition to any performance standards stipulated by your home institution, you will also need to meet Bond’s academic and <a href="https://bond.edu.au/intl/future-students/bond-international/information-international-students/english-language-requirements">English language</a> requirements for the study program you have chosen.</p><p>If you need extra instruction, Bond offers <a href="https://college.bond.edu.au/english-at-bond">English classes</a> on campus through Bond University College, as well as a <a href="https://bond.edu.au/intl/program/bond-university-college-foundation-program">Foundation Program</a> to prepare you for university studies in Australia.</p></div></div></div></div></section>"""
                ]))
            item['apply_proces_en'] = remove_class(
                clear_lianxu_space([
                    """<div class="panel-group" id="accordion-8552" role="tablist" aria-multiselectable="true"><div class="panel panel-default"><div class="panel-heading"><h4 class="panel-title"> <a class="collapsed" data-toggle="collapse" data-parent="#accordion-8552" href="#collapse-8553"> Application process </a></h4></div><div id="collapse-8553" class="panel-collapse collapse"><div class="panel-body"><h3>Australian students</h3><p>Applications for most Bond programs can be lodged at any time directly to the University. With the exception of the <a href="https://bond.edu.au/intl/future-students/study-bond/search-program/medicine-bond">Medical Program</a>, you do not need to go through QTAC and your Bond application will not affect your QTAC application for other university programs. Apply direct to Bond University via our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a>.</p><h3>International students - full degree</h3><p>Apply directly to Bond using our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a> or through a representative in your country.</p><h3>English language students</h3><p>Apply to study English at Bond University College, located on the Bond University campus, by selecting your desired program below:</p><ul><li><a href="https://apply.bond.edu.au/">English for Academic Purposes</a></li><li><a href="https://college.bond.edu.au/apply-english">General English</a></li></ul><h3>Diploma, university preparation or foundation program students</h3><p>You can apply for your academic pathway through our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a>.</p><h3>Outbound exchange - Bond students</h3><p>Undergraduate Bond students must have completed two semesters prior to application while postgraduate Bondies can 
apply from their first semesters. Applications received in first and second semesters will be pending GPA requirements of 65% and above. Find out how to <a href="https://bond.edu.au/intl/future-students/bond-international/semester-abroad-exchange/outbound-bond">apply for exchange</a>.</p><h3>Inbound exchange students</h3><p>If your home institution has a formal exchange agreement with Bond, you will need to apply through them via their outbound exchange student application process. Your university may set certain academic performance standards for you to qualify for the program. Providing you meet their criteria, your home institution will contact Bond to nominate you as an exchange student and we will contact you to advise that your nomination has been successful. You will then be able to apply to Bond using our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a>, which must be accompanied by the required documentation. Exchange students pay their regular tuition fees to their home institution – not to Bond University.</p><h3>Study abroad students</h3><p>Firstly obtain approval from your home institution, then apply directly to Bond using our secure online <a href="https://apply.bond.edu.au/?refURL=/future-students/study-bond/how-apply" target="_blank">Application Form</a>; through a study abroad representative in your country; or through your home university if applicable.</p><ul></ul></div></div></div></div>"""
                ]))
            # print(item)
            # print("+++", "Graduate" not in item['degree_name'])
            if "/" not in item['degree_name'] or "/" not in item[
                    'degree_name'] and "online" not in item[
                        'degree_name'].lower():
                print("++++++++++++")
                major_list = response.xpath(
                    "//a[@id='majors']/following-sibling::div//div/div/h4/a//text()"
                ).extract()
                clear_space(major_list)
                print("major_list: ", major_list)
                print(len(major_list))

                if len(major_list) == 0:
                    yield item
                else:
                    modules_list = response.xpath(
                        "//a[@id='majors']/following-sibling::div//div/div/div"
                    ).extract()
                    print("===", modules_list)
                    print(len(modules_list))
                    if len(modules_list) == len(major_list):
                        for m in range(len(major_list)):
                            item['programme_en'] = major_list[m]
                            item['modules_en'] = remove_class(
                                clear_lianxu_space([modules_list[m]]))
                            print("item['programme_en']: ",
                                  item['programme_en'])
                            yield item

        except Exception as e:
            with open("scrapySchool_Australian_ben/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)

Exemplo n.º 19

0

Exibir arquivo

    def parse_data(self, response):
        """Parse one Victoria University bachelor course page into item(s).

        Populates an item created by ``get_item(ScrapyschoolAustralianBenItem)``
        with values scraped from ``response`` via XPath/regex, then yields it.

        Only pages whose title contains fewer than two occurrences of
        "Bachelor" and no "Graduate" are processed; other pages yield
        nothing. When a majors accordion is found, one item is yielded per
        major with ``programme_en`` set to the major's name; otherwise a
        single item is yielded.

        Any exception raised while scraping is appended (message + URL) to a
        text file under ``scrapySchool_Australian_ben/error/`` and printed;
        it is not re-raised.

        :param response: the downloaded course page; ``response.url`` and
            ``response.xpath(...)`` are used.
        """
        item = get_item(ScrapyschoolAustralianBenItem)
        item['university'] = "Victoria University"
        item['url'] = response.url
        item['degree_type'] = 1
        print("===========================")
        print(response.url)

        # Course names and their page URLs, paired by position.
        pro = [
            "Bachelor of Arts",
            "Bachelor of Biomedical and Exercise Science",
            "Bachelor of Biomedicine",
            "Bachelor of Commerce",
            "Bachelor of Community Development",
            "Bachelor of Construction Management (Honours)",
            "Bachelor of Criminal Justice",
            "Bachelor of Criminal Justice and Psychological Studies",
            "Bachelor of Early Childhood Education",
            "Bachelor of Education Studies",
            "Bachelor of Engineering (Honours) (Electrical and Sports Engineering)",
            "Bachelor of Human Nutrition",
            "Bachelor of Screen Media",
            "Bachelor of Social Work (Honours)",
            "Bachelor of Sport and Exercise Science (Honours)",
            "Bachelor of Sport Science (Human Movement)/Bachelor of Psychological Studies",
            "Bachelor of Youth Work",
        ]
        uu = [
            "https://www.vu.edu.au/courses/international/ABAB",
            "https://www.vu.edu.au/courses/international/HBES",
            "https://www.vu.edu.au/courses/international/HBBM",
            "https://www.vu.edu.au/courses/international/BBCA",
            "https://www.vu.edu.au/courses/international/ABCD",
            "https://www.vu.edu.au/courses/international/NHCM",
            "https://www.vu.edu.au/courses/international/ABCJ",
            "https://www.vu.edu.au/courses/international/ABCY",
            "https://www.vu.edu.au/courses/international/EBEC",
            "https://www.vu.edu.au/courses/international/EBST",
            "https://www.vu.edu.au/courses/international/NHES",
            "https://www.vu.edu.au/courses/international/HBNT",
            "https://www.vu.edu.au/courses/international/ABSN",
            "https://www.vu.edu.au/courses/international/ABSX",
            "https://www.vu.edu.au/courses/international/SHSP",
            "https://www.vu.edu.au/courses/international/SBHP",
            "https://www.vu.edu.au/courses/international/ABYW",
        ]
        # URL -> course name; URLs not listed above map to None.
        programme_dict = dict(zip(uu, pro))
        item['major_type1'] = programme_dict.get(response.url)
        print("item['major_type1']: ", item['major_type1'])
        try:
            programme = response.xpath(
                "//h1[@class='page-header']//text()").extract()
            # NOTE(review): clear_space's return value is never used here, so
            # it presumably cleans the list in place -- confirm in its module.
            clear_space(programme)
            item['degree_name'] = ''.join(programme).strip()
            print("item['degree_name']: ", item['degree_name'])

            # Skip double degrees (two or more "Bachelor") and graduate courses.
            pro_re = re.findall(r"Bachelor", item['degree_name'])
            if len(pro_re) < 2 and "Graduate" not in item['degree_name']:
                # Prefer the parenthesised specialisation as the programme
                # name, unless the parentheses only say "(Honours)".
                programme_re = re.findall(r"\(.+\)", item['degree_name'])
                if programme_re:
                    bracketed = ''.join(programme_re)
                    if bracketed != "(Honours)":
                        item['programme_en'] = bracketed.replace(
                            "(", "").replace(")", "").strip()
                    else:
                        item['programme_en'] = item['degree_name'].replace(
                            "Bachelor of", "").replace("(Honours)",
                                                       "").strip()
                else:
                    item['programme_en'] = item['degree_name'].replace(
                        "Bachelor of", "").strip()
                print("item['programme_en']: ", item['programme_en'])

                department = response.xpath(
                    "//div[@class='field field-name-field-college field-type-link-field field-label-inline clearfix']//div[@class='field-items']//text()"
                ).extract()
                clear_space(department)
                item['department'] = ''.join(department).strip()
                print("item['department']: ", item['department'])

                start_date = response.xpath(
                    "//div[@class='field field-essentials-intake']//div[@class='field-item']//text()|"
                    "//strong[contains(text(),'Intakes:')]/../div//text()"
                ).extract()
                clear_space(start_date)
                print("start_date: ", start_date)
                monthDict = {
                    "january": "01",
                    "february": "02",
                    "march": "03",
                    "april": "04",
                    "may": "05",
                    "june": "06",
                    "july": "07",
                    "august": "08",
                    "september": "09",
                    "october": "10",
                    "november": "11",
                    "december": "12",
                    "jan": "01",
                    "feb": "02",
                    "mar": "03",
                    "apr": "04",
                    "jun": "06",
                    "jul": "07",
                    "aug": "08",
                    "sep": "09",
                    "oct": "10",
                    "nov": "11",
                    "dec": "12",
                    "sept": "09",
                }
                # Fix: the pattern previously contained "febraugustuary"
                # instead of "august", so August intakes were never matched.
                start_date_re = re.findall(
                    r"january|february|march|april|may|june|july|august|september|october|november|december",
                    ''.join(start_date), re.I)
                months = []
                for s in start_date_re:
                    s1 = monthDict.get(s.lower().strip())
                    if s1 is not None:
                        # Fix: drop only the leading zero ("01" -> "1").
                        # Removing every "0" turned October ("10") into "1".
                        months.append(s1.lstrip("0"))
                # Comma-separated month numbers in page order, e.g. "2,7".
                item['start_date'] = ",".join(months)

                duration = response.xpath(
                    "//div[@class='field field-essentials-duration']//div[@class='field-item']//text()|"
                    "//strong[contains(text(),'Duration:')]/../div//text()"
                ).extract()
                clear_space(duration)
                item['duration'] = ''.join(duration).strip()

                location = response.xpath(
                    "//div[@class='field field-essentials-locations']//div[@class='field-items']//text()|"
                    "//strong[contains(text(),'Location:')]/../div//text()"
                ).extract()
                clear_space(location)
                item['location'] = ','.join(location).strip()

                tuition_fee = response.xpath(
                    "//div[@class='field field-essentials-short-fees']//div[@class='field-item']//text()|"
                    "//strong[contains(text(),'Fees:')]/../div//text()"
                ).extract()
                clear_space(tuition_fee)
                tuition_fee_str = ''.join(tuition_fee).strip()
                # Keep only "12,345"-style amounts, then drop the separator.
                # NOTE(review): several amounts on the page would be
                # concatenated into one digit string -- confirm this is wanted.
                tuition_fee_re = re.findall(r"\d+,\d+", tuition_fee_str)
                item['tuition_fee'] = ''.join(tuition_fee_re).replace(
                    ",", "").strip()

                # The overview HTML is stored under both overview fields.
                overview = response.xpath("//div[@id='overview']").extract()
                item['overview_en'] = item[
                    'degree_overview_en'] = remove_class(
                        clear_lianxu_space(overview))

                career = response.xpath("//div[@id='careers']").extract()
                item['career_en'] = remove_class(clear_lianxu_space(career))

                modules = response.xpath(
                    "//div[@id='course-structure']").extract()
                item['modules_en'] = remove_class(clear_lianxu_space(modules))

                # The 'rntry_requirements_en' spelling is the key used
                # throughout this method; it is kept as-is.
                entry_requirements = response.xpath(
                    "//html//article/div[4]").extract()
                item['rntry_requirements_en'] = remove_class(
                    clear_lianxu_space(entry_requirements))

                how_to_apply = response.xpath(
                    "//div[@id='apply-now']").extract()
                item['apply_desc_en'] = remove_class(
                    clear_lianxu_space(how_to_apply))

                # IELTS sentence(s): "IELTS" plus up to 120 following chars.
                ielts_desc_re = re.findall(r"IELTS.{1,120}",
                                           item['rntry_requirements_en'])
                item['ielts_desc'] = ''.join(ielts_desc_re).strip()

                # Map the numbers found in the description onto the overall
                # and per-band scores; any other count leaves them untouched.
                ieltlsrw = re.findall(r"\d[\d\.]{0,2}", item['ielts_desc'])
                if len(ieltlsrw) == 1:
                    # One number: used for the overall score and every band.
                    item["ielts"] = ieltlsrw[0]
                    item["ielts_l"] = ieltlsrw[0]
                    item["ielts_s"] = ieltlsrw[0]
                    item["ielts_r"] = ieltlsrw[0]
                    item["ielts_w"] = ieltlsrw[0]
                elif len(ieltlsrw) == 2:
                    # Two numbers: overall score, then a shared band minimum.
                    item["ielts"] = ieltlsrw[0]
                    item["ielts_l"] = ieltlsrw[1]
                    item["ielts_s"] = ieltlsrw[1]
                    item["ielts_r"] = ieltlsrw[1]
                    item["ielts_w"] = ieltlsrw[1]
                elif len(ieltlsrw) == 5:
                    # Five numbers: taken as overall, l, r, w, s in that order.
                    item["ielts"] = ieltlsrw[0]
                    item["ielts_l"] = ieltlsrw[1]
                    item["ielts_s"] = ieltlsrw[4]
                    item["ielts_r"] = ieltlsrw[2]
                    item["ielts_w"] = ieltlsrw[3]

                # TOEFL scores are hard-coded, not scraped from the page.
                item['toefl'] = "67"
                item['toefl_l'] = "12"
                item['toefl_s'] = "18"
                item['toefl_r'] = "15"
                item['toefl_w'] = "21"

                major_list = response.xpath(
                    "//div[contains(@class,'field-item even')]//h3[contains(text(),'Majors')]/../../../../../following-sibling::div[contains(@id,'accordion')]//h3/a//text()"
                ).extract()
                clear_space(major_list)
                print("major_list: ", major_list)
                print("===", clear_lianxu_space(major_list))
                if major_list:
                    # NOTE(review): the same item object is mutated and
                    # yielded once per major; if a downstream consumer keeps
                    # a reference, earlier yields will show the last major.
                    # Consider yielding a copy -- confirm the item type first.
                    for major in major_list:
                        item['programme_en'] = major.strip()
                        yield item
                else:
                    yield item
        except Exception as e:
            # Best-effort error log: one file per university + degree type.
            with open("scrapySchool_Australian_ben/error/" +
                      item['university'] + str(item['degree_type']) + ".txt",
                      'a',
                      encoding="utf-8") as f:
                f.write(
                    str(e) + "\n" + response.url +
                    "\n========================\n")
            print("异常：", str(e))
            print("报错url：", response.url)