def parse_fee(self, response):
    """Extract the tuition fee from this page and request the "applying" page.

    The values already stored in ``response.meta`` are forwarded unchanged,
    together with the fee computed by ``getTuition_fee`` from the page's
    ``<section class="content">`` markup.  The follow-up URL is the current
    URL with 'fees' replaced by 'applying'; it is handled by
    ``self.parse_apply``.
    """
    incoming = response.meta

    # Read the carried fields in the same order as before: five meta values,
    # then the fee, then the TOEFL value.
    carried = {
        key: incoming[key]
        for key in ('overview', 'modules', 'department', 'ielts',
                    'rntry_requirements')
    }
    fee_markup = response.xpath('//section[@class="content"]').extract()
    carried['tuition_fee'] = getTuition_fee(fee_markup)
    carried['toefl'] = incoming['toefl']

    next_url = response.url.replace('fees', 'applying')

    # Hand the fields on in the established key order.
    forwarded = {
        key: carried[key]
        for key in ('overview', 'ielts', 'toefl', 'department', 'modules',
                    'rntry_requirements', 'tuition_fee')
    }
    yield scrapy.Request(next_url, meta=forwarded, callback=self.parse_apply)
    def parse(self, response):
        """Build and yield one item for a University of Adelaide bachelor page.

        Every field is either a constant for this university or is read from
        ``response`` with a fixed XPath expression.  When the English-language
        table is missing, the hard-coded default IELTS/TOEFL scores are used.

        Args:
            response: the downloaded programme page.

        Yields:
            The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with
            all scraped fields filled in.
        """

        def joined(xpath):
            # Concatenate the markup of every node matched by ``xpath``.
            return ''.join(response.xpath(xpath).extract())

        item = get_item1(ScrapyschoolEnglandItem1)

        # 1. university
        university = 'The University of Adelaide'

        # 2. url
        url = response.url

        # 3. programme_en
        programme_en_a = joined('//*[@id="ua-main-content"]/h2/text()')
        full_title = remove_tags(programme_en_a).replace('Bachelor of',
                                                         '').strip()
        programme_en = full_title
        # Keep only the parenthesised specialisation when there is one.  A
        # title with an unbalanced "(" is left as it is instead of raising.
        bracketed = re.search(r'\((.*)\)', programme_en)
        if bracketed:
            programme_en = bracketed.group(1)

        # 4. degree_type
        degree_type = 1

        # 5. degree_name
        degree_name = programme_en_a

        # 6. modules_en -- only kept when the study plan contains a table
        modules_en = remove_class(joined(
            "//*[contains(text(),'Example Study Plan')]//following-sibling::*"
        ))
        if 'table' not in modules_en:
            modules_en = None

        # 7. duration  8. duration_per
        duration_text = remove_tags(joined(
            '//*[@id="ua-main-content"]/div[2]/div[3]/span[2]'))
        if '1.5' in duration_text:
            duration = 1.5
        else:
            # First digit found (kept as a string), or None without digits.
            digits = re.findall(r'\d', duration_text)
            duration = digits[0] if digits else None
        duration_per = 1

        # 9. location
        location = remove_tags(joined(
            '//*[@id="ua-main-content"]/div[2]/div[1]/span[2]/a'))
        if '2019/hd' in response.url:
            location = 'North Terrace Campus'
        elif len(location) == 0:
            location = 'Online'

        # 10. overview_en
        overview_en = remove_class(joined(
            '//*[@id="ua-main-content"]/div[2]/div/div'))

        # 11-15. ielts (overall, reading, listening, speaking, writing)
        ielts_list = response.xpath(
            '//*[@id="df-acc-admission"]/div[5]/table[2]//tr[2]/td/table//tr/td'
        ).extract()

        def ielts_band(index, marker, marker_value, pattern, fallback):
            # ``fallback`` when the table has no cell at ``index``;
            # ``marker_value`` when the cell markup contains ``marker``;
            # otherwise the first regex hit as a string (None without a hit).
            # NOTE(review): the cells are raw <td> markup, so ``marker`` can
            # also match inside attributes -- confirm against a live page.
            if index >= len(ielts_list):
                return fallback
            cell = ielts_list[index]
            if marker in cell:
                return marker_value
            hits = re.findall(pattern, cell)
            return hits[0] if hits else None

        ielts = ielts_band(1, '7', 7, r'\d\.\d', 6.5)
        ielts_r = ielts_band(2, '6.5', 6.5, r'\d', 6)
        ielts_l = ielts_band(3, '6.5', 6.5, r'\d', 6)
        ielts_s = ielts_band(4, '6.5', 6.5, r'\d', 6)
        ielts_w = ielts_band(5, '6.5', 6.5, r'\d', 6)

        # 16-20. toefl (overall, reading, listening, speaking, writing)
        toefl_text = remove_tags(joined(
            '//*[@id="df-acc-admission"]/div[5]/table[2]//tr[3]/td/table//tr/td'
        ))
        toefl_numbers = re.findall(r'\d+', toefl_text)
        if len(toefl_numbers) >= 5:
            toefl, toefl_r, toefl_l, toefl_s, toefl_w = toefl_numbers[:5]
        else:
            # Fewer than five numbers on the page: use the default scores.
            toefl, toefl_r, toefl_l, toefl_s, toefl_w = 94, 24, 24, 23, 27

        # 21. rntry_requirements_en
        rntry_requirements_en = remove_class(joined(
            '//*[@id="df-acc-admission"]/div[5]/table[3]//tr/td'))

        # 22. apply_proces_en
        apply_proces_en = 'https://international.adelaide.edu.au/admissions/how-to-apply'

        # 23. deadline
        # Matched against the full title: ``programme_en`` may already have
        # been reduced to the text inside the parentheses, which made the
        # 'Science (Veterinary Bioscience)' case impossible to hit.
        # NOTE(review): 'Medicine and  Surgery' contains two spaces -- confirm
        # that this is how the page spells it.
        early_deadline = ('Medicine and  Surgery', 'Dental Surgery',
                          'Oral Health')
        mid_deadline = ('Nursing', 'Science (Veterinary Bioscience)')
        if any(name in full_title for name in early_deadline):
            deadline = '2018-6-30,2019-5-1'
        elif any(name in full_title for name in mid_deadline):
            deadline = '2018-9-30,2019-5-1'
        else:
            deadline = '2018-12-1,2019-5-1'

        # 24. tuition_fee
        tuition_fee = getTuition_fee(joined(
            '//*[@id="df-acc-fees_scholarships"]/div[5]/table//tr/td[2]'))

        # 25. tuition_fee_pre
        tuition_fee_pre = '$'

        # 26. apply_pre
        apply_pre = '$'

        # 27. career_en
        career_en = remove_class(joined(
            '//*[@id="df-acc-careers_parent"]//following-sibling::*'))

        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['duration'] = duration
        item['duration_per'] = duration_per
        item['location'] = location
        item['overview_en'] = overview_en
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_s'] = ielts_s
        item['ielts_l'] = ielts_l
        item['toefl'] = toefl
        item['toefl_r'] = toefl_r
        item['toefl_s'] = toefl_s
        item['toefl_w'] = toefl_w
        item['toefl_l'] = toefl_l
        item['rntry_requirements_en'] = rntry_requirements_en
        item['apply_proces_en'] = apply_proces_en
        item['deadline'] = deadline
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['career_en'] = career_en
        item['apply_pre'] = apply_pre
        item['modules_en'] = modules_en
        yield item
    def parse(self, response):
        """Build and yield one item for a London Metropolitan University course page.

        Fields are either constants for this university or are read from
        ``response`` with fixed XPath expressions.  Duration and tuition fee
        are taken from the 'September 2019 - Full-time' entry and fall back to
        the 'September 2018 - Full-time' entry when the former is absent.

        Args:
            response: the downloaded course page.

        Yields:
            The item created by ``get_item1(ScrapyschoolEnglandItem1)`` with
            all scraped fields filled in.
        """

        def joined(xpath):
            # Concatenate the markup of every node matched by ``xpath``.
            return ''.join(response.xpath(xpath).extract())

        def first_value(xpath):
            # First value matched by ``xpath``, or '' when nothing matches.
            values = response.xpath(xpath).extract()
            return values[0] if values else ''

        item = get_item1(ScrapyschoolEnglandItem1)

        # 1. university
        university = 'London Metropolitan University'

        # 2. url
        url = response.url

        # 3. ucascode
        ucascode = clear_space_str(remove_tags(joined(
            "//*[contains(text(),'UCAS code:')]//following-sibling::*")))

        # 4. programme_en
        programme_en = remove_tags(joined('//*[@id="MainContent"]/div[1]/h1'))

        # 5. degree_type
        degree_type = 1

        # 6. degree_name is not extracted for this university.

        # 7. alevel
        alevel = clear_space_str(remove_tags(joined(
            '//*[@id="entry-requirements"]/div/ul/li[1]')))

        # 8. overview_en
        overview_en = clear_space_str(remove_class(joined(
            '//*[@id="LeftColumn"]/section/p')))

        # 9. start_date
        start_date = '2018-8-18'

        # 10. apply_pre
        apply_pre = '£'

        # 11. duration
        # The 2018 entry is consulted whenever the 2019 entry is missing or
        # empty, mirroring the tuition-fee lookup below.
        duration = (
            first_value(
                "//*[contains(text(),'September 2019 - Full-time')]//@data-duration"
            )
            or first_value(
                "//*[contains(text(),'September 2018 - Full-time')]//@data-duration"
            )
        )

        # 12. tuition_fee
        tuition_fee = joined(
            "//*[contains(text(),'September 2019 - Full-time')]//@data-cost")
        if len(tuition_fee) == 0:
            tuition_fee = joined(
                "//*[contains(text(),'September 2018 - Full-time')]//@data-cost"
            )
        tuition_fee = getTuition_fee(tuition_fee)

        # 13. location is not extracted for this university.

        # 14. apply_documents_en
        apply_documents_en = '<p>We welcome applications from all suitably qualified prospective students and want to recruit students with the very best academic merit, potential and motivation, irrespective of their background. We carefully consider each application on an individual basis, taking into account all the information presented on your application form, including your: academic achievement (including predicted and achieved grades) personal statement two references CV</p>'

        # 15. modules_en
        modules_en = remove_class(
            joined('//*[@id="modular-structure"]')).replace('▼', '')

        # 16. assessment_en
        assessment_en = remove_class(joined(
            "//h3[contains(text(),'Assessment')]//following-sibling::p[1]"))

        # 17. career_en
        career_en = remove_class(
            joined('//*[@id="career-opportunities"]/div'))

        # 18-22. The IELTS fields are left unset for this university.

        # 23. require_chinese_en
        require_chinese_en = "https://www.londonmet.ac.uk/international/international-admissions/application-guidance-and-entry-criteria/academic-entry-requirements-by-country/non-eueea-countries/china/"

        # 24. apply_proces_en
        # The text is written as two adjacent literals: a single-quoted string
        # cannot contain a raw line break.
        # NOTE(review): "[email protected]" looks like an obfuscated address
        # captured from the site -- replace with the real contact if known.
        apply_proces_en = (
            '<p>Stage 1: choosing your course The first step for you as a new applicant is to choose the course you wish to undertake. If you have any questions at this stage you can contact our international recruitment team who will be happy to assist you and provide information about our courses. You can begin a conversation about a course you are interested in by emailing our recruitment team at: [email protected]. We often have representatives of London Metropolitan University visiting countries all around the world. You can find out the latest planned trips to see if we will be visiting near you, here: Meet us overseas  Stage 2: applying for your course Once you have decided on your course you need to submit an application as soon as possible making sure you observe the international application deadlines. The method of application depends on the type of course you are applying for. The application methods available for each course are listed on the course page. You should check these details carefully to avoid any delay in your application reaching us.You should observe our international application guidance before submitting an application. Please see here: International application advice Stage 3: awaiting and responding to your offer Once the University receives your application you will receive a communication from us acknowledging this. You will also obtain your London Metropolitan University application ID and details about using, the applicant portal (Evision). At this point your application will enter the pending decision/consideration stage, and we will communicate with you again, either to request more information (such as a qualification transcript, portfolio, or piece of written work) for assessment, or to advise you of our decision.If you are successful in receiving an offer from us you will receive a communication detailing a conditional or unconditional offer, and this will contain further information and instruction. '
            'If your application is unsuccessful we will also contact you advising you of this, and our reasons for the decision. You can find out more about offers here: Information and advice for offer holders.Stage 4: Immigration and enrolment After obtaining an unconditional offer you will need to focus on making preparations to join the university and your arrangements to come to the UK (if you are not already here). You will receive further information about when and where to arrive, and how to attend your course enrolment closer to the enrolment period of your course.You should be considering your accommodation and finances as soon as possible before the start of term, and you should also be aware of, and be prepared to meet, any immigration requirements such as obtaining a student visa at the earliest opportunity. You can find a variety of information about moving to London here: Immigration and Arrival Advice: New Students.</p>'
        )

        # 26. tuition_fee_pre
        tuition_fee_pre = '£'

        item['apply_pre'] = apply_pre
        item['apply_documents_en'] = apply_documents_en
        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['overview_en'] = overview_en
        item['start_date'] = start_date
        item['duration'] = duration
        item['tuition_fee'] = tuition_fee
        item['modules_en'] = modules_en
        item['assessment_en'] = assessment_en
        item['career_en'] = career_en
        item['require_chinese_en'] = require_chinese_en
        item['apply_proces_en'] = apply_proces_en
        item['tuition_fee_pre'] = tuition_fee_pre
        item['alevel'] = alevel
        item['ucascode'] = ucascode
        yield item
    def parse_data(self, response):
        item = get_item1(ScrapyschoolEnglandItem1)
        item['university'] = "University of Bristol"
        # items['country'] = "England"
        # items["website"] = "https://www.bristol.ac.uk/"
        item['url'] = response.url
        # 授课方式
        # item['teach_type'] = 'taught'
        # 学位类型
        item['degree_type'] = 2
        print("===========================")
        print(response.url)
        try:
            # 专业
            course = response.xpath("//h1[@id='pagetitle']/span//text()").extract()
            # print("course = ", course)
            item['programme_en'] = ''.join(course).replace("\n", " ").replace("\r", " ").strip()
            print("item['programme_en']: ", item['programme_en'])

            # degreeaward
            degreeaward = response.xpath("//th[contains(text(),'Awards available')]/following-sibling::td[1]//text()").extract()
            # print("degreeaward = ", degreeaward)
            item['degree_name'] = clear_space_str(''.join(degreeaward))
            print("item['degree_name']: ", item['degree_name'])

            if "phd" in item['degree_name'].lower() or "md" in item['degree_name'].lower():
                item['teach_type'] = "phd"
                if "research" in item['degree_name'].lower():
                    item['teach_type'] += " " + "research"
                item['degree_type'] = 3
            elif "research" in item['degree_name'].lower():
                item['teach_type'] = "research"
                item['degree_type'] = 3
            else:
                item['teach_type'] = "taught"
                item['degree_type'] = 2
            # print("item['degree_type']: ", item['degree_type'])
            # print("item['teach_type']: ", item['teach_type'])

            # duration
            duration = response.xpath("//th[@scope='row'][contains(text(),'Programme length')]/following-sibling::td[1]//text()").extract()
            clear_space(duration)
            # print("duration: ", duration)
            item['teach_time'] = getTeachTime(''.join(duration))
            # print("item['teach_time']: ", item['teach_time'])

            duration_list = getIntDuration(''.join(duration))
            if len(duration_list) == 2:
                item['duration'] = duration_list[0]
                item['duration_per'] = duration_list[1]
            # print("item['duration']: ", item['duration'])
            # print("item['duration_per']: ", item['duration_per'])

            # location
            location = response.xpath("//th[@scope='row'][contains(text(),'Location of programme')]/following-sibling::td[1]//text()").extract()
            # print("location = ", location)
            item['location'] = clear_space_str(''.join(location))
            # print("item['location']: ", item['location'])

            # startdate
            startdate = response.xpath("//th[@scope='row'][contains(text(),'Start date')]/following-sibling::td[1]//text()").extract()
            clear_space(startdate)
            print("startdate = ", startdate)
            if len(startdate) > 0:
                # item['start_date'] = startdate[-1].strip()
                # print("item['start_date']: ", item['start_date'])
                item['start_date'] = getStartDate(''.join(startdate[-1]))
            print("item['start_date'] = ", item['start_date'])

            # deadline
            deadline = response.xpath("//div[@id='apply']/div[@class='apply-deadline']/p[1]//text()").extract()
            # print("deadline = ", deadline)
            item['deadline'] = getStartDate(''.join(deadline))
            # print("item['deadline']: ", item['deadline'])

            # department
            department = response.xpath("//div[@id='contact']/p[@class='pg-contact-address']/text()").extract()
            clear_space(department)
            # print("department1 = ", department)
            for d in department:
                if "School" in d or "Faculty" in d:
                    item['department'] = d
            # print("item['department']: ", item['department'])
            if item['department'] == "":
                allcontent = response.xpath("//main[@class='content']//text()").extract()
                clear_space(allcontent)
                department_re = re.findall(r"School\sof.{1,30}", ''.join(allcontent), re.I)
                # print("department_re: ", department_re)
                if len(department_re) > 0:
                    item['department'] = department_re[0].strip()
            # print("item['department']1: ", item['department'])

            # overview  //div[@id='programme-overview']//text()
            overview = response.xpath("//div[@id='programme-overview']|//div[@id='pgr-overview']").extract()
            item['overview_en'] = remove_class(clear_lianxu_space(overview))
            # print("item['overview_en']: ", item['overview_en'])

            # tuitionFee   //div[@id='fees']
            tuitionFee = response.xpath("//dt[contains(text(),'Overseas: full-time')]/following-sibling::dd[1]//text()").extract()
            clear_space(tuitionFee)
            print("tuitionFee = ", tuitionFee)
            if len(tuitionFee) > 0:
                item['tuition_fee_pre'] = "£"
                item['tuition_fee'] = int(''.join(tuitionFee[0]).replace("£", "").replace(",", "").strip())

            if item['tuition_fee'] is None:
                tuitionFee1 = response.xpath(
                    "//dl//dt[contains(text(),'Overseas:')]/following-sibling::dd[1]//text()").extract()
                clear_space(tuitionFee1)
                print("tuitionFee1 = ", tuitionFee1)
                if len(tuitionFee1) > 0:
                    item['tuition_fee_pre'] = "£"
                    item['tuition_fee'] = getTuition_fee(''.join(tuitionFee1))
                if item['tuition_fee'] == 0:
                    item['tuition_fee_pre'] = ""
                    item['tuition_fee'] = None
            if item['tuition_fee'] is None:
                print("tuition_fee 为空")
            print("item['tuition_fee_pre']: ", item['tuition_fee_pre'])
            print("item['tuition_fee']: ", item['tuition_fee'])

            # modules   //div[@id='programme-structure']
            modules = response.xpath("//div[@id='programme-structure']|//div[@id='pgr-research-groups']").extract()
            item['modules_en'] = remove_class(clear_lianxu_space(modules))
            print("item['modules_en']: ", item['modules_en'])

            # 学术要求本科特殊专业要求、IELTS
            entryRequirements = response.xpath("//div[@id='entry-requirements']//text()").extract()
            item['rntry_requirements'] = clear_lianxu_space(entryRequirements)
            # print("item['rntry_requirements']: ", item['rntry_requirements'])

            ielts = response.xpath("//*[contains(text(),'Profile')]//text()|//div[contains(text(),'IELTS')]//text()").extract()
            item['ielts_desc'] = clear_lianxu_space(ielts)
            # print("item['ielts_desc']: ", item['ielts_desc'])

            if item['ielts_desc'] == "Profile A":
                item['ielts'] = 7.5
                item['ielts_l'] = 7.0
                item['ielts_s'] = 7.0
                item['ielts_r'] = 7.0
                item['ielts_w'] = 7.0
                item['toefl'] = 109
                item['toefl_l'] = 25
                item['toefl_r'] = 25
                item['toefl_s'] = 25
                item['toefl_w'] = 29
            elif item['ielts_desc'] == "Profile B":
                item['ielts'] = 7.0
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.5
                item['ielts_w'] = 6.5
                item['toefl'] = 100
                item['toefl_l'] = 24
                item['toefl_r'] = 24
                item['toefl_s'] = 24
                item['toefl_w'] = 24
            elif item['ielts_desc'] == "Profile C":
                item['ielts'] = 6.5
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.5
                item['ielts_w'] = 6.5
                item['toefl'] = 92
                item['toefl_l'] = 23
                item['toefl_r'] = 23
                item['toefl_s'] = 23
                item['toefl_w'] = 24
            elif item['ielts_desc'] == "Profile D":
                item['ielts'] = 6.5
                item['ielts_l'] = 6.0
                item['ielts_s'] = 6.0
                item['ielts_r'] = 7.0
                item['ielts_w'] = 7.0
                item['toefl'] = 92
                item['toefl_l'] = 21
                item['toefl_r'] = 21
                item['toefl_s'] = 21
                item['toefl_w'] = 27
            elif item['ielts_desc'] == "Profile E":
                item['ielts'] = 6.5
                item['ielts_l'] = 6.0
                item['ielts_s'] = 6.0
                item['ielts_r'] = 6.0
                item['ielts_w'] = 6.0
                item['toefl'] = 90
                item['toefl_l'] = 20
                item['toefl_r'] = 20
                item['toefl_s'] = 20
                item['toefl_w'] = 20
            elif item['ielts_desc'] == "Profile F":
                item['ielts'] = 6.0
                item['ielts_l'] = 6.5
                item['ielts_s'] = 6.5
                item['ielts_r'] = 6.0
                item['ielts_w'] = 6.0
                item['toefl'] = 86
                item['toefl_l'] = 20
                item['toefl_r'] = 20
                item['toefl_s'] = 20
                item['toefl_w'] = 23
            elif "Profile" not in item['ielts_desc']:
                ieltsDict = get_ielts(item['ielts_desc'])
                item['ielts'] = ieltsDict.get("IELTS")
                item['ielts_l'] = ieltsDict.get("IELTS_L")
                item['ielts_s'] = ieltsDict.get("IELTS_S")
                item['ielts_r'] = ieltsDict.get("IELTS_R")
                item['ielts_w'] = ieltsDict.get("IELTS_W")
            # print("item['ielts'] = %s item['ielts_l'] = %s item['ielts_s'] = %s item['ielts_r'] = %s item['ielts_w'] = %s " % (
            #         item['ielts'], item['ielts_l'], item['ielts_s'], item['ielts_r'], item['ielts_w']))
            # print("item['toefl'] = %s item['toefl_l'] = %s item['toefl_s'] = %s item['toefl_r'] = %s item['toefl_w'] = %s " % (
            #       item['toefl'], item['toefl_l'], item['toefl_s'], item['toefl_r'], item['toefl_w']))

            # 就业    //div[@id='careers']
            career = response.xpath("//div[@id='careers']").extract()
            # print("department = ", department)
            item['career_en'] = remove_class(clear_lianxu_space(career))
            # print("item['career_en']: ", item['career_en'])

            require_chinese_en = """<h2 id="pgentryreqs">Entry requirements for postgraduate programmes</h2>
<p>You should&nbsp;<a href="/pg-howtoapply/">apply online</a>&nbsp;for all our postgraduate programmes.</p>
<p>To be considered for admission to postgraduate study at the University of Bristol, the minimum requirement for entry is an undergraduate (Bachelor&rsquo;s) degree that is equivalent to a UK Upper Second Class degree (also known as a 2:1). Please refer to the <a href="http://www.bristol.ac.uk/study/postgraduate/admissions-statements/%20%20%20" target="_blank">Postgraduate Admissions Statements</a> for each programme for individual entry requirements.</p>
<ul>
<li>Applicants who hold a 4-year Bachelor's (Honours) degree from a prestigious university with a minimum of 80% will be considered for admission to a Master's degree.</li>
<li>Applicants who hold a good Master's degree from a prestigious university will be considered for admission to PhD study.</li>
<li>Applicants will be required to meet the English language requirements for the programme. The profile level requirements can be found on the&nbsp;<a href="http://www.bristol.ac.uk/study/language-requirements/" target="_blank">English language requirements for study</a>&nbsp;page.</li>
</ul>"""
            item["require_chinese_en"] = remove_class(require_chinese_en)
            # print("item['require_chinese_en']: ", item['require_chinese_en'])

            # http://www.bristol.ac.uk/study/postgraduate/apply/
            item['apply_proces_en'] = remove_class(clear_lianxu_space(["""<p>We offer an online application system for all of our programmes, except the Postgraduate Certificate in Education for which you should <a href="https://www.ucas.com/ucas/teacher-training/ucas-teacher-training-apply-and-track">apply through UCAS</a>.</p>
<p>You can use our online admissions system to:</p>
<ul>
<li>submit all your application details securely online and view your completed application form;</li>
<li>upload supporting documents;</li>
<li>request references electronically;</li>
<li>track the progress of your application;</li>
<li>receive a decision on your application online;</li>
<li>update your contact details (it is important you tell us if you change your home address or email);</li>
<li>receive useful information about the University and your application.</li>
</ul>
<p>If you are unable to make an online application, please contact the Enquiries team on <a href="mailto:[email protected]">[email protected]</a>.</p>"""]))
            # print("item['apply_proces_en']: ", item['apply_proces_en'])

            apply_documents_en = response.xpath("//h3[contains(text(),'English language requirements')]/preceding-sibling::*[position()<last()]").extract()
            item["apply_documents_en"] = remove_class(clear_lianxu_space(apply_documents_en))
            print("item['apply_documents_en']: ", item['apply_documents_en'])
            yield item
        except Exception as e:
            print("异常:", str(e))
            print("报错链接:", response.url)
            with open("scrapySchool_England/error/" + item['university'] + str(item['degree_type']) + ".txt", 'a+', encoding="utf-8") as f:
                f.write(str(e) + "\n" + response.url + "\n========================\n")
Пример #5
0
    def parse(self, response):
        """Scrape one Ulster University course page and yield a populated item.

        The item is created with ``get_item1(ScrapyschoolEnglandItem1)`` and
        then filled field by field from XPath queries against ``response``.
        Fields whose source data is missing are simply not assigned here.

        Args:
            response: The Scrapy response for the course page.

        Yields:
            The populated item (exactly once per response).
        """
        print(response.url)
        item = get_item1(ScrapyschoolEnglandItem1)

        item['university'] = 'Ulster University'
        item['url'] = response.url
        item['location'] = 'Belfast'
        item['teach_time'] = '1'

        # The <h1> text is split at the first '-' on a line: the part from the
        # dash onwards is the degree name, what remains is the programme title.
        heading = ''.join(response.xpath('//h1//text()').extract()).strip()
        dash_part = ''.join(re.findall('-.+', heading))
        programme = heading.replace(dash_part, '').replace('*', '').strip()
        degr = dash_part.replace('-', '').strip()
        item['programme_en'] = programme
        item['degree_name'] = degr

        # Degree type is derived from the first letter of the degree name.
        # startswith() is safe on an empty string, so no try/except is needed;
        # any other initial leaves 'degree_type' as get_item1 set it.
        if degr.startswith('M'):
            item['degree_type'] = '2'
        elif degr.startswith('P'):
            item['degree_type'] = '3'

        overview = response.xpath(
            '//h2[contains(text(),"Overview")]/following-sibling::*').extract()
        item['overview_en'] = remove_class(overview)

        modules = response.xpath('//div[@id="modules"]').extract()
        item['modules_en'] = remove_class(modules)

        rntry = response.xpath('//div[@id="entryconditions"]').extract()
        item['rntry_requirements'] = remove_class(rntry)

        career = response.xpath('//div[@id="opportunities"]').extract()
        item['career_en'] = remove_class(career)

        start_date = response.xpath(
            '//h3[contains(text(),"Start dates")]/following-sibling::*//text()'
        ).extract()
        # De-duplicate while keeping first-seen order. A plain set() would make
        # the joined string vary between runs because of string hash
        # randomisation.
        unique_dates = dict.fromkeys(tracslateDate(start_date))
        item['start_date'] = '.'.join(unique_dates).strip()

        ielts_text = response.xpath(
            '//*[contains(text(),"IELTS")]//text()').extract()
        ielts = get_ielts(ielts_text)
        # Both "empty" shapes must be excluded, hence `and`; the previous `or`
        # was always true and relied on the exception handler to bail out.
        if ielts != {} and ielts != []:
            try:
                item['ielts_l'] = ielts['IELTS_L']
                item['ielts_s'] = ielts['IELTS_S']
                item['ielts_r'] = ielts['IELTS_R']
                item['ielts_w'] = ielts['IELTS_W']
                item['ielts'] = ielts['IELTS']
            except (KeyError, IndexError, TypeError):
                # Best effort: a partial or unexpected get_ielts() result keeps
                # whatever band scores were already assigned.
                pass

        fee = response.xpath(
            '//dt[contains(text(),"International:")]/following-sibling::dd/text()'
        ).extract()
        item['tuition_fee'] = getTuition_fee(fee)
        item['tuition_fee_pre'] = '£'

        yield item
Пример #6
0
    def parse_main(self, response):
        """Scrape one University of Glasgow programme page into an item.

        Constant fields (university, location, start date, deadline, ...) are
        set first; the remaining fields come from XPath queries against
        ``response``. The item is only yielded when a programme title was
        found in the page heading.

        Args:
            response: The Scrapy response for the programme page.

        Yields:
            The populated item, or nothing when the title is empty.
        """
        item = get_item1(ScrapyschoolEnglandItem1)
        print(response.url)
        item['teach_time'] = 'fulltime'
        item['university'] = 'University of Glasgow'
        item['url'] = response.url
        item['location'] = 'Glasgow'
        item['start_date'] = '2018-9'
        item['deadline'] = '2018-7'
        item["tuition_fee_pre"] = "£"
        item['teach_type'] = 'taught'

        programme = ''.join(response.xpath(
            '//div[@id="prog-title"]/h1/text()').extract())
        item['programme_en'] = programme

        degree_type = ''.join(response.xpath(
            '//div[@id="prog-title"]/h1/span/text()').extract())
        item['degree_name'] = degree_type

        duration = clear_duration(response.xpath(
            '//li[contains(text(),"full-time")]/text()').extract())
        item['duration'] = duration['duration']
        item['duration_per'] = duration['duration_per']

        overview = response.xpath(
            '//h2[contains(text(),"Why this programme")]/following-sibling::*'
        ).extract()
        item['overview_en'] = remove_class(overview)

        modules = response.xpath(
            '//h2[contains(text(),"Programme str")]/following-sibling::*'
        ).extract()
        item['modules_en'] = remove_class(clear_same_s(modules))

        career = response.xpath(
            '//h2[contains(text(),"Career")]/following-sibling::*').extract()
        item['career_en'] = remove_class(clear_same_s(career))

        # Only the #fees container is used. An earlier query on the
        # "Fees and ..." heading was overwritten before being read and has
        # been dropped.
        fees = response.xpath('//div[@id="fees"]//text()').extract()
        tuition_fee = getTuition_fee(fees)
        # NOTE(review): 2018 is presumably the academic year being picked up
        # instead of a fee amount -- confirm against getTuition_fee.
        if tuition_fee == 2018:
            tuition_fee = '0'
        item['tuition_fee'] = tuition_fee

        ielts_text = response.xpath(
            '//*[contains(text(),"IELTS")]/../following-sibling::ul[1]//text()'
        ).extract()
        ielts = get_ielts(ielts_text)
        if ielts != {} and ielts != []:
            item['ielts_l'] = ielts['IELTS_L']
            item['ielts_s'] = ielts['IELTS_S']
            item['ielts_r'] = ielts['IELTS_R']
            item['ielts_w'] = ielts['IELTS_W']
            item['ielts'] = ielts['IELTS']

        toefl_text = response.xpath(
            '//*[contains(text(),"TOEFL")]/..//text()').extract()
        toefl = get_toefl(toefl_text)
        if toefl != []:
            try:
                # Index 0 is stored as the overall score, 1-4 as the
                # reading / listening / speaking / writing fields.
                item['toefl_r'] = toefl[1]
                item['toefl_l'] = toefl[2]
                item['toefl_s'] = toefl[3]
                item['toefl_w'] = toefl[4]
                item['toefl'] = toefl[0]
            except (IndexError, KeyError, TypeError):
                # Best effort: a short or oddly shaped result keeps the
                # scores assigned so far and skips the rest.
                pass

        entry = response.xpath(
            '//h2[contains(text(),"Entry requirements")]/following-sibling::*'
        ).extract()
        item['rntry_requirements'] = remove_class(clear_same_s(entry))

        apply_d = response.xpath(
            '//h3[contains(text(),"Documents")]/following-sibling::ul[1]'
        ).extract()
        item['apply_proces_en'] = remove_class(clear_same_s(apply_d))

        # Pages without a programme title are skipped.
        if programme != '':
            yield item
Пример #7
0
    def parse(self, response):
        """Scrape one Edge Hill University course page and yield an item.

        Every field is extracted from ``response`` with an XPath query, the
        matched nodes are concatenated, and the result is cleaned with
        ``remove_tags`` or ``remove_class`` before being stored on the item.

        IELTS scores are read as ``d.d`` numbers from the block following the
        "English Language Requirements" heading: one number is used for the
        overall score and every band; two numbers are used as overall and
        per-band score; any other count falls back to 6.5 overall / 6.0 per
        band.

        Args:
            response: The Scrapy response for the course page.

        Yields:
            The populated item (exactly once per response).
        """
        item = get_item1(ScrapyschoolEnglandItem1)

        def joined(xpath):
            """Return everything the XPath selects, concatenated into one string."""
            return ''.join(response.xpath(xpath).extract())

        # 1. university / 2. url
        university = 'Edge Hill University'
        url = response.url

        # 3. programme_en / 4. degree_type / 5. degree_name
        # The first word of the heading is taken as the degree name and
        # removed from the programme title.
        programme_en = remove_tags(joined('//*[@id="primary"]/header/h1/a'))
        degree_type = 2
        heading_words = programme_en.split()
        # An empty heading used to raise IndexError here.
        degree_name = heading_words[0] if heading_words else ''
        if degree_name:
            programme_en = programme_en.replace(degree_name, '')
        programme_en = programme_en.strip()

        # 6. teach_time / 7. duration / 8. duration_per
        length_text = remove_tags(
            joined("//*[contains(text(),'Length:')]//following-sibling::*"))
        # The first run of digits is the duration; None when the page has none
        # (previously an IndexError aborted the whole callback).
        duration_hits = re.findall(r'\d+', length_text)
        duration = duration_hits[0] if duration_hits else None
        if 'Months' in length_text:
            duration_per = 3
        elif 'Weeks' in length_text:
            duration_per = 4
        else:
            duration_per = 1
        teach_time = 'Full-Time' if 'Full-Time' in length_text else 'Part-Time'

        # 9. start_date
        start_date = remove_tags(
            joined("//*[contains(text(),'Dates:')]//following-sibling::*"))
        start_date = ','.join(tracslateDate(start_date))

        # 10. department
        department = remove_tags(
            joined("//*[contains(text(),'Department:')]//following-sibling::*"))

        # 11. location
        location = remove_tags(
            joined("//*[contains(text(),'Location:')]//following-sibling::*"))

        # 12. overview_en -- list-item text wrapped in a single paragraph.
        overview_en = remove_class(
            joined('//*[@id="overview"]/div[1]/div/ul/li/text()'))
        overview_en = '<p>' + overview_en + '</p>'

        # 13. assessment_en
        assessment_en = remove_class(joined(
            "//*[contains(text(),'How will I be assessed?')]"
            "//following-sibling::*[1]"))

        # 14. modules_en
        modules_en = remove_class(joined('//*[@id="modules"]/h4/strong'))

        # 15. rntry_requirements
        rntry_requirements = remove_class(joined(
            "//*[contains(text(),'Entry Requirements')]//following-sibling::*"))

        # 16-20. ielts and the four band scores
        ielts_text = remove_tags(joined(
            "//*[contains(text(),'English Language Requirements')]"
            "//following-sibling::*[1]"))
        try:
            scores = re.findall(r'\d\.\d', ielts_text)
        except TypeError:
            # Fall back to "no scores found" so the default branch below is
            # used. The old fallback of None crashed on the len() check.
            scores = []
        if len(scores) == 1:
            ielts = ielts_r = ielts_w = ielts_s = ielts_l = scores[0]
        elif len(scores) == 2:
            ielts, band = scores
            ielts_r = ielts_w = ielts_s = ielts_l = band
        else:
            ielts = 6.5
            ielts_r = ielts_w = ielts_s = ielts_l = 6.0

        # 21. career_en
        career_en = remove_class(joined(
            "//*[contains(text(),'What are my career prospects?')]"
            "//following-sibling::*"))

        # 22. tuition_fee / 23. tuition_fee_pre
        tuition_fee = getTuition_fee(remove_tags(
            joined("//*[contains(text(),'Tuition Fees')]//following-sibling::*")))
        tuition_fee_pre = '£'

        # 24. apply_proces_en / 25. apply_pre
        apply_proces_en = remove_class(joined(
            "//h4[contains(text(),'How to Apply')]//following-sibling::*"))
        apply_pre = '£'

        item['apply_pre'] = apply_pre
        item['university'] = university
        item['url'] = url
        item['programme_en'] = programme_en
        item['degree_type'] = degree_type
        item['degree_name'] = degree_name
        item['teach_time'] = teach_time
        # Leave the item's initial duration in place when none was found.
        if duration is not None:
            item['duration'] = duration
        item['duration_per'] = duration_per
        item['start_date'] = start_date
        item['department'] = department
        item['location'] = location
        item['overview_en'] = overview_en
        item['assessment_en'] = assessment_en
        item['modules_en'] = modules_en
        item['rntry_requirements'] = rntry_requirements
        item['ielts'] = ielts
        item['ielts_r'] = ielts_r
        item['ielts_w'] = ielts_w
        item['ielts_l'] = ielts_l
        item['ielts_s'] = ielts_s
        item['career_en'] = career_en
        item['tuition_fee'] = tuition_fee
        item['tuition_fee_pre'] = tuition_fee_pre
        item['apply_proces_en'] = apply_proces_en
        yield item